├── LocalAggregation
├── src
│   ├── __init__.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── setup.pyc
│   │   ├── utils.pyc
│   │   ├── __init__.pyc
│   │   ├── constants.py
│   │   ├── tensor.py
│   │   ├── utils.py
│   │   └── setup.py
│   ├── agents
│   │   ├── __init__.py
│   │   ├── agents.pyc
│   │   └── __init__.pyc
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── target_transforms.py
│   │   ├── temporal_transforms.py
│   │   ├── kinetics.py
│   │   └── spatial_transforms.py
│   ├── models
│   │   ├── __init__.py
│   │   └── resnet3d.py
│   ├── objectives
│   │   ├── __init__.py
│   │   ├── __init__.pyc
│   │   ├── localagg.pyc
│   │   ├── instance.py
│   │   └── localagg.py
│   ├── __init__.pyc
│   └── idt
│   │   ├── idt.sh
│   │   ├── sketching.py
│   │   ├── compute_fv_models.py
│   │   ├── cluster_fv.py
│   │   └── extract_idt.py
├── init_env.sh
├── scripts
│   ├── instance.py
│   └── localagg.py
└── config
│   ├── kinetics_ir.json
│   ├── kinetics_la_tune.json
│   └── kinetics_la.json
├── 3D-ResNet
├── models
│   ├── __pycache__
│   │   └── resnet.cpython-36.pyc
│   └── resnet.py
├── target_transforms.py
├── mean.py
├── utils
│   ├── n_frames_ucf101_hmdb51.py
│   ├── video_jpg.py
│   ├── n_frames_kinetics.py
│   ├── fps.py
│   ├── video_jpg_kinetics.py
│   ├── video_jpg_ucf101_hmdb51.py
│   ├── kinetics_json.py
│   ├── hmdb51_json.py
│   └── ucf101_json.py
├── results
│   └── opts.json
├── utils.py
├── train.py
├── test.py
├── temporal_transforms.py
├── validation.py
├── dataset.py
├── model.py
├── main.py
├── datasets
│   ├── kinetics.py
│   ├── hmdb51.py
│   └── ucf101.py
├── opts.py
├── nohup.out
└── spatial_transforms.py
├── LICENSE
└── README.md
/LocalAggregation/src/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/LocalAggregation/src/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/LocalAggregation/src/agents/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/LocalAggregation/src/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/LocalAggregation/src/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/LocalAggregation/src/objectives/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/LocalAggregation/init_env.sh:
--------------------------------------------------------------------------------
1 | export PYTHONPATH=${PYTHONPATH}:$(pwd)
--------------------------------------------------------------------------------
/LocalAggregation/src/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/LocalAggregation/src/__init__.pyc
--------------------------------------------------------------------------------
/LocalAggregation/src/utils/setup.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/LocalAggregation/src/utils/setup.pyc
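Note on environment setup: init_env.sh (listed above) only appends the current working directory to PYTHONPATH so that the `src.*` imports used by the LocalAggregation agents and scripts resolve. A minimal setup sketch, assuming the commands are run from the LocalAggregation directory (the shell session below is an illustration, not part of the repository):

    cd LocalAggregation
    source init_env.sh   # adds $(pwd) to PYTHONPATH; run the training scripts from this directory afterwards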
-------------------------------------------------------------------------------- /LocalAggregation/src/utils/utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/LocalAggregation/src/utils/utils.pyc -------------------------------------------------------------------------------- /LocalAggregation/src/agents/agents.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/LocalAggregation/src/agents/agents.pyc -------------------------------------------------------------------------------- /LocalAggregation/src/utils/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/LocalAggregation/src/utils/__init__.pyc -------------------------------------------------------------------------------- /LocalAggregation/src/agents/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/LocalAggregation/src/agents/__init__.pyc -------------------------------------------------------------------------------- /LocalAggregation/src/objectives/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/LocalAggregation/src/objectives/__init__.pyc -------------------------------------------------------------------------------- /LocalAggregation/src/objectives/localagg.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/LocalAggregation/src/objectives/localagg.pyc -------------------------------------------------------------------------------- /3D-ResNet/models/__pycache__/resnet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/3D-ResNet/models/__pycache__/resnet.cpython-36.pyc -------------------------------------------------------------------------------- /LocalAggregation/src/datasets/target_transforms.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class ClassLabel(object): 4 | 5 | def __call__(self, target): 6 | return target['label'] 7 | -------------------------------------------------------------------------------- /LocalAggregation/src/utils/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | ROOT_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), '../../')) 3 | SRC_DIR = os.path.join(ROOT_DIR, 'src') 4 | SCRIPTS_DIR = os.path.join(ROOT_DIR, 'scripts') 5 | EVAL_DIR = os.path.join(SRC_DIR, 'evaluation') 6 | -------------------------------------------------------------------------------- /LocalAggregation/src/utils/tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def l2_normalize(x, dim=1): 5 | return x / torch.sqrt(torch.sum(x**2, dim=dim).unsqueeze(dim)) 6 | 7 | 8 | def repeat_1d_tensor(t, num_reps): 9 | return t.unsqueeze(1).expand(-1, num_reps) 10 | 11 | -------------------------------------------------------------------------------- /3D-ResNet/target_transforms.py: 
-------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import torch 4 | 5 | 6 | class Compose(object): 7 | 8 | def __init__(self, transforms): 9 | self.transforms = transforms 10 | 11 | def __call__(self, target): 12 | dst = [] 13 | for t in self.transforms: 14 | dst.append(t(target)) 15 | return dst 16 | 17 | 18 | class ClassLabel(object): 19 | 20 | def __call__(self, target): 21 | return target['label'] 22 | 23 | 24 | class VideoID(object): 25 | 26 | def __call__(self, target): 27 | return target['video_id'] 28 | 29 | -------------------------------------------------------------------------------- /3D-ResNet/mean.py: -------------------------------------------------------------------------------- 1 | def get_mean(norm_value=255, dataset='activitynet'): 2 | assert dataset in ['activitynet', 'kinetics'] 3 | 4 | if dataset == 'activitynet': 5 | return [ 6 | 114.7748 / norm_value, 107.7354 / norm_value, 99.4750 / norm_value 7 | ] 8 | elif dataset == 'kinetics': 9 | # Kinetics (10 videos for each class) 10 | return [ 11 | 110.63666788 / norm_value, 103.16065604 / norm_value, 12 | 96.29023126 / norm_value 13 | ] 14 | 15 | 16 | def get_std(norm_value=255): 17 | # Kinetics (10 videos for each class) 18 | return [ 19 | 38.7568578 / norm_value, 37.88248729 / norm_value, 20 | 40.02898126 / norm_value 21 | ] 22 | -------------------------------------------------------------------------------- /LocalAggregation/src/idt/idt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PATH=/home/ptokmako/src/opencv/lib:/home/ptokmako/miniconda2/bin:/home/ptokmako/src/ffmpeg/lib:/home/ptokmako/src/ffmpeg/include:/home/ptokmako/miniconda2/envs/idt/lib:/home/ptokmako/miniconda2/envs/idt/include:/opt/cuda/9.1/bin:/home/ptokmako/torch/install/bin:/home/ptokmako/miniconda2/envs/idt/bin:/home/ptokmako/miniconda2/condabin:/opt/gcc49/bin:/opt/openmpi/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/opt/ganglia/bin:/opt/ganglia/sbin:/opt/rocks/bin:/opt/rocks/sbin:/home/ptokmako/bin 4 | export LD_LIBRARY_PATH=/home/ptokmako/src/opencv/lib/:/home/ptokmako/src/ffmpeg/lib/:/home/ptokmako/src/ffmpeg/include/:/home/ptokmako/miniconda2/envs/idt/lib:/home/ptokmako/miniconda2/envs/idt/include:/opt/cuda/9.1/lib64:/opt/cuda/9.1/lib:/home/ptokmako/torch/install/lib:/opt/openmpi/lib:/home/ptokmako/local/readline-8.0/lib 5 | 6 | source ~/miniconda2/etc/profile.d/conda.sh 7 | 8 | conda activate idt 9 | 10 | /home/ptokmako/src/improved_trajectory_release/release/DenseTrackStab $1 -H $2 | gzip > $4/$3.gz 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Pavel Tokmakov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /3D-ResNet/utils/n_frames_ucf101_hmdb51.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def class_process(dir_path, class_name): 7 | class_path = os.path.join(dir_path, class_name) 8 | if not os.path.isdir(class_path): 9 | return 10 | 11 | for file_name in os.listdir(class_path): 12 | video_dir_path = os.path.join(class_path, file_name) 13 | image_indices = [] 14 | for image_file_name in os.listdir(video_dir_path): 15 | if 'image' not in image_file_name: 16 | continue 17 | image_indices.append(int(image_file_name[6:11])) 18 | 19 | if len(image_indices) == 0: 20 | print('no image files', video_dir_path) 21 | n_frames = 0 22 | else: 23 | image_indices.sort(reverse=True) 24 | n_frames = image_indices[0] 25 | print(video_dir_path, n_frames) 26 | with open(os.path.join(video_dir_path, 'n_frames'), 'w') as dst_file: 27 | dst_file.write(str(n_frames)) 28 | 29 | 30 | if __name__=="__main__": 31 | dir_path = sys.argv[1] 32 | for class_name in os.listdir(dir_path): 33 | class_process(dir_path, class_name) 34 | -------------------------------------------------------------------------------- /3D-ResNet/utils/video_jpg.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | 7 | if __name__=="__main__": 8 | dir_path = sys.argv[1] 9 | dst_dir_path = sys.argv[2] 10 | 11 | for file_name in os.listdir(dir_path): 12 | if '.mp4' not in file_name: 13 | continue 14 | name, ext = os.path.splitext(file_name) 15 | dst_directory_path = os.path.join(dst_dir_path, name) 16 | 17 | video_file_path = os.path.join(dir_path, file_name) 18 | try: 19 | if os.path.exists(dst_directory_path): 20 | if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')): 21 | subprocess.call('rm -r {}'.format(dst_directory_path), shell=True) 22 | print('remove {}'.format(dst_directory_path)) 23 | os.mkdir(dst_directory_path) 24 | else: 25 | continue 26 | else: 27 | os.mkdir(dst_directory_path) 28 | except: 29 | print(dst_directory_path) 30 | continue 31 | cmd = 'ffmpeg -i {} -vf scale=-1:360 {}/image_%05d.jpg'.format(video_file_path, dst_directory_path) 32 | print(cmd) 33 | subprocess.call(cmd, shell=True) 34 | print('\n') 35 | -------------------------------------------------------------------------------- /3D-ResNet/utils/n_frames_kinetics.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def class_process(dir_path, class_name): 7 | class_path = os.path.join(dir_path, class_name) 8 | if not os.path.isdir(class_path): 9 | print(class_path) 10 | return 11 | 12 | print(class_name) 13 | 14 | for file_name in os.listdir(class_path): 15 | video_dir_path = 
os.path.join(class_path, file_name) 16 | image_indices = [] 17 | for image_file_name in os.listdir(video_dir_path): 18 | if 'frame' not in image_file_name or "n_frames" in image_file_name: 19 | continue 20 | frame_ind = int(image_file_name.split("frame")[1].split(".")[0]) 21 | image_indices.append(frame_ind) 22 | 23 | if len(image_indices) == 0: 24 | print('no image files', video_dir_path) 25 | n_frames = 0 26 | else: 27 | image_indices.sort(reverse=True) 28 | n_frames = image_indices[0] 29 | print(video_dir_path, n_frames) 30 | with open(os.path.join(video_dir_path, 'n_frames'), 'w') as dst_file: 31 | dst_file.write(str(n_frames)) 32 | 33 | 34 | if __name__=="__main__": 35 | dir_path = sys.argv[1] 36 | for class_name in os.listdir(dir_path): 37 | class_process(dir_path, class_name) 38 | 39 | class_name = 'test' 40 | class_process(dir_path, class_name) 41 | -------------------------------------------------------------------------------- /3D-ResNet/results/opts.json: -------------------------------------------------------------------------------- 1 | {"root_path": "/root/data/ActivityNet", "video_path": "/scratch/ptokmako/hmdb_jpeg/", "annotation_path": "/scratch/ptokmako/testTrainMulti_7030_splits/hmdb51_1.json", "result_path": "results", "dataset": "hmdb51", "n_classes": 101, "n_finetune_classes": 400, "sample_size": 112, "sample_duration": 16, "temp_stride": 1, "initial_scale": 1.0, "n_scales": 5, "scale_step": 0.84081289641525, "train_crop": "corner", "learning_rate": 0.1, "momentum": 0.9, "dampening": 0.9, "weight_decay": 0.001, "mean_dataset": "activitynet", "no_mean_norm": false, "std_norm": false, "nesterov": false, "optimizer": "sgd", "lr_patience": 10, "batch_size": 128, "n_epochs": 200, "begin_epoch": 1, "lr_threshold": 0.0001, "n_val_samples": 3, "resume_path": "/data2/ptokmako/results/hmdb_debug2/save_20.pth", "label_folder": "", "pretrain_path": "", "ft_begin_index": 0, "no_train": true, "no_val": true, "test": true, "test_subset": "val", "scale_in_test": 1.0, "crop_position_in_test": "c", "no_softmax_in_test": false, "no_cuda": false, "n_threads": 16, "checkpoint": 10, "no_hflip": false, "norm_value": 1, "model": "resnet", "model_depth": 18, "resnet_shortcut": "B", "manual_seed": 1, "gpu": ["2"], "shot": -1, "scales": [1.0, 0.84081289641525, 0.706966326778202, 0.5944264048864302, 0.4998013871982635], "arch": "resnet-18", "mean": [114.7748, 107.7354, 99.475], "std": [38.7568578, 37.88248729, 40.02898126]} -------------------------------------------------------------------------------- /LocalAggregation/scripts/instance.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from src.agents.agents import * 3 | from src.utils.setup import process_config 4 | from src.utils.utils import load_json 5 | import os 6 | 7 | 8 | def run(config_path, pre_checkpoint_dir): 9 | config = process_config(config_path) 10 | AgentClass = globals()[config.agent] 11 | agent = AgentClass(config) 12 | 13 | if pre_checkpoint_dir is not None: 14 | # this will load both the weights and memory bank 15 | agent.load_checkpoint('checkpoint.pth.tar', pre_checkpoint_dir, load_memory_bank=True, 16 | load_model=True, load_optim=True, load_epoch=True) 17 | 18 | try: 19 | agent.run() 20 | agent.finalise() 21 | except KeyboardInterrupt: 22 | pass 23 | 24 | 25 | if __name__ == "__main__": 26 | import argparse 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('config', type=str, default='path to config file') 29 | args = parser.parse_args() 
30 | 31 | config_json = load_json(args.config) 32 | 33 | pre_checkpoint_dir = None 34 | if config_json['pretrained_exp_dir'] is not None: 35 | print("NOTE: found pretrained model...continue training") 36 | pre_checkpoint_dir = os.path.join(config_json['pretrained_exp_dir'], 'checkpoints') 37 | 38 | run(args.config, pre_checkpoint_dir) 39 | 40 | -------------------------------------------------------------------------------- /3D-ResNet/utils/fps.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | 7 | if __name__=="__main__": 8 | dir_path = sys.argv[1] 9 | dst_dir_path = sys.argv[2] 10 | 11 | for file_name in os.listdir(dir_path): 12 | if '.mp4' not in file_name: 13 | continue 14 | name, ext = os.path.splitext(file_name) 15 | dst_directory_path = os.path.join(dst_dir_path, name) 16 | 17 | video_file_path = os.path.join(dir_path, file_name) 18 | p = subprocess.Popen('ffprobe {}'.format(video_file_path), 19 | shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 20 | _, res = p.communicate() 21 | res = res.decode('utf-8') 22 | 23 | duration_index = res.find('Duration:') 24 | duration_str = res[(duration_index + 10):(duration_index + 21)] 25 | hour = float(duration_str[0:2]) 26 | minute = float(duration_str[3:5]) 27 | sec = float(duration_str[6:10]) 28 | total_sec = hour * 3600 + minute * 60 + sec 29 | 30 | n_frames = len(os.listdir(dst_directory_path)) 31 | if os.path.exists(os.path.join(dst_directory_path, 'fps')): 32 | n_frames -= 1 33 | 34 | fps = round(n_frames / total_sec, 2) 35 | 36 | print(video_file_path, os.path.exists(video_file_path), fps) 37 | with open(os.path.join(dst_directory_path, 'fps'), 'w') as fps_file: 38 | fps_file.write('{}\n'.format(fps)) 39 | -------------------------------------------------------------------------------- /3D-ResNet/utils/video_jpg_kinetics.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def class_process(dir_path, dst_dir_path, class_name): 7 | class_path = os.path.join(dir_path, class_name) 8 | if not os.path.isdir(class_path): 9 | return 10 | 11 | dst_class_path = os.path.join(dst_dir_path, class_name) 12 | if not os.path.exists(dst_class_path): 13 | os.mkdir(dst_class_path) 14 | 15 | for file_name in os.listdir(class_path): 16 | if '.mp4.' not in file_name: 17 | continue 18 | name, ext = os.path.splitext(file_name) 19 | if "." 
in name: 20 | name, ext = os.path.splitext(name) 21 | 22 | dst_directory_path = os.path.join(dst_class_path, name) 23 | 24 | video_file_path = os.path.join(class_path, file_name) 25 | try: 26 | if os.path.exists(dst_directory_path): 27 | continue 28 | else: 29 | os.mkdir(dst_directory_path) 30 | except: 31 | print(dst_directory_path) 32 | continue 33 | cmd = 'ffmpeg -i \"{}\" -vf scale=-1:240 \"{}/image_%05d.jpg\"'.format(video_file_path, dst_directory_path) 34 | print(cmd) 35 | subprocess.call(cmd, shell=True) 36 | print('\n') 37 | 38 | if __name__=="__main__": 39 | dir_path = sys.argv[1] 40 | dst_dir_path = sys.argv[2] 41 | 42 | for class_name in os.listdir(dir_path): 43 | class_process(dir_path, dst_dir_path, class_name) 44 | 45 | class_name = 'test' 46 | class_process(dir_path, dst_dir_path, class_name) 47 | -------------------------------------------------------------------------------- /LocalAggregation/config/kinetics_ir.json: -------------------------------------------------------------------------------- 1 | { 2 | "exp_base": "/data/ptokmako/local_agg/experiments_data", 3 | "debug": false, 4 | "exp_name": "kinetics_ir", 5 | "exp_id": "res18_IR", 6 | "agent": "KineticsAgent", 7 | "cuda": true, 8 | "gpu_device": [0], 9 | "seed": 1337, 10 | "data_loader_workers": 16, 11 | "data_params": { 12 | "image_size": 112, 13 | "sample_duration": 16, 14 | "load_fvs": false, 15 | "fv_path": "/data/ptokmako/kinetics_fv_nobox_nosketch/", 16 | "dataset_path": "/ssd1/ptokmako/kinetics_jpg/", 17 | "annotation_path": "splits.json" 18 | }, 19 | "model_params": { 20 | "embedding_dim": 128, 21 | "hidden_dim": 256, 22 | "n_filters": 64, 23 | "out_dim": 128, 24 | "resnet": true, 25 | "resnet_version": "resnet18" 26 | }, 27 | "loss_params": { 28 | "k": 4096, 29 | "t": 0.07, 30 | "m": 0.5, 31 | "kmeans_k": 30000, 32 | "n_kmeans": 10, 33 | "kmeans_freq": null, 34 | "loss": "InstanceDiscriminationLossModule" 35 | }, 36 | "optim_params": { 37 | "batch_size": 128, 38 | "learning_rate": 0.03, 39 | "lr_decay_schedule": [160, 190], 40 | "lr_decay_rate": 0.1, 41 | "momentum": 0.9, 42 | "weight_decay": 1e-4, 43 | "validate_freq": 1 44 | }, 45 | "num_epochs": 40, 46 | "validate": true, 47 | "copy_checkpoint_freq": null, 48 | "pretrained_exp_dir": null 49 | } 50 | -------------------------------------------------------------------------------- /LocalAggregation/config/kinetics_la_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "exp_base": "/data/ptokmako/local_agg/experiments_data", 3 | "debug":false, 4 | "exp_name": "kinetics_la_tune", 5 | 6 | "exp_id": "res18_LA", 7 | "agent": "KineticsAgent", 8 | "cuda": true, 9 | "gpu_device": [0], 10 | "faiss_gpu_device": null, 11 | "seed": 1337, 12 | "data_loader_workers": 32, 13 | "data_params": { 14 | "image_size": 112, 15 | "sample_duration": 16, 16 | "load_fvs": true, 17 | "fv_path": "/data/ptokmako/kinetics_fv/", 18 | "dataset_path": "/ssd1/ptokmako/kinetics_jpg/", 19 | "annotation_path": "splits.json" 20 | }, 21 | "model_params": { 22 | "out_dim": 128, 23 | "resnet_version": "resnet18" 24 | }, 25 | "loss_params": { 26 | "k": 4096, 27 | "t": 0.07, 28 | "m": 0.5, 29 | "kmeans_k": 6000, 30 | "n_kmeans": 3, 31 | "kmeans_freq": null, 32 | "loss": "LocalAggregationLossModule" 33 | }, 34 | "optim_params": { 35 | "batch_size": 256, 36 | "learning_rate": 0.03, 37 | "lr_decay_schedule": [10, 20], 38 | "lr_decay_rate": 0.1, 39 | "momentum": 0.9, 40 | "weight_decay": 1e-4, 41 | "validate_freq": 1 42 | }, 43 | "num_epochs": 25, 44 | 
"validate": false, 45 | "copy_checkpoint_freq": null, 46 | "instance_exp_dir": "/data/ptokmako/local_agg/experiments_data/experiments/kinetics_la/res18_LA", 47 | "pretrained_exp_dir": null, 48 | "cluster_checkpoint_dir": null 49 | } 50 | -------------------------------------------------------------------------------- /LocalAggregation/config/kinetics_la.json: -------------------------------------------------------------------------------- 1 | { 2 | "exp_base": "/data/ptokmako/local_agg/experiments_data", 3 | "debug":false, 4 | "exp_name": "kinetics_la_idt", 5 | 6 | "exp_id": "res18_LA", 7 | "agent": "KineticsAgent", 8 | "cuda": true, 9 | "gpu_device": [0], 10 | "faiss_gpu_device": null, 11 | "seed": 1337, 12 | "data_loader_workers": 32, 13 | "data_params": { 14 | "image_size": 112, 15 | "sample_duration": 16, 16 | "load_fvs": false, 17 | "fv_path": "/data/ptokmako/kinetics_fv_nobox_nosketch/", 18 | "dataset_path": "/ssd1/ptokmako/kinetics_jpg/", 19 | "annotation_path": "splits.json" 20 | }, 21 | "model_params": { 22 | "out_dim": 128, 23 | "resnet_version": "resnet18" 24 | }, 25 | "loss_params": { 26 | "k": 4096, 27 | "t": 0.07, 28 | "m": 0.5, 29 | "kmeans_k": 6000, 30 | "n_kmeans": 3, 31 | "kmeans_freq": null, 32 | "loss": "LocalAggregationLossModule" 33 | }, 34 | "optim_params": { 35 | "batch_size": 256, 36 | "learning_rate": 0.03, 37 | "lr_decay_schedule": [160, 190], 38 | "lr_decay_rate": 0.1, 39 | "momentum": 0.9, 40 | "weight_decay": 1e-4, 41 | "validate_freq": 1 42 | }, 43 | "num_epochs": 200, 44 | "validate": true, 45 | "copy_checkpoint_freq": null, 46 | "instance_exp_dir": "/data/ptokmako/local_agg/experiments_data/experiments/kinetics_ir/res18_IR", 47 | "pretrained_exp_dir": null, 48 | "cluster_checkpoint_dir": "models/idt_clusters" 49 | } 50 | -------------------------------------------------------------------------------- /3D-ResNet/utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | 4 | class AverageMeter(object): 5 | """Computes and stores the average and current value""" 6 | 7 | def __init__(self): 8 | self.reset() 9 | 10 | def reset(self): 11 | self.val = 0 12 | self.avg = 0 13 | self.sum = 0 14 | self.count = 0 15 | 16 | def update(self, val, n=1): 17 | self.val = val 18 | self.sum += val * n 19 | self.count += n 20 | self.avg = self.sum / self.count 21 | 22 | 23 | class Logger(object): 24 | 25 | def __init__(self, path, header): 26 | self.log_file = open(path, 'w') 27 | self.logger = csv.writer(self.log_file, delimiter='\t') 28 | 29 | self.logger.writerow(header) 30 | self.header = header 31 | 32 | def __del(self): 33 | self.log_file.close() 34 | 35 | def log(self, values): 36 | write_values = [] 37 | for col in self.header: 38 | assert col in values 39 | write_values.append(values[col]) 40 | 41 | self.logger.writerow(write_values) 42 | self.log_file.flush() 43 | 44 | 45 | def load_value_file(file_path): 46 | with open(file_path, 'r') as input_file: 47 | value = float(input_file.read().rstrip('\n\r')) 48 | 49 | return value 50 | 51 | 52 | def calculate_accuracy(outputs, targets, k=1): 53 | batch_size = targets.size(0) 54 | 55 | _, pred = outputs.topk(k, 1, True) 56 | pred = pred.t() 57 | correct = pred.eq(targets.view(1, -1)) 58 | n_correct_elems = correct.float().sum().item() 59 | 60 | return n_correct_elems / batch_size 61 | -------------------------------------------------------------------------------- /3D-ResNet/utils/video_jpg_ucf101_hmdb51.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def class_process(dir_path, dst_dir_path, class_name): 7 | class_path = os.path.join(dir_path, class_name) 8 | if not os.path.isdir(class_path): 9 | return 10 | 11 | dst_class_path = os.path.join(dst_dir_path, class_name) 12 | if not os.path.exists(dst_class_path): 13 | os.mkdir(dst_class_path) 14 | 15 | for file_name in os.listdir(class_path): 16 | if '.avi' not in file_name: 17 | continue 18 | name, ext = os.path.splitext(file_name) 19 | dst_directory_path = os.path.join(dst_class_path, name) 20 | 21 | video_file_path = os.path.join(class_path, file_name) 22 | try: 23 | if os.path.exists(dst_directory_path): 24 | if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')): 25 | subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True) 26 | print('remove {}'.format(dst_directory_path)) 27 | os.mkdir(dst_directory_path) 28 | else: 29 | continue 30 | else: 31 | os.mkdir(dst_directory_path) 32 | except: 33 | print(dst_directory_path) 34 | continue 35 | cmd = 'ffmpeg -i \"{}\" -vf scale=-1:240 \"{}/image_%05d.jpg\"'.format(video_file_path, dst_directory_path) 36 | print(cmd) 37 | subprocess.call(cmd, shell=True) 38 | print('\n') 39 | 40 | if __name__=="__main__": 41 | dir_path = sys.argv[1] 42 | dst_dir_path = sys.argv[2] 43 | 44 | for class_name in os.listdir(dir_path): 45 | class_process(dir_path, dst_dir_path, class_name) 46 | -------------------------------------------------------------------------------- /LocalAggregation/src/idt/sketching.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def sketch_batch(inp, sd1, nSketchDimC, nFeatDimC, nIsFloatC): 5 | inp = inp.t() 6 | sd1_t = torch.from_numpy(sd1) 7 | sd1 = sd1_t 8 | # print('sd1: ', sd1.shape) 9 | # print(inp.shape) 10 | if nIsFloatC == 1: 11 | inp = inp.type(torch.FloatTensor) 12 | else: 13 | inp = inp.type(torch.DoubleTensor) 14 | 15 | inp = inp.t() 16 | # print(inp.shape) 17 | out1 = inp.mm(sd1) 18 | 19 | res = out1 20 | 21 | if nIsFloatC != 1: 22 | res = res.type(torch.FloatTensor) 23 | 24 | return res 25 | 26 | def choose_h_sk_mat(nSketchDimC, nFeatDimC): 27 | nRep=int( np.ceil(nFeatDimC / nSketchDimC) ) 28 | 29 | rand_array=np.array([]).astype(int) 30 | for i in range(nRep): 31 | rand_array_i = np.random.permutation(int(nSketchDimC)) 32 | rand_array = np.concatenate( (rand_array, rand_array_i), axis=0 ) 33 | 34 | return rand_array[0:nFeatDimC] 35 | 36 | def choose_s_sk_mat(nSketchDimC, nFeatDimC): 37 | nRep = int( np.ceil(nFeatDimC / nSketchDimC) ) 38 | 39 | rand_array = np.array([]).astype(int) 40 | for i in range(nRep): 41 | rand_array_i = np.array([-1, 1]).astype(int) 42 | rand_array = np.concatenate( (rand_array, rand_array_i), axis=0 ) 43 | 44 | rand_array = np.random.permutation(rand_array) 45 | 46 | return rand_array[0: nFeatDimC] 47 | 48 | def create_s_dense(hi, si): 49 | c = np.max(hi) + 1 50 | d = len(hi) 51 | out = np.zeros((d, c)) # in*out 52 | for i in range(d): 53 | out[i, hi[i]] = si[i] 54 | return out 55 | 56 | -------------------------------------------------------------------------------- /LocalAggregation/src/datasets/temporal_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | class TemporalRandomCrop(object): 5 | """Temporally crop the 
given frame indices at a random location. 6 | 7 | If the number of frames is less than the size, 8 | loop the indices as many times as necessary to satisfy the size. 9 | 10 | Args: 11 | size (int): Desired output size of the crop. 12 | """ 13 | 14 | def __init__(self, size): 15 | self.size = size 16 | 17 | def __call__(self, frame_indices): 18 | """ 19 | Args: 20 | frame_indices (list): frame indices to be cropped. 21 | Returns: 22 | list: Cropped frame indices. 23 | """ 24 | 25 | rand_end = max(0, len(frame_indices) - self.size - 1) 26 | begin_index = random.randint(0, rand_end) 27 | end_index = min(begin_index + self.size, len(frame_indices)) 28 | 29 | out = frame_indices[begin_index:end_index] 30 | 31 | for index in out: 32 | if len(out) >= self.size: 33 | break 34 | out.append(index) 35 | 36 | return out 37 | 38 | 39 | class TemporalCenterCrop(object): 40 | """Temporally crop the given frame indices at a center. 41 | 42 | If the number of frames is less than the size, 43 | loop the indices as many times as necessary to satisfy the size. 44 | 45 | Args: 46 | size (int): Desired output size of the crop. 47 | """ 48 | 49 | def __init__(self, size): 50 | self.size = size 51 | 52 | def __call__(self, frame_indices): 53 | """ 54 | Args: 55 | frame_indices (list): frame indices to be cropped. 56 | Returns: 57 | list: Cropped frame indices. 58 | """ 59 | 60 | center_index = len(frame_indices) // 2 61 | begin_index = max(0, center_index - (self.size // 2)) 62 | end_index = min(begin_index + self.size, len(frame_indices)) 63 | 64 | out = frame_indices[begin_index:end_index] 65 | 66 | for index in out: 67 | if len(out) >= self.size: 68 | break 69 | out.append(index) 70 | 71 | return out 72 | -------------------------------------------------------------------------------- /3D-ResNet/utils/kinetics_json.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | def convert_csv_to_dict(csv_path, subset): 8 | data = pd.read_csv(csv_path) 9 | keys = [] 10 | key_labels = [] 11 | for i in range(data.shape[0]): 12 | row = data.ix[i, :] 13 | basename = '%s_%s_%s' % (row['youtube_id'], 14 | '%06d' % row['time_start'], 15 | '%06d' % row['time_end']) 16 | keys.append(basename) 17 | if subset != 'testing': 18 | key_labels.append(row['label']) 19 | 20 | database = {} 21 | for i in range(len(keys)): 22 | key = keys[i] 23 | database[key] = {} 24 | database[key]['subset'] = subset 25 | if subset != 'testing': 26 | label = key_labels[i] 27 | database[key]['annotations'] = {'label': label} 28 | else: 29 | database[key]['annotations'] = {} 30 | 31 | return database 32 | 33 | def load_labels(train_csv_path): 34 | data = pd.read_csv(train_csv_path) 35 | return data['label'].unique().tolist() 36 | 37 | def convert_kinetics_csv_to_activitynet_json(train_csv_path, val_csv_path, test_csv_path, dst_json_path): 38 | labels = load_labels(train_csv_path) 39 | train_database = convert_csv_to_dict(train_csv_path, 'training') 40 | print(len(train_database)) 41 | val_database = convert_csv_to_dict(val_csv_path, 'validation') 42 | test_database = convert_csv_to_dict(test_csv_path, 'testing') 43 | 44 | dst_data = {} 45 | dst_data['labels'] = labels 46 | dst_data['database'] = {} 47 | dst_data['database'].update(train_database) 48 | dst_data['database'].update(val_database) 49 | dst_data['database'].update(test_database) 50 | 51 | with open(dst_json_path, 'w') as dst_file: 52 | 
json.dump(dst_data, dst_file) 53 | 54 | if __name__=="__main__": 55 | train_csv_path = sys.argv[1] 56 | val_csv_path = sys.argv[2] 57 | test_csv_path = sys.argv[3] 58 | dst_json_path = sys.argv[4] 59 | 60 | convert_kinetics_csv_to_activitynet_json( 61 | train_csv_path, val_csv_path, test_csv_path, dst_json_path) 62 | -------------------------------------------------------------------------------- /3D-ResNet/utils/hmdb51_json.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | def convert_csv_to_dict(csv_dir_path, split_index): 8 | database = {} 9 | for filename in os.listdir(csv_dir_path): 10 | if 'split{}'.format(split_index) not in filename: 11 | continue 12 | 13 | data = pd.read_csv(os.path.join(csv_dir_path, filename), 14 | delimiter=' ', header=None) 15 | keys = [] 16 | subsets = [] 17 | for i in range(data.shape[0]): 18 | row = data.ix[i, :] 19 | if row[1] == 0: 20 | continue 21 | elif row[1] == 1: 22 | subset = 'training' 23 | elif row[1] == 2: 24 | subset = 'validation' 25 | 26 | keys.append(row[0].split('.')[0]) 27 | subsets.append(subset) 28 | 29 | for i in range(len(keys)): 30 | key = keys[i] 31 | database[key] = {} 32 | database[key]['subset'] = subsets[i] 33 | label = '_'.join(filename.split('_')[:-2]) 34 | database[key]['annotations'] = {'label': label} 35 | 36 | return database 37 | 38 | def get_labels(csv_dir_path): 39 | labels = [] 40 | for name in os.listdir(csv_dir_path): 41 | labels.append('_'.join(name.split('_')[:-2])) 42 | return sorted(list(set(labels))) 43 | 44 | def convert_hmdb51_csv_to_activitynet_json(csv_dir_path, split_index, dst_json_path): 45 | labels = get_labels(csv_dir_path) 46 | database = convert_csv_to_dict(csv_dir_path, split_index) 47 | 48 | dst_data = {} 49 | dst_data['labels'] = labels 50 | dst_data['database'] = {} 51 | dst_data['database'].update(database) 52 | 53 | with open(dst_json_path, 'w') as dst_file: 54 | json.dump(dst_data, dst_file) 55 | 56 | if __name__ == '__main__': 57 | csv_dir_path = sys.argv[1] 58 | 59 | for split_index in range(1, 4): 60 | dst_json_path = os.path.join(csv_dir_path, 'hmdb51_{}.json'.format(split_index)) 61 | convert_hmdb51_csv_to_activitynet_json(csv_dir_path, split_index, dst_json_path) -------------------------------------------------------------------------------- /3D-ResNet/utils/ucf101_json.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | def convert_csv_to_dict(csv_path, subset): 8 | data = pd.read_csv(csv_path, delimiter=' ', header=None) 9 | keys = [] 10 | key_labels = [] 11 | for i in range(data.shape[0]): 12 | row = data.ix[i, :] 13 | slash_rows = data.ix[i, 0].split('/') 14 | class_name = slash_rows[0] 15 | basename = slash_rows[1].split('.')[0] 16 | 17 | keys.append(basename) 18 | key_labels.append(class_name) 19 | 20 | database = {} 21 | for i in range(len(keys)): 22 | key = keys[i] 23 | database[key] = {} 24 | database[key]['subset'] = subset 25 | label = key_labels[i] 26 | database[key]['annotations'] = {'label': label} 27 | 28 | return database 29 | 30 | def load_labels(label_csv_path): 31 | data = pd.read_csv(label_csv_path, delimiter=' ', header=None) 32 | labels = [] 33 | for i in range(data.shape[0]): 34 | labels.append(data.ix[i, 1]) 35 | return labels 36 | 37 | def 
convert_ucf101_csv_to_activitynet_json(label_csv_path, train_csv_path, 38 | val_csv_path, dst_json_path): 39 | labels = load_labels(label_csv_path) 40 | train_database = convert_csv_to_dict(train_csv_path, 'training') 41 | val_database = convert_csv_to_dict(val_csv_path, 'validation') 42 | 43 | dst_data = {} 44 | dst_data['labels'] = labels 45 | dst_data['database'] = {} 46 | dst_data['database'].update(train_database) 47 | dst_data['database'].update(val_database) 48 | 49 | with open(dst_json_path, 'w') as dst_file: 50 | json.dump(dst_data, dst_file) 51 | 52 | if __name__ == '__main__': 53 | csv_dir_path = sys.argv[1] 54 | 55 | for split_index in range(1, 4): 56 | label_csv_path = os.path.join(csv_dir_path, 'classInd.txt') 57 | train_csv_path = os.path.join(csv_dir_path, 'trainlist0{}.txt'.format(split_index)) 58 | val_csv_path = os.path.join(csv_dir_path, 'testlist0{}.txt'.format(split_index)) 59 | dst_json_path = os.path.join(csv_dir_path, 'ucf101_0{}.json'.format(split_index)) 60 | 61 | convert_ucf101_csv_to_activitynet_json(label_csv_path, train_csv_path, 62 | val_csv_path, dst_json_path) 63 | -------------------------------------------------------------------------------- /LocalAggregation/scripts/localagg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from copy import deepcopy 4 | from src.agents.agents import * 5 | from src.utils.setup import process_config, _process_config 6 | from src.utils.utils import load_json 7 | 8 | 9 | def run(config_path, ir_checkpoint_dir=None, pre_checkpoint_dir=None, cluster_checkpoint_dir=None): 10 | config = process_config(config_path) 11 | AgentClass = globals()[config.agent] 12 | agent = AgentClass(config) 13 | 14 | if ir_checkpoint_dir is not None: 15 | agent.load_checkpoint('checkpoint.pth.tar', ir_checkpoint_dir, load_memory_bank=True, 16 | load_model=True, load_optim=False, load_epoch=False, 17 | cluster_label_dir=cluster_checkpoint_dir) 18 | 19 | if pre_checkpoint_dir is not None: 20 | agent.load_checkpoint('checkpoint.pth.tar', pre_checkpoint_dir, load_memory_bank=True, 21 | load_model=True, load_optim=True, load_epoch=True, 22 | cluster_label_dir=cluster_checkpoint_dir) 23 | 24 | try: 25 | agent.run() 26 | agent.finalise() 27 | except KeyboardInterrupt: 28 | pass 29 | 30 | 31 | if __name__ == "__main__": 32 | import argparse 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('config', type=str, default='path to config file') 35 | args = parser.parse_args() 36 | 37 | config_json = load_json(args.config) 38 | 39 | ir_checkpoint_dir = None 40 | pre_checkpoint_dir = None 41 | cluster_checkpoint_dir = None 42 | 43 | if config_json['pretrained_exp_dir'] is not None: 44 | print("NOTE: found pretrained model...Continue training") 45 | pre_checkpoint_dir = os.path.join(config_json['pretrained_exp_dir'], 'checkpoints') 46 | 47 | if config_json['cluster_checkpoint_dir'] is not None: 48 | print("NOTE: loading cluster assignment") 49 | cluster_checkpoint_dir = os.path.join(config_json['cluster_checkpoint_dir'], 'checkpoints') 50 | 51 | # If pre_checkpoint_dir already exits, ignore ir_checkpoint_dir 52 | if pre_checkpoint_dir is None and config_json['instance_exp_dir'] is not None: 53 | print("NOTE: found IR model...") 54 | ir_checkpoint_dir = os.path.join(config_json['instance_exp_dir'], 'checkpoints') 55 | 56 | run(args.config, ir_checkpoint_dir=ir_checkpoint_dir, pre_checkpoint_dir=pre_checkpoint_dir, 57 | cluster_checkpoint_dir=cluster_checkpoint_dir) 58 | 
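Usage sketch for the two LocalAggregation entry points above (scripts/instance.py and scripts/localagg.py). This is a hypothetical invocation, not a command taken from the repository: it assumes init_env.sh has been sourced, PyTorch and faiss are installed, and the dataset/experiment paths inside the JSON configs have been edited for the local machine.

    source init_env.sh
    python scripts/instance.py config/kinetics_ir.json   # stage 1: instance-recognition (IR) pretraining
    python scripts/localagg.py config/kinetics_la.json    # stage 2: local aggregation, starting from the IR checkpoint

The order mirrors the checkpoint-loading logic in localagg.py: if "pretrained_exp_dir" is set, LA training is resumed from that experiment; otherwise the script falls back to the IR checkpoint referenced by "instance_exp_dir" (and, if given, the cluster assignments in "cluster_checkpoint_dir").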
-------------------------------------------------------------------------------- /3D-ResNet/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import os 5 | import sys 6 | import numpy as np 7 | 8 | from utils import AverageMeter, calculate_accuracy 9 | 10 | 11 | def train_epoch(epoch, data_loader, model, criterion, optimizer, opt, 12 | epoch_logger, batch_logger, writer): 13 | print('train at epoch {}'.format(epoch)) 14 | 15 | model.train() 16 | 17 | batch_time = AverageMeter() 18 | data_time = AverageMeter() 19 | losses = AverageMeter() 20 | accuracies = AverageMeter() 21 | 22 | dataset = "ucf" 23 | if opt.dataset == "hmdb51": 24 | dataset = "hmdb" 25 | 26 | end_time = time.time() 27 | for i, (inputs, targets) in enumerate(data_loader): 28 | acc = 0 29 | data_time.update(time.time() - end_time) 30 | 31 | if not opt.no_cuda: 32 | targets = targets.cuda(async=True) 33 | if opt.dataset == "hmdb51": 34 | targets -= 1 35 | 36 | outputs = model(inputs.cuda()) 37 | loss = criterion(outputs, targets) 38 | 39 | losses.update(loss.item(), inputs.size(0)) 40 | 41 | acc = calculate_accuracy(outputs, targets) 42 | accuracies.update(acc, inputs.size(0)) 43 | 44 | optimizer.zero_grad() 45 | loss.backward() 46 | optimizer.step() 47 | 48 | batch_time.update(time.time() - end_time) 49 | end_time = time.time() 50 | 51 | writer.add_scalar('%s/train_loss' % dataset, losses.val, (epoch - 1) * len(data_loader) + (i + 1)) 52 | 53 | print('Epoch: [{0}][{1}/{2}]\t' 54 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 55 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 56 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 57 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 58 | epoch, 59 | i + 1, 60 | len(data_loader), 61 | batch_time=batch_time, 62 | data_time=data_time, 63 | loss=losses, 64 | acc=accuracies)) 65 | 66 | if epoch % opt.checkpoint == 0: 67 | save_file_path = os.path.join(opt.result_path, 68 | 'save_{}.pth'.format(epoch)) 69 | states = { 70 | 'epoch': epoch + 1, 71 | 'arch': opt.arch, 72 | 'state_dict': model.state_dict(), 73 | 'optimizer': optimizer.state_dict(), 74 | } 75 | torch.save(states, save_file_path) 76 | -------------------------------------------------------------------------------- /3D-ResNet/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import torch.nn.functional as F 4 | import time 5 | import os 6 | import sys 7 | import json 8 | 9 | from utils import AverageMeter 10 | 11 | 12 | def calculate_video_results(output_buffer, video_id, test_results, class_names): 13 | video_outputs = torch.stack(output_buffer) 14 | average_scores = torch.mean(video_outputs, dim=0) 15 | sorted_scores, locs = torch.topk(average_scores, k=10) 16 | 17 | video_results = [] 18 | for i in range(sorted_scores.size(0)): 19 | video_results.append({ 20 | 'label': class_names[locs[i].item()], 21 | 'score': sorted_scores[i].item() 22 | }) 23 | 24 | test_results['results'][video_id] = video_results 25 | 26 | 27 | def test(data_loader, model, opt, class_names): 28 | print('test') 29 | 30 | model.eval() 31 | 32 | batch_time = AverageMeter() 33 | data_time = AverageMeter() 34 | 35 | end_time = time.time() 36 | output_buffer = [] 37 | previous_video_id = '' 38 | test_results = {'results': {}} 39 | with torch.no_grad(): 40 | for i, (inputs, targets) in enumerate(data_loader): 41 | 
data_time.update(time.time() - end_time) 42 | 43 | inputs = Variable(inputs) 44 | outputs = model(inputs) 45 | if not opt.no_softmax_in_test: 46 | outputs = F.softmax(outputs) 47 | 48 | for j in range(outputs.size(0)): 49 | if not (i == 0 and j == 0) and targets[j] != previous_video_id: 50 | calculate_video_results(output_buffer, previous_video_id, 51 | test_results, class_names) 52 | output_buffer = [] 53 | output_buffer.append(outputs[j].data.cpu()) 54 | previous_video_id = targets[j] 55 | 56 | if (i % 100) == 0: 57 | with open( 58 | os.path.join(opt.result_path, '{}.json'.format( 59 | opt.test_subset)), 'w') as f: 60 | json.dump(test_results, f) 61 | 62 | batch_time.update(time.time() - end_time) 63 | end_time = time.time() 64 | 65 | print('[{}/{}]\t' 66 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 67 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format( 68 | i + 1, 69 | len(data_loader), 70 | batch_time=batch_time, 71 | data_time=data_time)) 72 | 73 | with open( 74 | os.path.join(opt.result_path, '{}.json'.format(opt.test_subset)), 75 | 'w') as f: 76 | json.dump(test_results, f) 77 | -------------------------------------------------------------------------------- /LocalAggregation/src/idt/compute_fv_models.py: -------------------------------------------------------------------------------- 1 | import shutil, random, os 2 | import pandas as pd 3 | import numpy as np 4 | import gzip 5 | from sklearn import decomposition 6 | import pickle 7 | from sklearn.mixture import GaussianMixture 8 | from sketching import * 9 | from extract_idt import fisher_vector 10 | import argparse 11 | from os.path import join 12 | 13 | 14 | def get_parser(): 15 | parser = argparse.ArgumentParser(description="FV model estimation") 16 | parser.add_argument("--idt_path", help="Path to precomputed IDTs.") 17 | return parser 18 | 19 | 20 | if __name__ == "__main__": 21 | # define an empty numpy array for concatenating features 22 | all_features = np.array([]) 23 | 24 | args = get_parser().parse_args() 25 | dirpath = args.idt_path 26 | 27 | counter = 0 28 | num_feats = 500 29 | 30 | filenames = os.listdir(dirpath) 31 | for fname in filenames: 32 | if fname.endswith(".gz"): 33 | srcpath = os.path.join(dirpath, fname) 34 | print("concatenating features from: ", srcpath) 35 | counter = counter + 1 36 | df = pd.read_table(gzip.open(srcpath), sep='\s+', header=None) 37 | 38 | # turn pandas dataframe into array 39 | df_array = np.round(df.values, decimals=3) 40 | 41 | array_sum = np.sum(df_array) 42 | array_has_nan = np.isnan(array_sum) 43 | if (array_has_nan): 44 | continue 45 | 46 | if df_array.shape[0] < num_feats: 47 | print('less than %d' % num_feats) 48 | else: 49 | idx = np.random.randint(df_array.shape[0], size=num_feats) 50 | df_array = df_array[idx, :] 51 | 52 | # concatenate all the features 53 | print('stack feature vectors...', counter) 54 | all_features = np.vstack([all_features, df_array]) if all_features.size else df_array 55 | print('Done!-----------------------------') 56 | 57 | features = all_features[:, 10:436] 58 | 59 | trajectories = pd.DataFrame(features) 60 | print('The feature dimension after random sampling is: ', trajectories.shape) 61 | print(trajectories.describe()) 62 | 63 | pca = decomposition.PCA(0.90) 64 | pca_features = pca.fit_transform(trajectories) 65 | 66 | print(pca_features.shape) 67 | 68 | filename = join(dirpath, 'pca_model.sav') 69 | pickle.dump(pca, open(filename, 'wb')) 70 | 71 | K = 256 72 | gmm = GaussianMixture(n_components=K, covariance_type='diag', 
n_init=2, max_iter=200) 73 | 74 | print("Start the GMM estimation...") 75 | gmm.fit(pca_features) 76 | print("A GMM estimation has been finished!") 77 | 78 | filename = join(dirpath, 'gmm_diag_model.sav') 79 | pickle.dump(gmm, open(filename, 'wb')) 80 | 81 | SKETCH_DIM_feat = 2000 82 | fv = fisher_vector(pca_features[0, :], gmm) 83 | print(fv.shape) 84 | 85 | d_fv = fv.shape[0] 86 | h_fv = choose_h_sk_mat(SKETCH_DIM_feat, d_fv) 87 | s_fv = choose_s_sk_mat(2, d_fv) 88 | sdense_fv = create_s_dense(h_fv, s_fv) 89 | 90 | filename = join(dirpath, 'sketching_proj.sav') 91 | pickle.dump(sdense_fv, open(filename, 'wb')) 92 | -------------------------------------------------------------------------------- /3D-ResNet/temporal_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | 5 | class LoopPadding(object): 6 | 7 | def __init__(self, size): 8 | self.size = size 9 | 10 | def __call__(self, frame_indices): 11 | out = frame_indices 12 | 13 | for index in out: 14 | if len(out) >= self.size: 15 | break 16 | out.append(index) 17 | 18 | return out 19 | 20 | 21 | class TemporalBeginCrop(object): 22 | """Temporally crop the given frame indices at a beginning. 23 | 24 | If the number of frames is less than the size, 25 | loop the indices as many times as necessary to satisfy the size. 26 | 27 | Args: 28 | size (int): Desired output size of the crop. 29 | """ 30 | 31 | def __init__(self, size): 32 | self.size = size 33 | 34 | def __call__(self, frame_indices): 35 | out = frame_indices[:self.size] 36 | 37 | for index in out: 38 | if len(out) >= self.size: 39 | break 40 | out.append(index) 41 | 42 | return out 43 | 44 | 45 | class TemporalCenterCrop(object): 46 | """Temporally crop the given frame indices at a center. 47 | 48 | If the number of frames is less than the size, 49 | loop the indices as many times as necessary to satisfy the size. 50 | 51 | Args: 52 | size (int): Desired output size of the crop. 53 | """ 54 | 55 | def __init__(self, size): 56 | self.size = size 57 | 58 | def __call__(self, frame_indices): 59 | """ 60 | Args: 61 | frame_indices (list): frame indices to be cropped. 62 | Returns: 63 | list: Cropped frame indices. 64 | """ 65 | 66 | center_index = len(frame_indices) // 2 67 | begin_index = max(0, center_index - (self.size // 2)) 68 | end_index = min(begin_index + self.size, len(frame_indices)) 69 | 70 | out = frame_indices[begin_index:end_index] 71 | 72 | for index in out: 73 | if len(out) >= self.size: 74 | break 75 | out.append(index) 76 | 77 | return out 78 | 79 | 80 | class TemporalStride(object): 81 | 82 | def __init__(self, stride): 83 | self.stride = stride 84 | 85 | def __call__(self, frame_indices): 86 | out = frame_indices[0::self.stride] 87 | 88 | return out 89 | 90 | 91 | class TemporalRandomCrop(object): 92 | """Temporally crop the given frame indices at a random location. 93 | 94 | If the number of frames is less than the size, 95 | loop the indices as many times as necessary to satisfy the size. 96 | 97 | Args: 98 | size (int): Desired output size of the crop. 99 | """ 100 | 101 | def __init__(self, size): 102 | self.size = size 103 | 104 | def __call__(self, frame_indices): 105 | """ 106 | Args: 107 | frame_indices (list): frame indices to be cropped. 108 | Returns: 109 | list: Cropped frame indices. 
110 | """ 111 | 112 | rand_end = max(0, len(frame_indices) - self.size - 1) 113 | begin_index = random.randint(0, rand_end) 114 | end_index = min(begin_index + self.size, len(frame_indices)) 115 | 116 | out = frame_indices[begin_index:end_index] 117 | 118 | for index in out: 119 | if len(out) >= self.size: 120 | break 121 | out.append(index) 122 | 123 | return out 124 | -------------------------------------------------------------------------------- /LocalAggregation/src/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import shutil 4 | import torch 5 | import numpy as np 6 | from collections import Counter, OrderedDict 7 | 8 | 9 | class AverageMeter(object): 10 | """Computes and stores the average and current value""" 11 | def __init__(self): 12 | self.reset() 13 | 14 | def reset(self): 15 | self.val = 0 16 | self.avg = 0 17 | self.sum = 0 18 | self.count = 0 19 | 20 | def update(self, val, n=1): 21 | self.val = val 22 | self.sum += val * n 23 | self.count += n 24 | self.avg = self.sum / self.count 25 | 26 | 27 | class ProgressMeter(object): 28 | def __init__(self, num_batches, meters, prefix=""): 29 | self.batch_fmtstr = self._get_batch_fmtstr(num_batches) 30 | self.meters = meters 31 | self.prefix = prefix 32 | 33 | def display(self, batch): 34 | entries = [self.prefix + self.batch_fmtstr.format(batch)] 35 | entries += [str(meter) for meter in self.meters] 36 | print('\t'.join(entries)) 37 | 38 | def _get_batch_fmtstr(self, num_batches): 39 | num_digits = len(str(num_batches // 1)) 40 | fmt = '{:' + str(num_digits) + 'd}' 41 | return '[' + fmt + '/' + fmt.format(num_batches) + ']' 42 | 43 | def copy_checkpoint(folder='./', filename='checkpoint.pth.tar', 44 | copyname='copy.pth.tar'): 45 | shutil.copyfile(os.path.join(folder, filename), 46 | os.path.join(folder, copyname)) 47 | 48 | 49 | def save_checkpoint(state, is_best, folder='./', filename='checkpoint.pth.tar'): 50 | if not os.path.isdir(folder): 51 | os.mkdir(folder) 52 | torch.save(state, os.path.join(folder, filename)) 53 | if is_best: 54 | shutil.copyfile(os.path.join(folder, filename), 55 | os.path.join(folder, 'model_best.pth.tar')) 56 | 57 | 58 | def load_json(f_path): 59 | with open(f_path, 'r') as f: 60 | return json.load(f) 61 | 62 | 63 | def save_json(obj, f_path): 64 | with open(f_path, 'w') as f: 65 | json.dump(obj, f, ensure_ascii=False) 66 | 67 | 68 | class OrderedCounter(Counter, OrderedDict): 69 | """Counter that remembers the order elements are first encountered""" 70 | 71 | def __repr__(self): 72 | return '%s(%r)' % (self.__class__.__name__, OrderedDict(self)) 73 | 74 | def __reduce__(self): 75 | return self.__class__, (OrderedDict(self),) 76 | 77 | 78 | def adjust_learning_rate(epoch, opt_params, optimizer): 79 | if opt_params.lr_decay_schedule is not None: 80 | steps = np.sum(epoch > np.asarray(opt_params.lr_decay_schedule)) 81 | assert isinstance(opt_params.lr_decay_rate, float) 82 | if steps > 0: 83 | new_lr = opt_params.learning_rate * (opt_params.lr_decay_rate ** steps) 84 | for param_group in optimizer.param_groups: 85 | param_group['lr'] = new_lr 86 | 87 | 88 | def exclude_bn_weight_bias_from_weight_decay(model, weight_decay): 89 | decay = [] 90 | no_decay = [] 91 | for name, param in model.named_parameters(): 92 | if not param.requires_grad: 93 | continue 94 | # if len(param.shape) == 1 or name in skip_list: 95 | if 'bn' in name: 96 | no_decay.append(param) 97 | else: 98 | decay.append(param) 99 | return [ 100 | {'params': 
no_decay, 'weight_decay': 0.}, 101 | {'params': decay, 'weight_decay': weight_decay} 102 | ] -------------------------------------------------------------------------------- /3D-ResNet/validation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import sys 5 | import numpy as np 6 | 7 | from utils import AverageMeter, calculate_accuracy 8 | 9 | 10 | def val_epoch(epoch, data_loader, model, criterion, opt, logger, writer): 11 | print('validation at epoch {}'.format(epoch)) 12 | 13 | model.eval() 14 | 15 | batch_time = AverageMeter() 16 | data_time = AverageMeter() 17 | losses = AverageMeter() 18 | accuracies1 = AverageMeter() 19 | accuracies5 = AverageMeter() 20 | 21 | end_time = time.time() 22 | with torch.no_grad(): 23 | for i, (inputs, targets) in enumerate(data_loader): 24 | acc1 = 0 25 | data_time.update(time.time() - end_time) 26 | 27 | if not opt.no_cuda: 28 | targets = targets.cuda(async=True) 29 | if opt.dataset == "hmdb51": 30 | targets -= 1 31 | 32 | outputs = model(inputs.cuda()) 33 | loss = criterion(outputs, targets) 34 | 35 | losses.update(loss.item(), inputs.size(0)) 36 | 37 | acc1 = calculate_accuracy(outputs, targets) 38 | acc5 = calculate_accuracy(outputs, targets, 5) 39 | accuracies5.update(acc5, inputs.size(0)) 40 | accuracies1.update(acc1, inputs.size(0)) 41 | 42 | batch_time.update(time.time() - end_time) 43 | end_time = time.time() 44 | 45 | print('Epoch: [{0}][{1}/{2}]\t' 46 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 47 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 48 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 49 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 50 | epoch, 51 | i + 1, 52 | len(data_loader), 53 | batch_time=batch_time, 54 | data_time=data_time, 55 | loss=losses, 56 | acc=accuracies1)) 57 | 58 | dataset = "ucf" 59 | if opt.dataset == "hmdb51": 60 | dataset = "hmdb" 61 | 62 | writer.add_scalar('%s/val_top5' % dataset, accuracies5.avg, epoch) 63 | writer.add_scalar('%s/val_top1' % dataset, accuracies1.avg, epoch) 64 | writer.add_scalar('%s/val_loss' % dataset, losses.avg, epoch) 65 | 66 | return losses.avg 67 | 68 | 69 | def val_final(data_loader, model, opt): 70 | print('Final validation') 71 | 72 | model.eval() 73 | 74 | batch_time = AverageMeter() 75 | data_time = AverageMeter() 76 | accuracies1 = AverageMeter() 77 | accuracies5 = AverageMeter() 78 | 79 | end_time = time.time() 80 | with torch.no_grad(): 81 | for i, inputs in enumerate(data_loader): 82 | data_time.update(time.time() - end_time) 83 | 84 | inputs1, inputs2, inputs3, inputs4, inputs5, targets = inputs 85 | 86 | if not opt.no_cuda: 87 | targets = targets.cuda(async=True) 88 | if opt.dataset == "hmdb51": 89 | targets -= 1 90 | 91 | outputs1 = model(inputs1) 92 | outputs2 = model(inputs2) 93 | outputs3 = model(inputs3) 94 | outputs4 = model(inputs4) 95 | outputs5 = model(inputs5) 96 | 97 | outputs = (outputs1 + outputs2 + outputs3 + outputs4 + outputs5) / 5.0 98 | 99 | acc1 = calculate_accuracy(outputs, targets) 100 | acc5 = calculate_accuracy(outputs, targets, 5) 101 | 102 | accuracies5.update(acc5, inputs1.size(0)) 103 | accuracies1.update(acc1, inputs1.size(0)) 104 | 105 | batch_time.update(time.time() - end_time) 106 | end_time = time.time() 107 | 108 | print('Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 109 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 110 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 111 | i + 1, 112 | len(data_loader), 113 | 
batch_time=batch_time, 114 | data_time=data_time, 115 | acc=accuracies1)) 116 | -------------------------------------------------------------------------------- /3D-ResNet/dataset.py: -------------------------------------------------------------------------------- 1 | from datasets.kinetics import Kinetics 2 | from datasets.ucf101 import UCF101 3 | from datasets.hmdb51 import HMDB51 4 | 5 | 6 | def get_training_set(opt, spatial_transform, temporal_transform, 7 | target_transform): 8 | assert opt.dataset in ['kinetics', 'ucf101', 'hmdb51'] 9 | 10 | if opt.dataset == 'kinetics': 11 | training_data = Kinetics( 12 | opt.video_path, 13 | opt.annotation_path, 14 | 'training', 15 | spatial_transform=spatial_transform, 16 | temporal_transform=temporal_transform, 17 | target_transform=target_transform) 18 | elif opt.dataset == 'ucf101': 19 | training_data = UCF101( 20 | opt.video_path, 21 | opt.annotation_path, 22 | 'training', 23 | spatial_transform=spatial_transform, 24 | temporal_transform=temporal_transform, 25 | target_transform=target_transform, 26 | shot=opt.shot) 27 | elif opt.dataset == 'hmdb51': 28 | training_data = HMDB51( 29 | opt.video_path, 30 | opt.annotation_path, 31 | 'training', 32 | spatial_transform=spatial_transform, 33 | temporal_transform=temporal_transform, 34 | target_transform=target_transform, 35 | shot=opt.shot) 36 | 37 | return training_data 38 | 39 | 40 | def get_validation_set(opt, spatial_transform, temporal_transform, 41 | target_transform): 42 | assert opt.dataset in ['kinetics', 'ucf101', 'hmdb51'] 43 | 44 | if opt.dataset == 'kinetics': 45 | validation_data = Kinetics( 46 | opt.video_path, 47 | opt.annotation_path, 48 | 'validation', 49 | opt.n_val_samples, 50 | spatial_transform, 51 | temporal_transform, 52 | target_transform, 53 | sample_duration=opt.sample_duration) 54 | elif opt.dataset == 'ucf101': 55 | validation_data = UCF101( 56 | opt.video_path, 57 | opt.annotation_path, 58 | 'validation', 59 | opt.n_val_samples, 60 | spatial_transform, 61 | temporal_transform, 62 | target_transform, 63 | sample_duration=opt.sample_duration) 64 | elif opt.dataset == 'hmdb51': 65 | validation_data = HMDB51( 66 | opt.video_path, 67 | opt.annotation_path, 68 | 'validation', 69 | opt.n_val_samples, 70 | spatial_transform, 71 | temporal_transform, 72 | target_transform, 73 | sample_duration=opt.sample_duration) 74 | 75 | return validation_data 76 | 77 | def get_test_set(opt, spatial_transform, temporal_transform, target_transform): 78 | assert opt.dataset in ['kinetics', 'ucf101', 'hmdb51'] 79 | assert opt.test_subset in ['val', 'test'] 80 | 81 | if opt.test_subset == 'val': 82 | subset = 'validation' 83 | elif opt.test_subset == 'test': 84 | subset = 'testing' 85 | if opt.dataset == 'kinetics': 86 | test_data = Kinetics( 87 | opt.video_path, 88 | opt.annotation_path, 89 | subset, 90 | 0, 91 | spatial_transform, 92 | temporal_transform, 93 | target_transform, 94 | sample_duration=opt.sample_duration, 95 | num_vid_samples=5) 96 | elif opt.dataset == 'ucf101': 97 | test_data = UCF101( 98 | opt.video_path, 99 | opt.annotation_path, 100 | subset, 101 | 1, 102 | spatial_transform, 103 | temporal_transform, 104 | target_transform, 105 | sample_duration=opt.sample_duration, 106 | num_vid_samples=5) 107 | elif opt.dataset == 'hmdb51': 108 | test_data = HMDB51( 109 | opt.video_path, 110 | opt.annotation_path, 111 | subset, 112 | 1, 113 | spatial_transform, 114 | temporal_transform, 115 | target_transform, 116 | sample_duration=opt.sample_duration, 117 | num_vid_samples=5) 118 | 119 | 
return test_data 120 | -------------------------------------------------------------------------------- /3D-ResNet/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from models import resnet 5 | 6 | def generate_model(opt): 7 | assert opt.model in [ 8 | 'resnet' 9 | ] 10 | 11 | if opt.model == 'resnet': 12 | assert opt.model_depth in [10, 18, 34, 50, 101, 152, 200] 13 | input_chan = 3 14 | 15 | from models.resnet import get_fine_tuning_parameters 16 | 17 | if opt.model_depth == 10: 18 | model = resnet.resnet10( 19 | num_classes=opt.n_classes, 20 | shortcut_type=opt.resnet_shortcut, 21 | sample_size=opt.sample_size, 22 | sample_duration=opt.sample_duration, \ 23 | input_chan=input_chan) 24 | elif opt.model_depth == 18: 25 | model = resnet.resnet18( 26 | num_classes=opt.n_classes, 27 | shortcut_type=opt.resnet_shortcut, 28 | sample_size=opt.sample_size, 29 | sample_duration=opt.sample_duration, \ 30 | input_chan=input_chan) 31 | elif opt.model_depth == 34: 32 | model = resnet.resnet34( 33 | num_classes=opt.n_classes, 34 | shortcut_type=opt.resnet_shortcut, 35 | sample_size=opt.sample_size, 36 | sample_duration=opt.sample_duration, \ 37 | input_chan=input_chan) 38 | elif opt.model_depth == 50: 39 | model = resnet.resnet50( 40 | num_classes=opt.n_classes, 41 | shortcut_type=opt.resnet_shortcut, 42 | sample_size=opt.sample_size, 43 | sample_duration=opt.sample_duration, \ 44 | input_chan=input_chan) 45 | elif opt.model_depth == 101: 46 | model = resnet.resnet101( 47 | num_classes=opt.n_classes, 48 | shortcut_type=opt.resnet_shortcut, 49 | sample_size=opt.sample_size, 50 | sample_duration=opt.sample_duration) 51 | elif opt.model_depth == 152: 52 | model = resnet.resnet152( 53 | num_classes=opt.n_classes, 54 | shortcut_type=opt.resnet_shortcut, 55 | sample_size=opt.sample_size, 56 | sample_duration=opt.sample_duration) 57 | elif opt.model_depth == 200: 58 | model = resnet.resnet200( 59 | num_classes=opt.n_classes, 60 | shortcut_type=opt.resnet_shortcut, 61 | sample_size=opt.sample_size, 62 | sample_duration=opt.sample_duration) 63 | 64 | if not opt.no_cuda: 65 | model = model.cuda() 66 | model = nn.DataParallel(model, device_ids=None) 67 | 68 | if opt.pretrain_path: 69 | print('loading pretrained model {}'.format(opt.pretrain_path)) 70 | pretrain = torch.load(opt.pretrain_path) 71 | if 'arch' in pretrain: 72 | assert opt.arch == pretrain['arch'] 73 | model.load_state_dict(pretrain['state_dict']) 74 | else: 75 | if "state_dict" in pretrain.keys(): 76 | model.module.load_state_dict(pretrain['state_dict']) 77 | else: 78 | model.module.fc = nn.Linear( 79 | model.module.fc.in_features, 128) 80 | model.load_state_dict(pretrain['model_state_dict']) 81 | 82 | model.module.fc = nn.Linear(model.module.fc.in_features, opt.n_finetune_classes) 83 | model.module.fc = model.module.fc.cuda() 84 | 85 | model.module.freeze_layers(opt.ft_begin_index) 86 | parameters = get_fine_tuning_parameters(model, opt.ft_begin_index) 87 | return model, parameters 88 | else: 89 | if opt.pretrain_path: 90 | print('loading pretrained model {}'.format(opt.pretrain_path)) 91 | pretrain = torch.load(opt.pretrain_path) 92 | assert opt.arch == pretrain['arch'] 93 | 94 | model.load_state_dict(pretrain['state_dict']) 95 | 96 | model.fc = nn.Linear(model.fc.in_features, opt.n_finetune_classes) 97 | 98 | parameters = get_fine_tuning_parameters(model, opt.ft_begin_index) 99 | return model, parameters 100 | 101 | return model, model.parameters() 102 | 
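# Note: the pretrained-checkpoint handling in generate_model() above appears to cover three
# layouts (inferred from the branches in the code, not from separate documentation):
#   * {'arch': ..., 'state_dict': ...}    -- checkpoints written by this 3D-ResNet training code,
#                                            loaded into the DataParallel-wrapped model as a whole;
#   * {'state_dict': ...} (no 'arch' key) -- loaded into model.module directly;
#   * {'model_state_dict': ...}           -- presumably checkpoints from the LocalAggregation
#                                            pretraining stage: fc is first replaced with a 128-d
#                                            nn.Linear (presumably the embedding head used during
#                                            unsupervised pretraining) before the weights are loaded.
# In every pretrained case the fc layer is then re-created with opt.n_finetune_classes outputs,
# earlier blocks are frozen via freeze_layers(opt.ft_begin_index), and get_fine_tuning_parameters()
# selects the parameter groups that main.py passes to the SGD optimizer.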
-------------------------------------------------------------------------------- /LocalAggregation/src/idt/cluster_fv.py: -------------------------------------------------------------------------------- 1 | from src.datasets.kinetics import load_annotation_data, get_video_names_and_annotations, load_value_file 2 | import os 3 | import torch 4 | import json 5 | import argparse 6 | from os.path import join 7 | import numpy as np 8 | from src.objectives.localagg import run_kmeans_multi_gpu, run_kmeans 9 | 10 | 11 | DEFAULT_KMEANS_SEED = 1234 12 | 13 | 14 | def get_parser(): 15 | parser = argparse.ArgumentParser(description="IDT video inference") 16 | parser.add_argument("--k", type=int, help="Number of clusters.") 17 | parser.add_argument("--num_c", type=int, help="Number of clusterings.") 18 | parser.add_argument("--frames_path", help="Path to Kinetics frames.") 19 | parser.add_argument("--annotation_path", help="Path to Kinetics annotation.") 20 | parser.add_argument("--fv_path", help="Path to Fisher vectors.") 21 | parser.add_argument("--clusters_path", help="Path to save cluster.") 22 | parser.add_argument("--processed_annotation_path", help="Path to output annotation file.") 23 | parser.add_argument('--gpu', nargs='*', help='GPU id') 24 | return parser 25 | 26 | 27 | def compute_clusters(data, k, gpu_devices): 28 | pred_labels = [] 29 | data_npy = data.cpu().detach().numpy() 30 | data_npy = np.float32(data_npy) 31 | for k_idx, each_k in enumerate(k): 32 | # cluster the data 33 | 34 | if len(gpu_devices) == 1: # single gpu 35 | I, _ = run_kmeans(data_npy, each_k, seed=k_idx + DEFAULT_KMEANS_SEED, 36 | gpu_device=gpu_devices[0]) 37 | else: # multigpu 38 | I, _ = run_kmeans_multi_gpu(data_npy, each_k, seed=k_idx + DEFAULT_KMEANS_SEED, gpu_device=gpu_devices) 39 | 40 | clust_labels = np.asarray(I) 41 | pred_labels.append(clust_labels) 42 | pred_labels = np.stack(pred_labels, axis=0) 43 | pred_labels = torch.from_numpy(pred_labels).long() 44 | 45 | return pred_labels 46 | 47 | 48 | if __name__ == "__main__": 49 | args = get_parser().parse_args() 50 | 51 | gpu_devices = [] 52 | if args.gpu: 53 | ids_list = '' 54 | for i in range(len(args.gpu)): 55 | ids_list += args.gpu[i] + ',' 56 | gpu_devices.append(int(args.gpu[i])) 57 | ids_list = ids_list[:-1] 58 | 59 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 60 | os.environ["CUDA_VISIBLE_DEVICES"] = ids_list 61 | 62 | frames_path = args.frames_path 63 | annotation_path = args.annotation_path 64 | fv_path = args.fv_path 65 | k = args.k 66 | n_clusters = args.num_c 67 | 68 | data = load_annotation_data(annotation_path) 69 | 70 | video_names, annotations = get_video_names_and_annotations(data, "training") 71 | 72 | count_valid = 0 73 | count_missing = 0 74 | fvs = [] 75 | database = {} 76 | labels = set([]) 77 | for i in range(len(video_names)): 78 | if i % 1000 == 0: 79 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 80 | 81 | vid_key = video_names[i].split("/")[-1] 82 | vid_label = video_names[i].split("/")[-2].replace("_", " ") 83 | 84 | video_path = os.path.join(frames_path, video_names[i]) 85 | if not os.path.exists(video_path): 86 | continue 87 | 88 | n_frames_file_path = os.path.join(video_path, 'n_frames') 89 | n_frames = int(load_value_file(n_frames_file_path)) 90 | if n_frames <= 0: 91 | continue 92 | 93 | count_valid += 1 94 | 95 | fv_vid_path = os.path.join(fv_path, video_names[i]) + ".dat" 96 | if not os.path.exists(fv_vid_path): 97 | count_missing += 1 98 | continue 99 | else: 100 | fv = torch.load(fv_vid_path) 101 | 102 
| value = {} 103 | value['subset'] = 'training' 104 | value['annotations'] = {} 105 | value['annotations']['label'] = vid_label 106 | database[vid_key] = value 107 | 108 | labels.add(vid_label) 109 | 110 | fvs.append(fv.cpu().squeeze()) 111 | 112 | for key, value in data['database'].items(): 113 | this_subset = value['subset'] 114 | if (this_subset == 'validation' and (value['annotations']['label'] in labels)) or this_subset == 'testing': 115 | database[key] = value 116 | 117 | print("%d missing out of %d\n" % (count_missing, count_valid)) 118 | 119 | fvs = torch.stack(fvs) 120 | 121 | k = [k for _ in range(n_clusters)] 122 | 123 | cluster_labels = compute_clusters(fvs, k, gpu_devices) 124 | 125 | os.mkdir(join(args.clusters_path, "checkpoints")) 126 | torch.save({'cluster_labels': cluster_labels}, join(args.clusters_path, "checkpoints/checkpoint.pth.tar")) 127 | 128 | out = {} 129 | out['labels'] = list(labels) 130 | out['database'] = database 131 | 132 | with open(args.processed_annotation_path, 'w') as dst_file: 133 | json.dump(out, dst_file) 134 | -------------------------------------------------------------------------------- /LocalAggregation/src/objectives/instance.py: -------------------------------------------------------------------------------- 1 | """ 2 | Non-parametric Instance Discrimination Loss 3 | https://github.com/zhirongw/lemniscate.pytorch 4 | 5 | Code is based on Tensorflow implementation: 6 | https://github.com/neuroailab/LocalAggregation 7 | 8 | This script wraps the InstanceDiscriminationLoss function as a torch.nn.Module, 9 | so that the loss can be computed parallelly across multi-gpus using Dataparallel 10 | 11 | """ 12 | import math 13 | import torch 14 | import numpy as np 15 | 16 | from src.utils.tensor import l2_normalize 17 | 18 | 19 | class InstanceDiscriminationLossModule(torch.nn.Module): 20 | def __init__(self, memory_bank_broadcast, cluster_labels_broadcast=None, k=4096, t=0.07, m=0.5): 21 | super(InstanceDiscriminationLossModule, self).__init__() 22 | self.k, self.t, self.m = k, t, m 23 | 24 | self.indices = None 25 | self.outputs = None 26 | self._bank = None # pass in via forward function 27 | self.memory_bank_broadcast = memory_bank_broadcast 28 | self.data_len = memory_bank_broadcast[0].size(0) 29 | self.Z_est = 0 30 | 31 | def _softmax(self, dot_prods): 32 | Z = 2876934.2 33 | return torch.exp(dot_prods / self.t) / Z 34 | 35 | def updated_new_data_memory(self, indices, outputs): 36 | outputs = l2_normalize(outputs) 37 | data_memory = torch.index_select(self._bank, 0, indices) 38 | new_data_memory = data_memory * self.m + (1 - self.m) * outputs 39 | return l2_normalize(new_data_memory, dim=1) 40 | 41 | def synchronization_check(self): 42 | for i in range(len(self.memory_bank_broadcast)): 43 | if i == 0: 44 | device = self.memory_bank_broadcast[0].device 45 | else: 46 | assert torch.equal(self.memory_bank_broadcast[0], self.memory_bank_broadcast[i].to(device)) 47 | 48 | def _get_dot_products(self, vec, idxs): 49 | """ 50 | This function is copied from the get_dot_products in Memory_Bank class 51 | Since we want to register self._bank as a buffer (to be broadcasted to multigpus) instead of self.memory_bank, 52 | we need to avoid calling self.memory_bank get_dot_products 53 | 54 | """ 55 | vec_shape = list(vec.size()) # [bs, dim] 56 | idxs_shape = list(idxs.size()) # [bs, ...] 
57 | 58 | assert len(idxs_shape) in [1, 2] 59 | assert len(vec_shape) == 2 60 | assert vec_shape[0] == idxs_shape[0] 61 | 62 | if len(idxs_shape) == 1: 63 | with torch.no_grad(): 64 | memory_vecs = torch.index_select(self._bank, 0, idxs) 65 | memory_vecs_shape = list(memory_vecs.size()) 66 | assert memory_vecs_shape[0] == idxs_shape[0] 67 | else: # len(idxs_shape) == 2 68 | with torch.no_grad(): 69 | batch_size, k_dim = idxs.size(0), idxs.size(1) 70 | flat_idxs = idxs.view(-1) 71 | memory_vecs = torch.index_select(self._bank, 0, flat_idxs) 72 | memory_vecs = memory_vecs.view(batch_size, k_dim, self._bank.size(1)) 73 | memory_vecs_shape = list(memory_vecs.size()) 74 | 75 | vec_shape[1:1] = [1] * (len(idxs_shape) - 1) 76 | vec = vec.view(vec_shape) # [bs, 1, dim] 77 | 78 | prods = memory_vecs * vec 79 | assert list(prods.size()) == memory_vecs_shape 80 | 81 | return torch.sum(prods, dim=-1) 82 | 83 | def compute_data_prob(self): 84 | logits = self._get_dot_products(self.outputs, self.indices) 85 | return self._softmax(logits) 86 | 87 | def compute_noise_prob(self): 88 | batch_size = self.indices.size(0) 89 | noise_indx = torch.randint(0, self.data_len, (batch_size, self.k), 90 | device=self.outputs.device) # U(0, data_len) 91 | noise_indx = noise_indx.long() 92 | logits = self._get_dot_products(self.outputs, noise_indx) 93 | noise_probs = self._softmax(logits) 94 | return noise_probs 95 | 96 | def forward(self, indices, outputs, gpu_idx): 97 | self.indices = indices.detach() 98 | self.outputs = l2_normalize(outputs, dim=1) 99 | self._bank = self.memory_bank_broadcast[gpu_idx] 100 | 101 | batch_size = self.indices.size(0) 102 | data_prob = self.compute_data_prob() 103 | noise_prob = self.compute_noise_prob() 104 | 105 | assert data_prob.size(0) == batch_size 106 | assert noise_prob.size(0) == batch_size 107 | assert noise_prob.size(1) == self.k 108 | 109 | base_prob = 1.0 / self.data_len 110 | eps = 1e-7 111 | 112 | ## Pmt 113 | data_div = data_prob + (self.k * base_prob + eps) 114 | 115 | ln_data = torch.log(data_prob) - torch.log(data_div) 116 | 117 | ## Pon 118 | noise_div = noise_prob + (self.k * base_prob + eps) 119 | ln_noise = math.log(self.k * base_prob) - torch.log(noise_div) 120 | 121 | curr_loss = -(torch.sum(ln_data) + torch.sum(ln_noise)) 122 | curr_loss = curr_loss / batch_size 123 | 124 | new_data_memory = self.updated_new_data_memory(self.indices, self.outputs) 125 | 126 | return curr_loss.unsqueeze(0), new_data_memory 127 | 128 | -------------------------------------------------------------------------------- /LocalAggregation/src/utils/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import logging 5 | import numpy as np 6 | from pprint import pprint 7 | from dotmap import DotMap 8 | from logging import Formatter 9 | from logging.handlers import RotatingFileHandler 10 | from time import strftime, localtime, time 11 | 12 | from src.utils.utils import load_json, save_json 13 | 14 | 15 | def makedirs(dir_list): 16 | for dir in dir_list: 17 | if not os.path.exists(dir): 18 | os.makedirs(dir) 19 | 20 | 21 | def process_config(config_path, override_dotmap=None): 22 | config_json = load_json(config_path) 23 | return _process_config(config_json, override_dotmap=override_dotmap) 24 | 25 | 26 | def _process_config(config_json, override_dotmap=None): 27 | """ 28 | Processes config file: 29 | 1) Converts it to a DotMap 30 | 2) Creates experiments path and required subdirs 31 | 3) Set up logging 
32 | """ 33 | config = DotMap(config_json) 34 | if override_dotmap is not None: 35 | config.update(override_dotmap) 36 | 37 | print("Loaded configuration: ") 38 | print(config) 39 | 40 | print() 41 | print(" *************************************** ") 42 | print(" Running experiment {}".format(config.exp_name)) 43 | print(" *************************************** ") 44 | print() 45 | 46 | # if config.pretrained_exp_dir is not None: 47 | # # don't make new dir more continuing training 48 | # exp_dir = config.pretrained_exp_dir 49 | # print("[INFO]: Continuing from previously finished training at %s." % exp_dir) 50 | # else: 51 | exp_base = config.exp_base 52 | 53 | if config.debug: 54 | exp_dir = os.path.join(exp_base, "experiments", 55 | config.exp_name, 'debug') 56 | else: 57 | if config.pretrained_exp_dir is not None and isinstance(config.pretrained_exp_dir, str): 58 | # don't make new dir more continuing training 59 | exp_dir = config.pretrained_exp_dir 60 | print('[INFO]: Backup previously trained model and config json') 61 | os.system("cp %s/config.json %s/prev_config.json" % (exp_dir, exp_dir)) 62 | os.system("cp %s/checkpoints/checkpoint.pth.tar %s/checkpoints/prev_checkpoint.pth.tar" % (exp_dir, exp_dir)) 63 | os.system("cp %s/checkpoints/model_best.pth.tar %s/checkpoints/prev_model_best.pth.tar" % (exp_dir, exp_dir)) 64 | elif config.continue_exp_dir is not None and isinstance(config.continue_exp_dir, str): 65 | exp_dir = config.continue_exp_dir 66 | print('[INFO]: Backup previously trained model and config json') 67 | os.system("cp %s/config.json %s/prev_config.json" % (exp_dir, exp_dir)) 68 | os.system( 69 | "cp %s/checkpoints/checkpoint.pth.tar %s/checkpoints/prev_checkpoint.pth.tar" % (exp_dir, exp_dir)) 70 | os.system( 71 | "cp %s/checkpoints/model_best.pth.tar %s/checkpoints/prev_model_best.pth.tar" % (exp_dir, exp_dir)) 72 | else: 73 | if config.exp_id is None: 74 | config.exp_id = strftime('%Y-%m-%d--%H_%M_%S', localtime()) 75 | exp_dir = os.path.join(exp_base, "experiments", 76 | config.exp_name, config.exp_id) 77 | 78 | # create some important directories to be used for the experiment. 
79 | config.summary_dir = "summaries/" 80 | config.checkpoint_dir = os.path.join(exp_dir, "checkpoints/") 81 | config.out_dir = os.path.join(exp_dir, "out/") 82 | config.log_dir = os.path.join(exp_dir, "logs/") 83 | 84 | makedirs([config.summary_dir, config.checkpoint_dir, 85 | config.out_dir, config.log_dir]) 86 | 87 | # save config to experiment dir 88 | config_out = os.path.join(exp_dir, 'config.json') 89 | save_json(config.toDict(), config_out) 90 | 91 | # setup logging in the project 92 | setup_logging(config.log_dir) 93 | 94 | logging.getLogger().info("Experiment directory is located at %s" % exp_dir) 95 | 96 | logging.getLogger().info( 97 | "Configurations and directories successfully set up.") 98 | return config 99 | 100 | 101 | def setup_logging(log_dir): 102 | log_file_format = "[%(levelname)s] - %(asctime)s - %(name)s - : %(message)s in %(pathname)s:%(lineno)d" 103 | log_console_format = "[%(levelname)s]: %(message)s" 104 | 105 | # Main logger 106 | main_logger = logging.getLogger() 107 | main_logger.setLevel(logging.INFO) 108 | 109 | console_handler = logging.StreamHandler() 110 | console_handler.setLevel(logging.INFO) 111 | console_handler.setFormatter(Formatter(log_console_format)) 112 | 113 | exp_file_handler = RotatingFileHandler( 114 | '{}exp_debug.log'.format(log_dir), maxBytes=10**6, backupCount=5) 115 | exp_file_handler.setLevel(logging.DEBUG) 116 | exp_file_handler.setFormatter(Formatter(log_file_format)) 117 | 118 | exp_errors_file_handler = RotatingFileHandler( 119 | '{}exp_error.log'.format(log_dir), maxBytes=10**6, backupCount=5) 120 | exp_errors_file_handler.setLevel(logging.WARNING) 121 | exp_errors_file_handler.setFormatter(Formatter(log_file_format)) 122 | 123 | main_logger.addHandler(console_handler) 124 | main_logger.addHandler(exp_file_handler) 125 | main_logger.addHandler(exp_errors_file_handler) 126 | 127 | 128 | def print_cuda_statistics(): 129 | logger = logging.getLogger("Cuda Statistics") 130 | import sys 131 | from subprocess import call 132 | import torch 133 | logger.info('__Python VERSION: {}'.format(sys.version)) 134 | logger.info('__pyTorch VERSION: {}'.format(torch.__version__)) 135 | logger.info('__CUDA VERSION') 136 | try: 137 | call(["nvcc", "--version"]) 138 | except: 139 | pass 140 | logger.info('__CUDNN VERSION: {}'.format(torch.backends.cudnn.version())) 141 | logger.info('__Number CUDA Devices: {}'.format(torch.cuda.device_count())) 142 | logger.info('__Devices') 143 | call(["nvidia-smi", "--format=csv", 144 | "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"]) 145 | logger.info('Active CUDA Device: GPU {}'.format(torch.cuda.current_device())) 146 | logger.info('Available devices {}'.format(torch.cuda.device_count())) 147 | logger.info('Current cuda device {}'.format(torch.cuda.current_device())) 148 | -------------------------------------------------------------------------------- /LocalAggregation/src/idt/extract_idt.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import gzip 3 | import numpy as np 4 | import pickle 5 | import warnings 6 | import glob 7 | from sketching import * 8 | from os.path import isfile, join 9 | import os 10 | import torch 11 | import argparse 12 | 13 | 14 | def get_parser(): 15 | parser = argparse.ArgumentParser(description="IDT video inference") 16 | parser.add_argument("--category", help="Category to process.") 17 | parser.add_argument("--model_path", help="Path to FV models.") 18 | parser.add_argument("--videos_path", 
help="Path to videos.") 19 | parser.add_argument("--boxes_path", help="Path to boxes.") 20 | parser.add_argument("--out_path", help="Output path.") 21 | return parser 22 | 23 | 24 | def fisher_vector(xx, gmm): 25 | """Computes the Fisher vector on a set of descriptors/features. 26 | 27 | Parameters 28 | ---------- 29 | xx: array_like, shape (N, D) or (D, ) 30 | The set of descriptors 31 | 32 | gmm: instance of sklearn mixture.GMM object 33 | Gauassian mixture model of the descriptors. 34 | 35 | Returns 36 | ------- 37 | fv: array_like, shape (K + 2 * D * K, ) 38 | Fisher vector (derivatives with respect to the mixing weights, means 39 | and variances) of the given descriptors. 40 | 41 | Reference 42 | --------- 43 | J. Krapac, J. Verbeek, F. Jurie. Modeling Spatial Layout with Fisher 44 | Vectors for Image Categorization. In ICCV, 2011. 45 | http://hal.inria.fr/docs/00/61/94/03/PDF/final.r1.pdf 46 | """ 47 | 48 | xx = np.atleast_2d(xx) 49 | N = xx.shape[0] 50 | 51 | # Compute posterior probabilities. 52 | Q = gmm.predict_proba(xx) # NxK 53 | 54 | # Compute the sufficient statistics of descriptors. 55 | Q_sum = np.sum(Q, 0)[:, np.newaxis] / N 56 | Q_xx = np.dot(Q.T, xx) / N 57 | Q_xx_2 = np.dot(Q.T, xx ** 2) / N 58 | 59 | # Compute derivatives with respect to mixing weights, means and variances. 60 | d_pi = Q_sum.squeeze() - gmm.weights_ 61 | d_mu = Q_xx - Q_sum * gmm.means_ 62 | d_sigma = ( 63 | - Q_xx_2 64 | - Q_sum * gmm.means_ ** 2 65 | + Q_sum * gmm.covariances_ + 2 * Q_xx * gmm.means_) 66 | 67 | # Merge derivatives into a vector. 68 | return np.hstack((d_mu.flatten(), d_sigma.flatten())) 69 | 70 | 71 | def power_normalize(xx, alpha=0.5): 72 | """Computes a alpha-power normalization for the matrix xx.""" 73 | return np.sign(xx) * np.abs(xx) ** alpha 74 | 75 | 76 | def L2_normalize(xx): 77 | """L2-normalizes each row of the data xx.""" 78 | Zx = np.sum(xx * xx, 1) 79 | xx_norm = np.divide(xx, np.sqrt(Zx[:, np.newaxis])) 80 | xx_norm[np.isnan(xx_norm)] = 0 81 | return xx_norm 82 | 83 | 84 | def compute_fv(filename, gmm, pca, sdense_fv): 85 | # use '\s+' for the \t and \n 86 | df = pd.read_table(gzip.open(filename), sep='\s+', header=None) 87 | df = df.iloc[:, 10:436] 88 | 89 | # turn pandas dataframe into array 90 | df_array = df.values 91 | if np.any(np.isnan(df_array)): 92 | return None 93 | # use stored PCA 94 | 95 | if df_array.shape[0] > 3000000: 96 | print("Dropping tracks to save memory") 97 | df_array = df_array[:3000000] 98 | 99 | df_array = pca.transform(df_array) 100 | # get the fisher vector for each video sequence 101 | fv = fisher_vector(df_array, gmm) 102 | 103 | fv = power_normalize(fv, alpha=0.5) 104 | fv = np.expand_dims(fv, axis=0) 105 | fv = L2_normalize(fv) 106 | 107 | SKETCH_FLOAT = 0 108 | SKETCH_DIM_feat = 2000 109 | 110 | FEAT_DIM_fv = fv.shape[1] 111 | 112 | fv = sketch_batch(torch.from_numpy(fv), sdense_fv, SKETCH_DIM_feat, FEAT_DIM_fv, SKETCH_FLOAT) 113 | 114 | return fv 115 | 116 | 117 | def process_category(model_path, videos_path, boxes_path, out_path, category): 118 | print(category) 119 | pca = pickle.load(open(join(model_path, "pca_model.sav"), 'rb')) 120 | gmm = pickle.load(open(join(model_path, "gmm_diag_model.sav"), 'rb')) 121 | sdense_fv = pickle.load(open(join(model_path, "sketching_proj.sav"), 'rb')) 122 | 123 | box_path = join(boxes_path, category) 124 | boxes = [f for f in os.listdir(box_path) if isfile(join(box_path, f))] 125 | 126 | vid_path = join(videos_path, category) 127 | vids = [f for f in os.listdir(vid_path) if isfile(join(vid_path, 
f))] 128 | 129 | out_path = join(out_path, category) 130 | 131 | if not os.path.exists(out_path): 132 | os.makedirs(out_path) 133 | 134 | count = 0 135 | temp_path = "/scratch/ptokmako/IDT_features_temp" 136 | for vid in vids: 137 | vid_name = vid.split(".")[0] 138 | 139 | if (vid_name + ".bb") not in boxes: 140 | print("Boxes not found for %s !!!!!!!!!!" % vid_name) 141 | continue 142 | count += 1 143 | if os.path.exists(out_path + "%s.dat" % vid_name): 144 | continue 145 | print("%d/%d %s" % (count, len(vids), vid_name)) 146 | 147 | sz = 0 148 | filename = "%s/%s.gz" % (temp_path, category + "_" + vid_name) 149 | attempts = 0 150 | while sz < 100 and attempts < 5: 151 | stream = os.popen('sh src/idt/idt.sh "%s" "%s" "%s" %s' % (join(vid_path, vid), join(box_path, vid_name + ".bb"), 152 | category + "_" + vid_name, temp_path)) 153 | output = stream.read() 154 | print(output) 155 | sz = os.path.getsize(filename) 156 | attempts += 1 157 | 158 | if sz < 100: 159 | print("Could not process video!!!!!!!!!!!!!!") 160 | continue 161 | 162 | fv = compute_fv(filename, gmm, pca, sdense_fv) 163 | if fv is not None: 164 | torch.save(fv, join(out_path, vid_name + ".dat")) 165 | else: 166 | print("NaNs in IDT\n") 167 | 168 | stream = os.popen('rm -f "%s/%s.gz"' % (temp_path, category + "_" + vid_name)) 169 | output = stream.read() 170 | 171 | 172 | if __name__ == "__main__": 173 | warnings.filterwarnings("ignore") 174 | args = get_parser().parse_args() 175 | model_path = args.model_path 176 | videos_path = args.videos_path 177 | boxes_path = args.boxes_path 178 | out_path = args.out_path 179 | category = args.category 180 | 181 | process_category(model_path, videos_path, boxes_path, out_path, category) 182 | 183 | 184 | 185 | -------------------------------------------------------------------------------- /3D-ResNet/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import numpy as np 5 | import torch 6 | from torch import nn 7 | from torch import optim 8 | from torch.optim import lr_scheduler 9 | from tensorboardX import SummaryWriter 10 | 11 | from opts import parse_opts 12 | from model import generate_model 13 | from mean import get_mean, get_std 14 | from spatial_transforms import ( 15 | Compose, Normalize, Scale, CenterCrop, CornerCrop, MultiScaleCornerCrop, 16 | MultiScaleRandomCrop, RandomHorizontalFlip, ToTensor) 17 | from temporal_transforms import LoopPadding, TemporalRandomCrop, TemporalStride 18 | from target_transforms import ClassLabel, VideoID 19 | from target_transforms import Compose as TargetCompose 20 | from dataset import get_training_set, get_validation_set, get_test_set 21 | from utils import Logger 22 | from train import train_epoch 23 | from validation import val_epoch, val_final 24 | import test 25 | import warnings 26 | 27 | 28 | def get_lr(optimizer): 29 | for i, param_group in enumerate(optimizer.param_groups): 30 | if float(param_group['lr']) != 0: 31 | return float(param_group['lr']) 32 | 33 | return 0 34 | 35 | 36 | if __name__ == '__main__': 37 | opt = parse_opts() 38 | model_name = opt.result_path.split("/")[-1] 39 | print(model_name) 40 | if opt.root_path != '': 41 | if opt.resume_path: 42 | opt.resume_path = os.path.join(opt.root_path, opt.resume_path) 43 | if opt.pretrain_path: 44 | opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path) 45 | opt.scales = [opt.initial_scale] 46 | for i in range(1, opt.n_scales): 47 | opt.scales.append(opt.scales[-1] * opt.scale_step) 48 
| opt.arch = '{}-{}'.format(opt.model, opt.model_depth) 49 | opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset) 50 | opt.std = get_std(opt.norm_value) 51 | 52 | warnings.filterwarnings("ignore", category=UserWarning) 53 | 54 | if opt.gpu: 55 | ids_list = '' 56 | for i in range(len(opt.gpu)): 57 | ids_list += opt.gpu[i] + ',' 58 | ids_list = ids_list[:-1] 59 | 60 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 61 | os.environ["CUDA_VISIBLE_DEVICES"] = ids_list 62 | 63 | print(opt) 64 | if not os.path.exists(opt.result_path): 65 | os.mkdir(opt.result_path) 66 | with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file: 67 | json.dump(vars(opt), opt_file) 68 | 69 | writer = None 70 | writer = SummaryWriter(log_dir='/home/ptokmako/src/LocalAggregation-Pytorch/summaries/3DResNet/' + model_name) 71 | 72 | torch.manual_seed(opt.manual_seed) 73 | 74 | model, parameters = generate_model(opt) 75 | print(model) 76 | criterion = nn.CrossEntropyLoss() 77 | 78 | if not opt.no_cuda: 79 | criterion = criterion.cuda() 80 | 81 | if opt.no_mean_norm and not opt.std_norm: 82 | norm_method = Normalize([0, 0, 0], [1, 1, 1]) 83 | elif not opt.std_norm: 84 | norm_method = Normalize(opt.mean, [1, 1, 1]) 85 | else: 86 | norm_method = Normalize(opt.mean, opt.std) 87 | 88 | if not opt.no_train: 89 | assert opt.train_crop in ['random', 'corner', 'center'] 90 | if opt.train_crop == 'random': 91 | crop_method = MultiScaleRandomCrop(opt.scales, opt.sample_size) 92 | elif opt.train_crop == 'corner': 93 | crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size) 94 | elif opt.train_crop == 'center': 95 | crop_method = MultiScaleCornerCrop( 96 | opt.scales, opt.sample_size, crop_positions=['c']) 97 | spatial_transform = Compose([ 98 | crop_method, 99 | RandomHorizontalFlip(), 100 | ToTensor(opt.norm_value), norm_method 101 | ]) 102 | temporal_transform = Compose([ TemporalRandomCrop(opt.sample_duration)]) 103 | target_transform = ClassLabel() 104 | training_data = get_training_set(opt, spatial_transform, 105 | temporal_transform, target_transform) 106 | train_loader = torch.utils.data.DataLoader( 107 | training_data, 108 | batch_size=opt.batch_size, 109 | shuffle=True, 110 | num_workers=opt.n_threads, 111 | pin_memory=True) 112 | train_logger = Logger( 113 | os.path.join(opt.result_path, 'train.log'), 114 | ['epoch', 'loss', 'acc', 'lr']) 115 | train_batch_logger = Logger( 116 | os.path.join(opt.result_path, 'train_batch.log'), 117 | ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr']) 118 | 119 | if opt.nesterov: 120 | dampening = 0 121 | else: 122 | dampening = opt.dampening 123 | 124 | optimizer = optim.SGD( 125 | parameters, 126 | lr=opt.learning_rate, 127 | momentum=opt.momentum, 128 | dampening=dampening, 129 | weight_decay=opt.weight_decay, 130 | nesterov=opt.nesterov) 131 | 132 | scheduler = lr_scheduler.ReduceLROnPlateau( 133 | optimizer, 'min', patience=opt.lr_patience, threshold=opt.lr_threshold) 134 | if not opt.no_val: 135 | spatial_transform = Compose([ 136 | Scale(opt.sample_size), 137 | CenterCrop(opt.sample_size), 138 | ToTensor(opt.norm_value), norm_method 139 | ]) 140 | temporal_transform = Compose([TemporalRandomCrop(opt.sample_duration)]) 141 | target_transform = ClassLabel() 142 | validation_data = get_validation_set( 143 | opt, spatial_transform, temporal_transform, target_transform) 144 | val_loader = torch.utils.data.DataLoader( 145 | validation_data, 146 | batch_size=opt.batch_size, 147 | shuffle=False, 148 | num_workers=opt.n_threads, 149 | pin_memory=True) 150 | 
val_logger = Logger( 151 | os.path.join(opt.result_path, 'val.log'), ['epoch', 'loss', 'acc']) 152 | 153 | if opt.resume_path: 154 | print('loading checkpoint {}'.format(opt.resume_path)) 155 | checkpoint = torch.load(opt.resume_path) 156 | assert opt.arch == checkpoint['arch'] 157 | 158 | opt.begin_epoch = checkpoint['epoch'] 159 | model.load_state_dict(checkpoint['state_dict']) 160 | if not opt.no_train: 161 | optimizer.load_state_dict(checkpoint['optimizer']) 162 | 163 | print('run') 164 | for i in range(opt.begin_epoch, opt.n_epochs + 1): 165 | # noinspection PyInterpreter 166 | writer.add_scalar('ucf/lr', get_lr(optimizer), i) 167 | if not opt.no_train: 168 | train_epoch(i, train_loader, model, criterion, optimizer, opt, 169 | train_logger, train_batch_logger, writer) 170 | if not opt.no_val: 171 | validation_loss = val_epoch(i, val_loader, model, criterion, opt, 172 | val_logger, writer) 173 | 174 | if not opt.no_train and not opt.no_val: 175 | scheduler.step(validation_loss) 176 | 177 | if opt.test: 178 | spatial_transform = Compose([ 179 | Scale(int(opt.sample_size / opt.scale_in_test)), 180 | CenterCrop(opt.sample_size), 181 | ToTensor(opt.norm_value), norm_method 182 | ]) 183 | temporal_transform = LoopPadding(5 * opt.sample_duration) 184 | target_transform = ClassLabel() 185 | 186 | test_data = get_test_set(opt, spatial_transform, temporal_transform, 187 | target_transform) 188 | test_loader = torch.utils.data.DataLoader( 189 | test_data, 190 | batch_size=opt.batch_size, 191 | shuffle=False, 192 | num_workers=opt.n_threads, 193 | pin_memory=True) 194 | val_final(test_loader, model, opt) 195 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Unsupervised Learning of Video Representations via Dense Trajectory Clustering 2 | 3 | This is an implementation of the [Unsupervised Learning of Video Representations via Dense Trajectory Clustering](https://arxiv.org/abs/2006.15731) algorithm. 4 | 5 | The codebase is built upon [Local Aggregation](https://github.com/neuroailab/LocalAggregation-Pytorch) and [3D ResNet](https://github.com/kenshohara/3D-ResNets-PyTorch). 6 | 7 | ## Prerequisites 8 | 9 | * Linux 10 | * PyTorch 1.2.0 11 | * [Faiss](https://github.com/facebookresearch/faiss) 12 | * tqdm 13 | * dotmap 14 | * tensorboardX 15 | * sklearn 16 | * pandas 17 | 18 | ## Unsupervised representation learning 19 | 20 | ### Dataset preprocessing 21 | Training is done on the [Kinetics-400 dataset](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). Download it and preprocess as follows. 22 | ``` 23 | cd 3D-ResNet 24 | ``` 25 | 26 | * Convert from avi to jpg files using ```utils/video_jpg_kinetics.py``` 27 | 28 | ```bash 29 | python utils/video_jpg_kinetics.py AVI_VIDEO_DIRECTORY JPG_VIDEO_DIRECTORY 30 | ``` 31 | 32 | * Generate n_frames files using ```utils/n_frames_kinetics.py``` 33 | 34 | ```bash 35 | python utils/n_frames_kinetics.py JPG_VIDEO_DIRECTORY 36 | ``` 37 | 38 | * Generate annotation file in json format using ```utils/kinetics_json.py``` 39 | * The CSV files (kinetics_{train, val, test}.csv) are included in the crawler. 40 | 41 | ```bash 42 | python utils/kinetics_json.py TRAIN_CSV_PATH VAL_CSV_PATH TEST_CSV_PATH DST_JSON_PATH 43 | ``` 44 | 45 | If you want to use our precomputed IDT clusters for training, please use the Kinetics annotation file provided in this codebase (splits.json). 
If you find that some videos are missing in your local copy of Kinetics, then you'll have to recompute the clusters using the cluster_fv.py script below, otherwise the correspondence between cluster labels and videos will be broken. 46 | 47 | ### Runtime Setup 48 | ``` 49 | cd LocalAggregation 50 | source init_env.sh 51 | ``` 52 | 53 | ### Pretrained models 54 | We provide several models trained using our Video LA + IDT prior objective, as well as precomputed clusters for the training set of Kinetics-400, under this [link](https://drive.google.com/file/d/1i3Vn_85Fo94BINHgpMaLNvZOKPfS3lvf/view?usp=sharing) (for the variants trained on 370k videos we skipped the last tuning stage due to memory issues). In addition, this archive contains models finetuned on UCF101 and HMDB51, which are reported in the state-of-the-art comparison section of the paper. 55 | 56 | ### Training using precomputed IDT descriptors 57 | Begin with training a 3D ResNet with an IR objective for 40 epochs. This is done as a warmup step. Don't forget to update data and experiment paths in the config file. 58 | ``` 59 | CUDA_VISIBLE_DEVICES=0 python scripts/instance.py ./config/kinetics_ir.json 60 | ``` 61 | Then specify `instance_exp_dir` in `./config/kinetics_la.json` to point to the IR model you've just trained, and run the following command to transfer IDT representations to the 3D ResNet via non-parametric clustering: 62 | ``` 63 | CUDA_VISIBLE_DEVICES=0,1,2 python scripts/localagg.py ./config/kinetics_la.json 64 | ``` 65 | To run the final fine-tuning stage, specify `instance_exp_dir` in `./config/kinetics_la_tune.json` to point to the model trained with IDTs, and run the following command: 66 | ``` 67 | CUDA_VISIBLE_DEVICES=0,1,2 python scripts/localagg.py ./config/kinetics_la_tune.json 68 | ``` 69 | 70 | ### Recomputing and clustering IDT descriptors 71 | We provide precomputed Fisher vector-encoded IDT descriptors for the Kinetics dataset under this [link](https://drive.google.com/file/d/1I5ZWlYJfFxXhPrv6gRq1jZJah85usd1H/view?usp=sharing). 72 | 73 | If you wish to recompute them, you will need to first download and install the original [IDT implementation](https://lear.inrialpes.fr/people/wang/improved_trajectories). 74 | This code takes person detections as input. You can download the detections we used [here](https://drive.google.com/file/d/1CDX8qkhsx9ygL27VG8UQpzAipa3MeHPu/view?usp=sharing). 75 | 76 | Next, estimate the model (PCA, GMM) parameters used in Fisher vector encoding. To this end, first sample 3500 videos from Kinetics at random, and compute IDTs for them, using the script below (don't forget to update paths to the IDT implementation). 77 | ``` 78 | sh src/idt/run_idt.sh PATH_TO_VIDEO PATH_TO_BOXES OUTPUT_NAME PATH_TO_IDTS 79 | ``` 80 | Then run the following script to estimate model parameters based on the computed IDTs. The parameters will be saved to the same directory as the IDTs. 81 | ``` 82 | python src/idt/compute_fv_models.py --idt_path PATH_TO_IDTS 83 | ``` 84 | 85 | Now you can compute the Fisher vector-encoded IDT descriptors for the training set of Kinetics. The following script takes a category as input, so it can be parallelized 400-way on a CPU cluster (please update the path to a temporary folder inside the script, which is used to store raw IDTs). 86 | ``` 87 | python src/idt/extract_idt.py --category CATEGORY_NAME --model_path PATH_TO_IDTS --videos_path PATH_TO_TRAIN_VIDEOS --boxes_path PATH_TO_BOXES --out_path FV_OUTPUT_PATH 88 | ``` 89 | 90 | Finally, to cluster the descriptors, run the following script. 91 | ``` 92 | python src/idt/cluster_fv.py --k 6000 --num_c 3 --frames_path PATH_TO_FRAMES --annotation_path PATH_TO_ANNOTATIONS_JSON --fv_path FV_OUTPUT_PATH --clusters_path PATH_TO_OUTPUT_CLUSTERS_DIRECTORY --processed_annotation_path PATH_TO_OUTPUT_ANNOTATIONS_JSON --gpu 0 1 93 | ``` 94 | This script produces a clustering assignment for the training set videos, and a new annotation file. Make sure to use this file in all the config files to ensure correct correspondence between videos and cluster labels. 95 | 
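For reference, the annotation file written by `cluster_fv.py` (and read by the dataset loaders and the config files above) keeps the same layout as the original Kinetics json: a top-level `labels` list and a `database` dictionary keyed by video id, where each entry stores its `subset` and its `annotations` label. A minimal illustrative sketch is shown below; the video id and class names are placeholders, not real entries:
```
{
  "labels": ["abseiling", "zumba"],
  "database": {
    "VIDEO_ID_000010_000020": {
      "subset": "training",
      "annotations": {"label": "abseiling"}
    }
  }
}
```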
96 | ## Transfer learning 97 | ``` 98 | cd 3D-ResNet 99 | ``` 100 | 101 | ### Dataset preprocessing 102 | Download and pre-process [UCF101](http://crcv.ucf.edu/data/UCF101.php) and [HMDB51](http://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/) datasets as follows. 103 | 104 | ```bash 105 | python utils/video_jpg_ucf101_hmdb51.py AVI_VIDEO_DIRECTORY JPG_VIDEO_DIRECTORY 106 | ``` 107 | 108 | * Generate n_frames files using ```utils/n_frames_ucf101_hmdb51.py``` 109 | 110 | ```bash 111 | python utils/n_frames_ucf101_hmdb51.py JPG_VIDEO_DIRECTORY 112 | ``` 113 | 114 | * Generate annotation file in json format using ```utils/ucf101_json.py``` and ```utils/hmdb51_json.py``` 115 | 116 | ```bash 117 | python utils/ucf101_json.py ANNOTATION_DIR_PATH 118 | python utils/hmdb51_json.py ANNOTATION_DIR_PATH 119 | ``` 120 | 121 | ### Finetuning 122 | On UCF101: 123 | ```bash 124 | python main.py --video_path PATH_TO_FRAMES --annotation_path PATH_TO_ANNOTATION --result_path OUTPUT_MODEL_PATH --dataset ucf101 --n_finetune_classes 101 --model resnet --model_depth 18 --resnet_shortcut B --batch_size 128 --n_threads 16 --gpu 0 --pretrain_path PATH_TO_PRETRAINED_MODEL --checkpoint 10 --ft_begin_index 2 --n_epochs 40 --lr_patience 5 --n_scales 2 --train_crop random 125 | ``` 126 | 127 | On HMDB51: 128 | ```bash 129 | python main.py --video_path PATH_TO_FRAMES --annotation_path PATH_TO_ANNOTATION --result_path OUTPUT_MODEL_PATH --dataset hmdb51 --n_finetune_classes 101 --model resnet --model_depth 18 --resnet_shortcut B --batch_size 128 --n_threads 16 --gpu 0 --pretrain_path PATH_TO_PRETRAINED_MODEL --checkpoint 10 --ft_begin_index 3 --n_epochs 30 --lr_patience 5 --n_scales 2 --train_crop random 130 | ``` 131 | 132 | ### Evaluation 133 | On UCF101: 134 | ```bash 135 | python main.py --video_path PATH_TO_FRAMES --annotation_path PATH_TO_ANNOTATION --dataset ucf101 --n_classes 101 --model resnet --model_depth 18 --resnet_shortcut B --batch_size 128 --n_threads 16 --gpu 0 --test --no_train --no_val --resume_path OUTPUT_MODEL_PATH/save_40.pth 136 | ``` 137 | 138 | On HMDB51: 139 | ```bash 140 | python main.py --video_path PATH_TO_FRAMES --annotation_path PATH_TO_ANNOTATION --dataset hmdb51 --n_classes 101 --model resnet --model_depth 18 --resnet_shortcut B --batch_size 128 --n_threads 16 --gpu 0 --test --no_train --no_val --resume_path OUTPUT_MODEL_PATH/save_30.pth 141 | ``` 142 | -------------------------------------------------------------------------------- /3D-ResNet/datasets/kinetics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import 
copy 9 | import re 10 | import random 11 | 12 | from utils import load_value_file 13 | 14 | 15 | def pil_loader(path): 16 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 17 | with open(path, 'rb') as f: 18 | with Image.open(f) as img: 19 | return img.convert('RGB') 20 | 21 | 22 | def accimage_loader(path): 23 | try: 24 | import accimage 25 | return accimage.Image(path) 26 | except IOError: 27 | # Potentially a decoding problem, fall back to PIL.Image 28 | return pil_loader(path) 29 | 30 | 31 | def get_default_image_loader(): 32 | from torchvision import get_image_backend 33 | if get_image_backend() == 'accimage': 34 | return accimage_loader 35 | else: 36 | return pil_loader 37 | 38 | 39 | def video_loader(video_dir_path, frame_indices, image_loader): 40 | video = [] 41 | for i in frame_indices: 42 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 43 | if os.path.exists(image_path): 44 | video.append(image_loader(image_path)) 45 | else: 46 | return video 47 | 48 | return video 49 | 50 | 51 | def get_default_video_loader(): 52 | image_loader = get_default_image_loader() 53 | return functools.partial(video_loader, image_loader=image_loader) 54 | 55 | 56 | def load_annotation_data(data_file_path): 57 | with open(data_file_path, 'r') as data_file: 58 | return json.load(data_file) 59 | 60 | 61 | def get_class_labels(data): 62 | class_labels_map = {} 63 | index = 0 64 | for class_label in data['labels']: 65 | class_labels_map[class_label] = index 66 | index += 1 67 | return class_labels_map 68 | 69 | 70 | def get_video_names_and_annotations(data, subset): 71 | video_names = [] 72 | annotations = [] 73 | 74 | for key, value in data['database'].items(): 75 | this_subset = value['subset'] 76 | if this_subset == subset: 77 | if subset == 'validation': 78 | key = re.sub("_\d+", "", key) 79 | if subset == 'testing': 80 | video_names.append('test/{}'.format(key)) 81 | else: 82 | label = value['annotations']['label'] 83 | video_names.append('{}/{}/{}'.format(subset, label.replace(" ", "_"), key)) 84 | annotations.append(value['annotations']) 85 | 86 | return video_names, annotations 87 | 88 | 89 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 90 | sample_duration): 91 | data = load_annotation_data(annotation_path) 92 | 93 | video_names, annotations = get_video_names_and_annotations(data, subset) 94 | class_to_idx = get_class_labels(data) 95 | idx_to_class = {} 96 | for name, label in class_to_idx.items(): 97 | idx_to_class[label] = name 98 | 99 | dataset = [] 100 | for i in range(len(video_names)): 101 | if i % 1000 == 0: 102 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 103 | 104 | video_path = os.path.join(root_path, video_names[i]) 105 | n_frames_file_path = os.path.join(video_path, 'n_frames') 106 | if not os.path.exists(video_path) or not os.path.exists(n_frames_file_path): 107 | # print("Path not found") 108 | continue 109 | 110 | n_frames = int(load_value_file(n_frames_file_path)) 111 | if n_frames <= 0: 112 | continue 113 | 114 | begin_t = 1 115 | end_t = n_frames 116 | sample = { 117 | 'video': video_path, 118 | 'segment': [begin_t, end_t], 119 | 'n_frames': n_frames, 120 | 'video_id': video_names[i][:-14].split('/')[1] 121 | } 122 | if len(annotations) != 0: 123 | sample['label'] = class_to_idx[annotations[i]['label']] 124 | else: 125 | sample['label'] = -1 126 | 127 | if n_samples_for_each_video == 1: 128 | sample['frame_indices'] = list(range(1, n_frames + 1)) 129 | 
dataset.append(sample) 130 | else: 131 | if n_samples_for_each_video > 1: 132 | step = max(1, 133 | math.ceil((n_frames - 1 - sample_duration) / 134 | (n_samples_for_each_video - 1))) 135 | else: 136 | step = sample_duration 137 | for j in range(1, n_frames, step): 138 | sample_j = copy.deepcopy(sample) 139 | sample_j['frame_indices'] = list( 140 | range(j, min(n_frames + 1, j + sample_duration))) 141 | dataset.append(sample_j) 142 | 143 | return dataset, idx_to_class 144 | 145 | 146 | class Kinetics(data.Dataset): 147 | """ 148 | Args: 149 | root (string): Root directory path. 150 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 151 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 152 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 153 | and returns a transformed version 154 | target_transform (callable, optional): A function/transform that takes in the 155 | target and transforms it. 156 | loader (callable, optional): A function to load an video given its path and frame indices. 157 | Attributes: 158 | classes (list): List of the class names. 159 | class_to_idx (dict): Dict with items (class_name, class_index). 160 | imgs (list): List of (image path, class_index) tuples 161 | """ 162 | 163 | def __init__(self, 164 | root_path, 165 | annotation_path, 166 | subset, 167 | n_samples_for_each_video=1, 168 | spatial_transform=None, 169 | temporal_transform=None, 170 | target_transform=None, 171 | sample_duration=16, 172 | get_loader=get_default_video_loader, 173 | num_vid_samples=1): 174 | self.data, self.class_names = make_dataset( 175 | root_path, annotation_path, subset, n_samples_for_each_video, 176 | sample_duration) 177 | 178 | self.spatial_transform = spatial_transform 179 | self.temporal_transform = temporal_transform 180 | self.target_transform = target_transform 181 | self.loader = get_loader() 182 | self.num_vid_samples = num_vid_samples 183 | self.sample_duration = sample_duration 184 | 185 | def __getitem__(self, index): 186 | """ 187 | Args: 188 | index (int): Index 189 | Returns: 190 | tuple: (image, target) where target is class_index of the target class. 
191 | """ 192 | path = self.data[index]['video'] 193 | 194 | frame_indices = self.data[index]['frame_indices'] 195 | if self.temporal_transform is not None: 196 | frame_indices = self.temporal_transform(frame_indices) 197 | 198 | target = self.data[index] 199 | if self.target_transform is not None: 200 | target = self.target_transform(target) 201 | 202 | if self.num_vid_samples == 1: 203 | clip = self.loader(path, frame_indices) 204 | if self.spatial_transform is not None: 205 | self.spatial_transform.randomize_parameters() 206 | clip = [self.spatial_transform(img) for img in clip] 207 | clip = torch.stack(clip, 0) 208 | clip = clip.permute(1, 0, 2, 3) 209 | 210 | return clip, target 211 | else: 212 | clips = [] 213 | for i in range(self.num_vid_samples): 214 | start = random.randint(0, len(frame_indices) - self.sample_duration - 1) 215 | inds = frame_indices[start: start + self.sample_duration] 216 | 217 | clip = self.loader(path, inds) 218 | if self.spatial_transform is not None: 219 | self.spatial_transform.randomize_parameters() 220 | clip = [self.spatial_transform(img) for img in clip] 221 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 222 | clips.append(clip) 223 | 224 | return clips[0], clips[1], clips[2], clips[3], clips[4], target 225 | 226 | def __len__(self): 227 | return len(self.data) 228 | -------------------------------------------------------------------------------- /3D-ResNet/opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def parse_opts(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument( 7 | '--root_path', 8 | default='/root/data/ActivityNet', 9 | type=str, 10 | help='Root directory path of data') 11 | parser.add_argument( 12 | '--video_path', 13 | default='video_kinetics_jpg', 14 | type=str, 15 | help='Directory path of Videos') 16 | parser.add_argument( 17 | '--annotation_path', 18 | default='kinetics.json', 19 | type=str, 20 | help='Annotation file path') 21 | parser.add_argument( 22 | '--result_path', 23 | default='results', 24 | type=str, 25 | help='Result directory path') 26 | parser.add_argument( 27 | '--dataset', 28 | default='kinetics', 29 | type=str, 30 | help='Used dataset (kinetics | ucf101 | hmdb51)') 31 | parser.add_argument( 32 | '--n_classes', 33 | default=400, 34 | type=int, 35 | help= 36 | 'Number of classes (kinetics: 400, ucf101: 101, hmdb51: 51)' 37 | ) 38 | parser.add_argument( 39 | '--n_finetune_classes', 40 | default=400, 41 | type=int, 42 | help= 43 | 'Number of classes for fine-tuning. n_classes is set to the number when pretraining.' 44 | ) 45 | parser.add_argument( 46 | '--sample_size', 47 | default=112, 48 | type=int, 49 | help='Height and width of inputs') 50 | parser.add_argument( 51 | '--sample_duration', 52 | default=16, 53 | type=int, 54 | help='Temporal duration of inputs') 55 | parser.add_argument( 56 | '--temp_stride', 57 | default=1, 58 | type=int, 59 | help='Temporal stride') 60 | parser.add_argument( 61 | '--initial_scale', 62 | default=1.0, 63 | type=float, 64 | help='Initial scale for multiscale cropping') 65 | parser.add_argument( 66 | '--n_scales', 67 | default=5, 68 | type=int, 69 | help='Number of scales for multiscale cropping') 70 | parser.add_argument( 71 | '--scale_step', 72 | default=0.84081289641525, 73 | type=float, 74 | help='Scale step for multiscale cropping') 75 | parser.add_argument( 76 | '--train_crop', 77 | default='corner', 78 | type=str, 79 | help= 80 | 'Spatial cropping method in training. random is uniform. 
corner is selection from 4 corners and 1 center. (random | corner | center)' 81 | ) 82 | parser.add_argument( 83 | '--learning_rate', 84 | default=0.1, 85 | type=float, 86 | help= 87 | 'Initial learning rate (divided by 10 while training by lr scheduler)') 88 | parser.add_argument('--momentum', default=0.9, type=float, help='Momentum') 89 | parser.add_argument( 90 | '--dampening', default=0.9, type=float, help='dampening of SGD') 91 | parser.add_argument( 92 | '--weight_decay', default=1e-3, type=float, help='Weight Decay') 93 | parser.add_argument( 94 | '--mean_dataset', 95 | default='activitynet', 96 | type=str, 97 | help= 98 | 'dataset for mean values of mean subtraction (activitynet | kinetics)') 99 | parser.add_argument( 100 | '--no_mean_norm', 101 | action='store_true', 102 | help='If true, inputs are not normalized by mean.') 103 | parser.set_defaults(no_mean_norm=False) 104 | parser.add_argument( 105 | '--std_norm', 106 | action='store_true', 107 | help='If true, inputs are normalized by standard deviation.') 108 | parser.set_defaults(std_norm=False) 109 | parser.add_argument( 110 | '--nesterov', action='store_true', help='Nesterov momentum') 111 | parser.set_defaults(nesterov=False) 112 | parser.add_argument( 113 | '--optimizer', 114 | default='sgd', 115 | type=str, 116 | help='Currently only support SGD') 117 | parser.add_argument( 118 | '--lr_patience', 119 | default=10, 120 | type=int, 121 | help='Patience of LR scheduler. See documentation of ReduceLROnPlateau.' 122 | ) 123 | parser.add_argument( 124 | '--batch_size', default=128, type=int, help='Batch Size') 125 | parser.add_argument( 126 | '--n_epochs', 127 | default=200, 128 | type=int, 129 | help='Number of total epochs to run') 130 | parser.add_argument( 131 | '--begin_epoch', 132 | default=1, 133 | type=int, 134 | help= 135 | 'Training begins at this epoch. Previous trained model indicated by resume_path is loaded.' 
136 | ) 137 | parser.add_argument( 138 | '--lr_threshold', 139 | default=0.0001, 140 | type=float, 141 | help='LR scheduler threshold') 142 | parser.add_argument( 143 | '--n_val_samples', 144 | default=3, 145 | type=int, 146 | help='Number of validation samples for each activity') 147 | parser.add_argument( 148 | '--resume_path', 149 | default='', 150 | type=str, 151 | help='Save data (.pth) of previous training') 152 | parser.add_argument( 153 | '--label_folder', 154 | default='', 155 | type=str, 156 | help='Folder that stores label encodings for the movies') 157 | parser.add_argument( 158 | '--pretrain_path', default='', type=str, help='Pretrained model (.pth)') 159 | parser.add_argument( 160 | '--ft_begin_index', 161 | default=0, 162 | type=int, 163 | help='Begin block index of fine-tuning') 164 | parser.add_argument( 165 | '--no_train', 166 | action='store_true', 167 | help='If true, training is not performed.') 168 | parser.set_defaults(no_train=False) 169 | parser.add_argument( 170 | '--no_val', 171 | action='store_true', 172 | help='If true, validation is not performed.') 173 | parser.set_defaults(no_val=False) 174 | parser.add_argument( 175 | '--test', action='store_true', help='If true, test is performed.') 176 | parser.set_defaults(test=False) 177 | parser.add_argument( 178 | '--test_subset', 179 | default='val', 180 | type=str, 181 | help='Used subset in test (val | test)') 182 | parser.add_argument( 183 | '--scale_in_test', 184 | default=1.0, 185 | type=float, 186 | help='Spatial scale in test') 187 | parser.add_argument( 188 | '--crop_position_in_test', 189 | default='c', 190 | type=str, 191 | help='Cropping method (c | tl | tr | bl | br) in test') 192 | parser.add_argument( 193 | '--no_softmax_in_test', 194 | action='store_true', 195 | help='If true, output for each clip is not normalized using softmax.') 196 | parser.set_defaults(no_softmax_in_test=False) 197 | parser.add_argument( 198 | '--no_cuda', action='store_true', help='If true, cuda is not used.') 199 | parser.set_defaults(no_cuda=False) 200 | parser.add_argument( 201 | '--n_threads', 202 | default=4, 203 | type=int, 204 | help='Number of threads for multi-thread loading') 205 | parser.add_argument( 206 | '--checkpoint', 207 | default=10, 208 | type=int, 209 | help='Trained model is saved at every this epochs.') 210 | parser.add_argument( 211 | '--no_hflip', 212 | action='store_true', 213 | help='If true holizontal flipping is not performed.') 214 | parser.set_defaults(no_hflip=False) 215 | parser.add_argument( 216 | '--norm_value', 217 | default=1, 218 | type=int, 219 | help= 220 | 'If 1, range of inputs is [0-255]. 
If 255, range of inputs is [0-1].') 221 | parser.add_argument( 222 | '--model', 223 | default='resnet', 224 | type=str, 225 | help='(resnet') 226 | parser.add_argument( 227 | '--model_depth', 228 | default=18, 229 | type=int, 230 | help='Depth of resnet (10 | 18 | 34 | 50 | 101)') 231 | parser.add_argument( 232 | '--resnet_shortcut', 233 | default='B', 234 | type=str, 235 | help='Shortcut type of resnet (A | B)') 236 | parser.add_argument( 237 | '--manual_seed', default=1, type=int, help='Manually set random seed') 238 | parser.add_argument('--gpu', nargs='*', help='GPU id') 239 | parser.add_argument( 240 | '--shot', 241 | default=-1, 242 | type=int, 243 | help='Number of training examples per category') 244 | 245 | args = parser.parse_args() 246 | 247 | return args 248 | -------------------------------------------------------------------------------- /3D-ResNet/datasets/hmdb51.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | import random 10 | import numpy as np 11 | from temporal_transforms import LoopPadding 12 | 13 | from utils import load_value_file 14 | 15 | 16 | def pil_loader(path): 17 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 18 | with open(path, 'rb') as f: 19 | with Image.open(f) as img: 20 | return img.convert('RGB') 21 | 22 | 23 | def accimage_loader(path): 24 | try: 25 | import accimage 26 | return accimage.Image(path) 27 | except IOError: 28 | # Potentially a decoding problem, fall back to PIL.Image 29 | return pil_loader(path) 30 | 31 | 32 | def get_default_image_loader(): 33 | from torchvision import get_image_backend 34 | if get_image_backend() == 'accimage': 35 | return accimage_loader 36 | else: 37 | return pil_loader 38 | 39 | 40 | def video_loader(video_dir_path, frame_indices, image_loader): 41 | video = [] 42 | for i in frame_indices: 43 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 44 | if os.path.exists(image_path): 45 | video.append(image_loader(image_path)) 46 | else: 47 | return video 48 | 49 | return video 50 | 51 | 52 | def get_default_video_loader(): 53 | image_loader = get_default_image_loader() 54 | return functools.partial(video_loader, image_loader=image_loader) 55 | 56 | 57 | def load_annotation_data(data_file_path): 58 | with open(data_file_path, 'r') as data_file: 59 | return json.load(data_file) 60 | 61 | 62 | def get_class_labels(data): 63 | class_labels_map = {} 64 | index = 0 65 | for class_label in data['labels']: 66 | class_labels_map[class_label] = index 67 | index += 1 68 | return class_labels_map 69 | 70 | 71 | def get_video_names_and_annotations(data, subset): 72 | video_names = [] 73 | annotations = [] 74 | 75 | for key, value in data['database'].items(): 76 | this_subset = value['subset'] 77 | if this_subset == subset: 78 | label = value['annotations']['label'] 79 | video_names.append('{}/{}'.format(label, key)) 80 | annotations.append(value['annotations']) 81 | 82 | return video_names, annotations 83 | 84 | 85 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 86 | sample_duration, shot=-1): 87 | data = load_annotation_data(annotation_path) 88 | video_names, annotations = get_video_names_and_annotations(data, subset) 89 | class_to_idx = get_class_labels(data) 90 | instance_count = {} 91 | idx_to_class = {} 92 | for name, label 
in class_to_idx.items(): 93 | instance_count[name] = 0 94 | idx_to_class[label] = name 95 | 96 | dataset = [] 97 | for i in range(len(video_names)): 98 | if i % 1000 == 0: 99 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 100 | 101 | video_path = os.path.join(root_path, video_names[i]) 102 | if not os.path.exists(video_path): 103 | print(video_path) 104 | continue 105 | 106 | n_frames_file_path = os.path.join(video_path, 'n_frames') 107 | n_frames = int(load_value_file(n_frames_file_path)) 108 | if n_frames <= 0: 109 | continue 110 | 111 | if (shot > 0) and (instance_count[annotations[i]['label']] == shot): 112 | continue 113 | 114 | begin_t = 1 115 | end_t = n_frames 116 | sample = { 117 | 'video': video_path, 118 | 'segment': [begin_t, end_t], 119 | 'n_frames': n_frames, 120 | 'video_id': video_names[i].split('/')[1], 121 | } 122 | if len(annotations) != 0: 123 | sample['label'] = class_to_idx[annotations[i]['label']] 124 | instance_count[annotations[i]['label']] += 1 125 | else: 126 | sample['label'] = -1 127 | 128 | if n_samples_for_each_video == 1: 129 | sample['frame_indices'] = list(range(1, n_frames + 1)) 130 | dataset.append(sample) 131 | else: 132 | if n_samples_for_each_video > 1: 133 | step = max(1, 134 | math.ceil((n_frames - 1 - sample_duration) / 135 | (n_samples_for_each_video - 1))) 136 | else: 137 | step = sample_duration 138 | for j in range(1, n_samples_for_each_video, step): 139 | sample_j = copy.deepcopy(sample) 140 | sample_j['frame_indices'] = list( 141 | range(j, min(n_frames + 1, j + sample_duration))) 142 | dataset.append(sample_j) 143 | 144 | print(len(dataset)) 145 | 146 | return dataset, idx_to_class 147 | 148 | 149 | class HMDB51(data.Dataset): 150 | """ 151 | Args: 152 | root (string): Root directory path. 153 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 154 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 155 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 156 | and returns a transformed version 157 | target_transform (callable, optional): A function/transform that takes in the 158 | target and transforms it. 159 | loader (callable, optional): A function to load an video given its path and frame indices. 160 | Attributes: 161 | classes (list): List of the class names. 162 | class_to_idx (dict): Dict with items (class_name, class_index). 163 | imgs (list): List of (image path, class_index) tuples 164 | """ 165 | 166 | def __init__(self, 167 | root_path, 168 | annotation_path, 169 | subset, 170 | n_samples_for_each_video=1, 171 | spatial_transform=None, 172 | temporal_transform=None, 173 | target_transform=None, 174 | sample_duration=16, 175 | get_loader=get_default_video_loader, 176 | num_vid_samples=1, 177 | shot=-1): 178 | self.data, self.class_names = make_dataset( 179 | root_path, annotation_path, subset, n_samples_for_each_video, 180 | sample_duration, shot=shot) 181 | 182 | self.spatial_transform = spatial_transform 183 | self.temporal_transform = temporal_transform 184 | self.target_transform = target_transform 185 | self.loader = get_loader() 186 | self.num_vid_samples = num_vid_samples 187 | self.sample_duration = sample_duration 188 | 189 | def __getitem__(self, index): 190 | """ 191 | Args: 192 | index (int): Index 193 | Returns: 194 | tuple: (image, target) where target is class_index of the target class. 
195 | """ 196 | path = self.data[index]['video'] 197 | 198 | target = self.data[index] 199 | if self.target_transform is not None: 200 | target = self.target_transform(target) 201 | 202 | frame_indices = self.data[index]['frame_indices'] 203 | if self.temporal_transform is not None: 204 | frame_indices = self.temporal_transform(frame_indices) 205 | 206 | if self.num_vid_samples == 1: 207 | clip = self.loader(path, frame_indices) 208 | if self.spatial_transform is not None: 209 | self.spatial_transform.randomize_parameters() 210 | clip = [self.spatial_transform(img) for img in clip] 211 | clip = torch.stack(clip, 0) 212 | clip = clip.permute(1, 0, 2, 3) 213 | 214 | return clip, target 215 | else: 216 | clips = [] 217 | for i in range(self.num_vid_samples): 218 | start = random.randint(0, len(frame_indices) - self.sample_duration - 1) 219 | inds = frame_indices[start: start + self.sample_duration] 220 | 221 | clip = self.loader(path, inds) 222 | if self.spatial_transform is not None: 223 | self.spatial_transform.randomize_parameters() 224 | clip = [self.spatial_transform(img) for img in clip] 225 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 226 | clips.append(clip) 227 | 228 | return clips[0], clips[1], clips[2], clips[3], clips[4], target 229 | 230 | def __len__(self): 231 | return len(self.data) 232 | -------------------------------------------------------------------------------- /3D-ResNet/datasets/ucf101.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | import random 10 | import numpy as np 11 | from temporal_transforms import LoopPadding 12 | 13 | def load_value_file(file_path): 14 | with open(file_path, 'r') as input_file: 15 | value = float(input_file.read().rstrip('\n\r')) 16 | 17 | return value 18 | 19 | 20 | def pil_loader(path): 21 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 22 | with open(path, 'rb') as f: 23 | with Image.open(f) as img: 24 | return img.convert('RGB') 25 | 26 | 27 | def accimage_loader(path): 28 | try: 29 | import accimage 30 | return accimage.Image(path) 31 | except IOError: 32 | # Potentially a decoding problem, fall back to PIL.Image 33 | return pil_loader(path) 34 | 35 | 36 | def get_default_image_loader(): 37 | from torchvision import get_image_backend 38 | if get_image_backend() == 'accimage': 39 | return accimage_loader 40 | else: 41 | return pil_loader 42 | 43 | 44 | def video_loader(video_dir_path, frame_indices, image_loader): 45 | video = [] 46 | for i in frame_indices: 47 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 48 | if os.path.exists(image_path): 49 | video.append(image_loader(image_path)) 50 | else: 51 | return video 52 | 53 | return video 54 | 55 | 56 | def get_default_video_loader(): 57 | image_loader = get_default_image_loader() 58 | return functools.partial(video_loader, image_loader=image_loader) 59 | 60 | 61 | def load_annotation_data(data_file_path): 62 | with open(data_file_path, 'r') as data_file: 63 | return json.load(data_file) 64 | 65 | 66 | def get_class_labels(data): 67 | class_labels_map = {} 68 | index = 0 69 | for class_label in data['labels']: 70 | class_labels_map[class_label] = index 71 | index += 1 72 | return class_labels_map 73 | 74 | 75 | def get_video_names_and_annotations(data, subset): 76 | video_names = [] 77 | annotations = [] 78 
| 79 | for key, value in data['database'].items(): 80 | this_subset = value['subset'] 81 | if this_subset == subset: 82 | label = value['annotations']['label'] 83 | video_names.append('{}/{}'.format(label, key)) 84 | annotations.append(value['annotations']) 85 | 86 | return video_names, annotations 87 | 88 | 89 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 90 | sample_duration, shot=-1): 91 | data = load_annotation_data(annotation_path) 92 | video_names, annotations = get_video_names_and_annotations(data, subset) 93 | class_to_idx = get_class_labels(data) 94 | instance_count = {} 95 | idx_to_class = {} 96 | for name, label in class_to_idx.items(): 97 | instance_count[name] = 0 98 | idx_to_class[label] = name 99 | 100 | dataset = [] 101 | for i in range(len(video_names)): 102 | if i % 1000 == 0: 103 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 104 | 105 | video_path = os.path.join(root_path, video_names[i]) 106 | if not os.path.exists(video_path): 107 | continue 108 | 109 | n_frames_file_path = os.path.join(video_path, 'n_frames') 110 | n_frames = int(load_value_file(n_frames_file_path)) 111 | if n_frames <= 0: 112 | continue 113 | 114 | if (shot > 0) and (instance_count[annotations[i]['label']] == shot): 115 | continue 116 | 117 | begin_t = 1 118 | end_t = n_frames 119 | sample = { 120 | 'video': video_path, 121 | 'segment': [begin_t, end_t], 122 | 'n_frames': n_frames, 123 | 'video_id': video_names[i].split('/')[1] 124 | } 125 | if len(annotations) != 0: 126 | sample['label'] = class_to_idx[annotations[i]['label']] 127 | instance_count[annotations[i]['label']] += 1 128 | else: 129 | sample['label'] = -1 130 | 131 | if n_samples_for_each_video == 1: 132 | sample['frame_indices'] = list(range(1, n_frames + 1)) 133 | dataset.append(sample) 134 | else: 135 | if n_samples_for_each_video > 1: 136 | step = max(1, 137 | math.ceil((n_frames - 1 - sample_duration) / 138 | (n_samples_for_each_video - 1))) 139 | else: 140 | step = sample_duration 141 | for j in range(1, n_samples_for_each_video, step): 142 | sample_j = copy.deepcopy(sample) 143 | sample_j['frame_indices'] = list( 144 | range(j, min(n_frames + 1, j + sample_duration))) 145 | dataset.append(sample_j) 146 | 147 | return dataset, idx_to_class 148 | 149 | 150 | class UCF101(data.Dataset): 151 | """ 152 | Args: 153 | root (string): Root directory path. 154 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 155 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 156 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 157 | and returns a transformed version 158 | target_transform (callable, optional): A function/transform that takes in the 159 | target and transforms it. 160 | loader (callable, optional): A function to load an video given its path and frame indices. 161 | Attributes: 162 | classes (list): List of the class names. 163 | class_to_idx (dict): Dict with items (class_name, class_index). 
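# The shot argument of make_dataset above caps the number of training videos kept per
# class: instance_count tracks how many videos of each label have been accepted, and
# further videos of that label are skipped once the cap is reached. A toy sketch of the
# same counting logic on placeholder labels:
labels = ['run', 'run', 'jump', 'run', 'jump', 'jump']
shot = 2
instance_count = {}
kept = []
for lbl in labels:
    if instance_count.get(lbl, 0) == shot:   # this class already has `shot` examples
        continue
    kept.append(lbl)
    instance_count[lbl] = instance_count.get(lbl, 0) + 1
print(kept)  # ['run', 'run', 'jump', 'jump'] -> at most 2 examples per class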
164 | imgs (list): List of (image path, class_index) tuples 165 | """ 166 | 167 | def __init__(self, 168 | root_path, 169 | annotation_path, 170 | subset, 171 | n_samples_for_each_video=1, 172 | spatial_transform=None, 173 | temporal_transform=None, 174 | target_transform=None, 175 | sample_duration=16, 176 | get_loader=get_default_video_loader, 177 | label_folder=None, 178 | num_vid_samples=1, 179 | shot=-1): 180 | self.data, self.class_names = make_dataset( 181 | root_path, annotation_path, subset, n_samples_for_each_video, 182 | sample_duration, shot=shot) 183 | print(len(self.data)) 184 | 185 | self.spatial_transform = spatial_transform 186 | self.temporal_transform = temporal_transform 187 | self.target_transform = target_transform 188 | self.loader = get_loader() 189 | self.sample_duration = sample_duration 190 | self.num_vid_samples = num_vid_samples 191 | 192 | def __getitem__(self, index): 193 | """ 194 | Args: 195 | index (int): Index 196 | Returns: 197 | tuple: (image, target) where target is class_index of the target class. 198 | """ 199 | path = self.data[index]['video'] 200 | 201 | frame_indices = self.data[index]['frame_indices'] 202 | if self.temporal_transform is not None: 203 | frame_indices = self.temporal_transform(frame_indices) 204 | 205 | target = self.data[index] 206 | if self.target_transform is not None: 207 | target = self.target_transform(target) 208 | 209 | if self.num_vid_samples == 1: 210 | clip = self.loader(path, frame_indices) 211 | if self.spatial_transform is not None: 212 | self.spatial_transform.randomize_parameters() 213 | clip = [self.spatial_transform(img) for img in clip] 214 | clip = torch.stack(clip, 0) 215 | clip = clip.permute(1, 0, 2, 3) 216 | 217 | return clip, target 218 | else: 219 | clips = [] 220 | for i in range(self.num_vid_samples): 221 | start = random.randint(0, len(frame_indices) - self.sample_duration - 1) 222 | inds = frame_indices[start: start + self.sample_duration] 223 | 224 | clip = self.loader(path, inds) 225 | if self.spatial_transform is not None: 226 | self.spatial_transform.randomize_parameters() 227 | clip = [self.spatial_transform(img) for img in clip] 228 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 229 | clips.append(clip) 230 | 231 | return clips[0], clips[1], clips[2], clips[3], clips[4], target 232 | 233 | def __len__(self): 234 | return len(self.data) 235 | -------------------------------------------------------------------------------- /3D-ResNet/models/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = [ 9 | 'ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 10 | 'resnet152', 'resnet200' 11 | ] 12 | 13 | 14 | def conv3x3x3(in_planes, out_planes, stride=1): 15 | # 3x3x3 convolution with padding 16 | return nn.Conv3d( 17 | in_planes, 18 | out_planes, 19 | kernel_size=3, 20 | stride=stride, 21 | padding=1, 22 | bias=False) 23 | 24 | 25 | def downsample_basic_block(x, planes, stride): 26 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 27 | zero_pads = torch.Tensor( 28 | out.size(0), planes - out.size(1), out.size(2), out.size(3), 29 | out.size(4)).zero_() 30 | if isinstance(out.data, torch.cuda.FloatTensor): 31 | zero_pads = zero_pads.cuda() 32 | 33 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 34 | 35 | return out 36 | 37 | 38 | class 
BasicBlock(nn.Module): 39 | expansion = 1 40 | 41 | def __init__(self, inplanes, planes, stride=1, downsample=None): 42 | super(BasicBlock, self).__init__() 43 | self.conv1 = conv3x3x3(inplanes, planes, stride) 44 | self.bn1 = nn.BatchNorm3d(planes) 45 | self.relu = nn.ReLU(inplace=True) 46 | self.conv2 = conv3x3x3(planes, planes) 47 | self.bn2 = nn.BatchNorm3d(planes) 48 | self.downsample = downsample 49 | self.stride = stride 50 | 51 | def forward(self, x): 52 | residual = x 53 | 54 | out = self.conv1(x) 55 | out = self.bn1(out) 56 | out = self.relu(out) 57 | 58 | out = self.conv2(out) 59 | out = self.bn2(out) 60 | 61 | if self.downsample is not None: 62 | residual = self.downsample(x) 63 | 64 | out += residual 65 | out = self.relu(out) 66 | 67 | return out 68 | 69 | 70 | class Bottleneck(nn.Module): 71 | expansion = 4 72 | 73 | def __init__(self, inplanes, planes, stride=1, downsample=None): 74 | super(Bottleneck, self).__init__() 75 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 76 | self.bn1 = nn.BatchNorm3d(planes) 77 | self.conv2 = nn.Conv3d( 78 | planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 79 | self.bn2 = nn.BatchNorm3d(planes) 80 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 81 | self.bn3 = nn.BatchNorm3d(planes * 4) 82 | self.relu = nn.ReLU(inplace=True) 83 | self.downsample = downsample 84 | self.stride = stride 85 | 86 | def forward(self, x): 87 | residual = x 88 | 89 | out = self.conv1(x) 90 | out = self.bn1(out) 91 | out = self.relu(out) 92 | 93 | out = self.conv2(out) 94 | out = self.bn2(out) 95 | out = self.relu(out) 96 | 97 | out = self.conv3(out) 98 | out = self.bn3(out) 99 | 100 | if self.downsample is not None: 101 | residual = self.downsample(x) 102 | 103 | out += residual 104 | out = self.relu(out) 105 | 106 | return out 107 | 108 | 109 | class ResNet(nn.Module): 110 | 111 | def __init__(self, 112 | block, 113 | layers, 114 | sample_size, 115 | sample_duration, 116 | shortcut_type='B', 117 | num_classes=400, 118 | input_chan=3): 119 | self.inplanes = 64 120 | super(ResNet, self).__init__() 121 | self.conv1 = nn.Conv3d( 122 | input_chan, 123 | 64, 124 | kernel_size=7, 125 | stride=(1, 2, 2), 126 | padding=(3, 3, 3), 127 | bias=False) 128 | self.bn1 = nn.BatchNorm3d(64) 129 | self.relu = nn.ReLU(inplace=True) 130 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 131 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type) 132 | self.layer2 = self._make_layer( 133 | block, 128, layers[1], shortcut_type, stride=2) 134 | self.layer3 = self._make_layer( 135 | block, 256, layers[2], shortcut_type, stride=2) 136 | self.layer4 = self._make_layer( 137 | block, 512, layers[3], shortcut_type, stride=2) 138 | last_duration = int(math.ceil(sample_duration / 16)) 139 | last_size = int(math.ceil(sample_size / 32)) 140 | self.avgpool = nn.AvgPool3d( 141 | (last_duration, last_size, last_size), stride=1) 142 | self.fc = nn.Linear(512 * block.expansion, num_classes) 143 | 144 | for m in self.modules(): 145 | if isinstance(m, nn.Conv3d): 146 | m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out') 147 | elif isinstance(m, nn.BatchNorm3d): 148 | m.weight.data.fill_(1) 149 | m.bias.data.zero_() 150 | 151 | def freeze_layers(self, freeze_at): 152 | if freeze_at < 0: 153 | return 154 | else: 155 | for p in self.bn1.parameters(): 156 | p.requires_grad = False 157 | 158 | for stage in range(freeze_at): 159 | print("freezing at %d" % stage) 160 | if stage == 0: 161 | m = 
self.conv1 162 | else: 163 | m = getattr(self, "layer" + str(stage)) 164 | for p in m.parameters(): 165 | p.requires_grad = False 166 | 167 | 168 | 169 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 170 | downsample = None 171 | if stride != 1 or self.inplanes != planes * block.expansion: 172 | if shortcut_type == 'A': 173 | downsample = partial( 174 | downsample_basic_block, 175 | planes=planes * block.expansion, 176 | stride=stride) 177 | else: 178 | downsample = nn.Sequential( 179 | nn.Conv3d( 180 | self.inplanes, 181 | planes * block.expansion, 182 | kernel_size=1, 183 | stride=stride, 184 | bias=False), nn.BatchNorm3d(planes * block.expansion)) 185 | 186 | layers = [] 187 | layers.append(block(self.inplanes, planes, stride, downsample)) 188 | self.inplanes = planes * block.expansion 189 | for i in range(1, blocks): 190 | layers.append(block(self.inplanes, planes)) 191 | 192 | return nn.Sequential(*layers) 193 | 194 | def forward(self, x): 195 | x = self.conv1(x) 196 | x = self.bn1(x) 197 | x = self.relu(x) 198 | x = self.maxpool(x) 199 | 200 | x = self.layer1(x) 201 | x = self.layer2(x) 202 | x = self.layer3(x) 203 | x = self.layer4(x) 204 | 205 | x = self.avgpool(x) 206 | 207 | x = x.view(x.size(0), -1) 208 | 209 | if self.fc is not None: 210 | x = self.fc(x) 211 | 212 | return x 213 | 214 | 215 | def get_fine_tuning_parameters(model, ft_begin_index): 216 | if ft_begin_index == 0: 217 | return model.parameters() 218 | 219 | ft_module_names = [] 220 | for i in range(ft_begin_index, 5): 221 | ft_module_names.append('layer{}'.format(i)) 222 | ft_module_names.append('fc') 223 | 224 | parameters = [] 225 | for k, v in model.named_parameters(): 226 | for ft_module in ft_module_names: 227 | if ft_module in k: 228 | parameters.append({'params': v}) 229 | break 230 | else: 231 | parameters.append({'params': v, 'lr': 0.0}) 232 | 233 | return parameters 234 | 235 | 236 | def resnet10(**kwargs): 237 | """Constructs a ResNet-18 model. 238 | """ 239 | model = ResNet(BasicBlock, [1, 1, 1, 1], **kwargs) 240 | return model 241 | 242 | 243 | def resnet18(**kwargs): 244 | """Constructs a ResNet-18 model. 245 | """ 246 | model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) 247 | return model 248 | 249 | 250 | def resnet34(**kwargs): 251 | """Constructs a ResNet-34 model. 252 | """ 253 | model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) 254 | return model 255 | 256 | 257 | def resnet50(**kwargs): 258 | """Constructs a ResNet-50 model. 259 | """ 260 | model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) 261 | return model 262 | 263 | 264 | def resnet101(**kwargs): 265 | """Constructs a ResNet-101 model. 266 | """ 267 | model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) 268 | return model 269 | 270 | 271 | def resnet152(**kwargs): 272 | """Constructs a ResNet-101 model. 273 | """ 274 | model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) 275 | return model 276 | 277 | 278 | def resnet200(**kwargs): 279 | """Constructs a ResNet-101 model. 
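# A minimal fine-tuning sketch using the factory functions above, mirroring the
# ft_begin_index=4 configuration that appears in nohup.out later in this repository:
# everything below layer4 is frozen and only layer4/fc receive a non-zero learning rate.
# The head size, clip shape and hyper-parameters here are placeholders.
import torch

model = resnet18(sample_size=112, sample_duration=16, shortcut_type='B', num_classes=400)
model.fc = torch.nn.Linear(512 * BasicBlock.expansion, 101)  # replace the 400-way Kinetics head
model.freeze_layers(4)                                       # freezes conv1/bn1 and layer1..layer3

params = get_fine_tuning_parameters(model, ft_begin_index=4) # lr=0.0 groups for frozen modules
optimizer = torch.optim.SGD(params, lr=0.001, momentum=0.9, weight_decay=5e-5)

x = torch.randn(2, 3, 16, 112, 112)  # (batch, channels, frames, height, width)
print(model(x).shape)                # torch.Size([2, 101])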
280 | """ 281 | model = ResNet(Bottleneck, [3, 24, 36, 3], **kwargs) 282 | return model 283 | -------------------------------------------------------------------------------- /LocalAggregation/src/models/resnet3d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = [ 9 | 'ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 10 | 'resnet152', 'resnet200' 11 | ] 12 | 13 | 14 | def conv3x3x3(in_planes, out_planes, stride=1): 15 | # 3x3x3 convolution with padding 16 | return nn.Conv3d( 17 | in_planes, 18 | out_planes, 19 | kernel_size=3, 20 | stride=stride, 21 | padding=1, 22 | bias=False) 23 | 24 | 25 | def downsample_basic_block(x, planes, stride): 26 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 27 | zero_pads = torch.Tensor( 28 | out.size(0), planes - out.size(1), out.size(2), out.size(3), 29 | out.size(4)).zero_() 30 | if isinstance(out.data, torch.cuda.FloatTensor): 31 | zero_pads = zero_pads.cuda() 32 | 33 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 34 | 35 | return out 36 | 37 | 38 | class BasicBlock(nn.Module): 39 | expansion = 1 40 | 41 | def __init__(self, inplanes, planes, stride=1, downsample=None): 42 | super(BasicBlock, self).__init__() 43 | self.conv1 = conv3x3x3(inplanes, planes, stride) 44 | self.bn1 = nn.BatchNorm3d(planes) 45 | self.relu = nn.ReLU(inplace=True) 46 | self.conv2 = conv3x3x3(planes, planes) 47 | self.bn2 = nn.BatchNorm3d(planes) 48 | self.downsample = downsample 49 | self.stride = stride 50 | 51 | def forward(self, x): 52 | residual = x 53 | 54 | out = self.conv1(x) 55 | out = self.bn1(out) 56 | out = self.relu(out) 57 | 58 | out = self.conv2(out) 59 | out = self.bn2(out) 60 | 61 | if self.downsample is not None: 62 | residual = self.downsample(x) 63 | 64 | out += residual 65 | out = self.relu(out) 66 | 67 | return out 68 | 69 | 70 | class Bottleneck(nn.Module): 71 | expansion = 4 72 | 73 | def __init__(self, inplanes, planes, stride=1, downsample=None): 74 | super(Bottleneck, self).__init__() 75 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 76 | self.bn1 = nn.BatchNorm3d(planes) 77 | self.conv2 = nn.Conv3d( 78 | planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 79 | self.bn2 = nn.BatchNorm3d(planes) 80 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 81 | self.bn3 = nn.BatchNorm3d(planes * 4) 82 | self.relu = nn.ReLU(inplace=True) 83 | self.downsample = downsample 84 | self.stride = stride 85 | 86 | def forward(self, x): 87 | residual = x 88 | 89 | out = self.conv1(x) 90 | out = self.bn1(out) 91 | out = self.relu(out) 92 | 93 | out = self.conv2(out) 94 | out = self.bn2(out) 95 | out = self.relu(out) 96 | 97 | out = self.conv3(out) 98 | out = self.bn3(out) 99 | 100 | if self.downsample is not None: 101 | residual = self.downsample(x) 102 | 103 | out += residual 104 | out = self.relu(out) 105 | 106 | return out 107 | 108 | 109 | class ResNet(nn.Module): 110 | 111 | def __init__(self, 112 | block, 113 | layers, 114 | sample_size, 115 | sample_duration, 116 | shortcut_type='B', 117 | num_classes=400, 118 | input_chan=3): 119 | self.inplanes = 64 120 | super(ResNet, self).__init__() 121 | self.conv1 = nn.Conv3d( 122 | input_chan, 123 | 64, 124 | kernel_size=7, 125 | stride=(1, 2, 2), 126 | padding=(3, 3, 3), 127 | bias=False) 128 | self.bn1 = 
nn.BatchNorm3d(64) 129 | self.relu = nn.ReLU(inplace=True) 130 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 131 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type) 132 | self.layer2 = self._make_layer( 133 | block, 128, layers[1], shortcut_type, stride=2) 134 | self.layer3 = self._make_layer( 135 | block, 256, layers[2], shortcut_type, stride=2) 136 | self.layer4 = self._make_layer( 137 | block, 512, layers[3], shortcut_type, stride=2) 138 | last_duration = int(math.ceil(sample_duration / 16)) 139 | last_size = int(math.ceil(sample_size / 32)) 140 | self.avgpool = nn.AvgPool3d( 141 | (last_duration, last_size, last_size), stride=1) 142 | self.fc = nn.Linear(512 * block.expansion, num_classes) 143 | 144 | for m in self.modules(): 145 | if isinstance(m, nn.Conv3d): 146 | nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu') 147 | elif isinstance(m, nn.Linear): 148 | nn.init.xavier_uniform_(m.weight) 149 | nn.init.constant_(m.bias, 0) 150 | elif isinstance(m, (nn.BatchNorm3d, nn.GroupNorm)): 151 | nn.init.constant_(m.weight, 1) 152 | nn.init.constant_(m.bias, 0) 153 | 154 | def freeze_layers(self, freeze_at): 155 | if freeze_at < 0: 156 | return 157 | else: 158 | for p in self.bn1.parameters(): 159 | p.requires_grad = False 160 | 161 | for stage in range(freeze_at): 162 | print("freezing at %d" % stage) 163 | if stage == 0: 164 | m = self.conv1 165 | else: 166 | m = getattr(self, "layer" + str(stage)) 167 | for p in m.parameters(): 168 | p.requires_grad = False 169 | 170 | 171 | 172 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 173 | downsample = None 174 | if stride != 1 or self.inplanes != planes * block.expansion: 175 | if shortcut_type == 'A': 176 | downsample = partial( 177 | downsample_basic_block, 178 | planes=planes * block.expansion, 179 | stride=stride) 180 | else: 181 | downsample = nn.Sequential( 182 | nn.Conv3d( 183 | self.inplanes, 184 | planes * block.expansion, 185 | kernel_size=1, 186 | stride=stride, 187 | bias=False), nn.BatchNorm3d(planes * block.expansion)) 188 | 189 | layers = [] 190 | layers.append(block(self.inplanes, planes, stride, downsample)) 191 | self.inplanes = planes * block.expansion 192 | for i in range(1, blocks): 193 | layers.append(block(self.inplanes, planes)) 194 | 195 | return nn.Sequential(*layers) 196 | 197 | def forward(self, x): 198 | x = self.conv1(x) 199 | x = self.bn1(x) 200 | x = self.relu(x) 201 | x = self.maxpool(x) 202 | 203 | x = self.layer1(x) 204 | x = self.layer2(x) 205 | x = self.layer3(x) 206 | x = self.layer4(x) 207 | 208 | x = self.avgpool(x) 209 | 210 | x = x.view(x.size(0), -1) 211 | 212 | if self.fc is not None: 213 | x = self.fc(x) 214 | 215 | return x 216 | 217 | 218 | def get_fine_tuning_parameters(model, ft_begin_index): 219 | if ft_begin_index == 0: 220 | return model.parameters() 221 | 222 | ft_module_names = [] 223 | for i in range(ft_begin_index, 5): 224 | ft_module_names.append('layer{}'.format(i)) 225 | ft_module_names.append('fc') 226 | 227 | parameters = [] 228 | for k, v in model.named_parameters(): 229 | for ft_module in ft_module_names: 230 | if ft_module in k: 231 | parameters.append({'params': v}) 232 | break 233 | else: 234 | parameters.append({'params': v, 'lr': 0.0}) 235 | 236 | return parameters 237 | 238 | 239 | def resnet10(**kwargs): 240 | """Constructs a ResNet-18 model. 
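# The average-pooling kernel above is derived from the clip length and crop size as
# ceil(sample_duration / 16) and ceil(sample_size / 32). For instance, the 64-frame,
# 112-pixel configuration used in nohup.out yields a (4, 4, 4) kernel:
import math

def avgpool_kernel(sample_duration, sample_size):
    last_duration = int(math.ceil(sample_duration / 16))
    last_size = int(math.ceil(sample_size / 32))
    return (last_duration, last_size, last_size)

print(avgpool_kernel(16, 112))  # (1, 4, 4)
print(avgpool_kernel(64, 112))  # (4, 4, 4) -- matches AvgPool3d(kernel_size=(4, 4, 4)) in the log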
241 | """ 242 | model = ResNet(BasicBlock, [1, 1, 1, 1], **kwargs) 243 | return model 244 | 245 | 246 | def resnet18(**kwargs): 247 | """Constructs a ResNet-18 model. 248 | """ 249 | model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) 250 | return model 251 | 252 | 253 | def resnet34(**kwargs): 254 | """Constructs a ResNet-34 model. 255 | """ 256 | model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) 257 | return model 258 | 259 | 260 | def resnet50(**kwargs): 261 | """Constructs a ResNet-50 model. 262 | """ 263 | model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) 264 | return model 265 | 266 | 267 | def resnet101(**kwargs): 268 | """Constructs a ResNet-101 model. 269 | """ 270 | model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) 271 | return model 272 | 273 | 274 | def resnet152(**kwargs): 275 | """Constructs a ResNet-101 model. 276 | """ 277 | model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) 278 | return model 279 | 280 | 281 | def resnet200(**kwargs): 282 | """Constructs a ResNet-101 model. 283 | """ 284 | model = ResNet(Bottleneck, [3, 24, 36, 3], **kwargs) 285 | return model 286 | -------------------------------------------------------------------------------- /LocalAggregation/src/datasets/kinetics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | import re 10 | import numpy as np 11 | from src.objectives.localagg import MemoryBank 12 | 13 | 14 | def load_value_file(file_path): 15 | with open(file_path, 'r') as input_file: 16 | value = float(input_file.read().rstrip('\n\r')) 17 | 18 | return value 19 | 20 | 21 | def pil_loader(path): 22 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 23 | with open(path, 'rb') as f: 24 | with Image.open(f) as img: 25 | return img.convert('RGB') 26 | 27 | 28 | def accimage_loader(path): 29 | try: 30 | import accimage 31 | return accimage.Image(path) 32 | except IOError: 33 | # Potentially a decoding problem, fall back to PIL.Image 34 | return pil_loader(path) 35 | 36 | 37 | def get_default_image_loader(): 38 | from torchvision import get_image_backend 39 | if get_image_backend() == 'accimage': 40 | return accimage_loader 41 | else: 42 | return pil_loader 43 | 44 | 45 | def video_loader(video_dir_path, frame_indices, image_loader): 46 | video = [] 47 | format = 'image_{:05d}.jpg' 48 | if not os.path.exists(os.path.join(video_dir_path, format.format(1))): 49 | format = "frame{:d}.jpg" 50 | frame_indices = [x - 1 for x in frame_indices] 51 | for i in frame_indices: 52 | image_path = os.path.join(video_dir_path, format.format(i)) 53 | if os.path.exists(image_path): 54 | video.append(image_loader(image_path)) 55 | else: 56 | return video 57 | 58 | return video 59 | 60 | 61 | def get_default_video_loader(): 62 | image_loader = get_default_image_loader() 63 | return functools.partial(video_loader, image_loader=image_loader) 64 | 65 | 66 | def load_annotation_data(data_file_path): 67 | with open(data_file_path, 'r') as data_file: 68 | return json.load(data_file) 69 | 70 | 71 | def get_class_labels(data): 72 | class_labels_map = {} 73 | index = 0 74 | for class_label in data['labels']: 75 | class_labels_map[class_label] = index 76 | index += 1 77 | return class_labels_map 78 | 79 | 80 | def get_video_names_and_annotations(data, subset): 81 | video_names = [] 82 | annotations = [] 83 | 84 | for key, value in 
data['database'].items(): 85 | this_subset = value['subset'] 86 | if this_subset == subset: 87 | if subset == 'validation': 88 | key = re.sub("_\d+", "", key) 89 | if subset == 'testing': 90 | video_names.append('test/{}'.format(key)) 91 | else: 92 | label = value['annotations']['label'] 93 | video_names.append('{}/{}/{}'.format(subset, label.replace(" ", "_"), key)) 94 | annotations.append(value['annotations']) 95 | 96 | return video_names, annotations 97 | 98 | 99 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 100 | sample_duration, load_fvs=False, fv_path=None): 101 | data = load_annotation_data(annotation_path) 102 | 103 | video_names, annotations = get_video_names_and_annotations(data, subset) 104 | class_to_idx = get_class_labels(data) 105 | idx_to_class = {} 106 | for name, label in class_to_idx.items(): 107 | idx_to_class[label] = name 108 | 109 | dataset = [] 110 | fvs = [] 111 | for i in range(len(video_names)): 112 | if i % 1000 == 0: 113 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 114 | 115 | video_path = os.path.join(root_path, video_names[i]) 116 | if not os.path.exists(video_path): 117 | video_path = video_path.replace("/training/", "/validation/") 118 | video_names[i] = video_names[i].replace("training/", "validation/") 119 | if not os.path.exists(video_path): 120 | video_path = video_path.replace("/validation/", "/test/") 121 | video_names[i] = video_names[i].replace("validation/", "test/") 122 | if not os.path.exists(video_path): 123 | continue 124 | 125 | n_frames_file_path = os.path.join(video_path, 'n_frames') 126 | n_frames = int(load_value_file(n_frames_file_path)) 127 | if n_frames <= 0: 128 | continue 129 | 130 | if load_fvs: 131 | fv_vid_path = os.path.join(fv_path, video_names[i]) + ".dat" 132 | if os.path.exists(fv_vid_path): 133 | fv = torch.load(fv_vid_path) 134 | fvs.append(fv.cpu().squeeze()) 135 | else: 136 | continue 137 | 138 | begin_t = 1 139 | end_t = n_frames 140 | sample = { 141 | 'video': video_path, 142 | 'n_frames': n_frames, 143 | } 144 | if len(annotations) != 0: 145 | sample['label'] = class_to_idx[annotations[i]['label']] 146 | else: 147 | sample['label'] = -1 148 | 149 | if n_samples_for_each_video == 1: 150 | sample['frame_indices'] = list(range(1, n_frames + 1)) 151 | dataset.append(sample) 152 | else: 153 | if n_samples_for_each_video > 1: 154 | step = max(1, 155 | math.ceil((n_frames - 1 - sample_duration) / 156 | (n_samples_for_each_video - 1))) 157 | else: 158 | step = sample_duration 159 | for j in range(1, n_frames, step): 160 | if (j + sample_duration) > n_frames: 161 | break 162 | sample_j = copy.deepcopy(sample) 163 | sample_j['frame_indices'] = list( 164 | range(j, min(n_frames + 1, j + sample_duration))) 165 | sample_j['n_frames'] = len(sample_j['frame_indices']) 166 | dataset.append(sample_j) 167 | 168 | if load_fvs: 169 | fvs = torch.stack(fvs) 170 | 171 | return dataset, idx_to_class, fvs 172 | 173 | 174 | class Kinetics(data.Dataset): 175 | """ 176 | Args: 177 | root (string): Root directory path. 178 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 179 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 180 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 181 | and returns a transformed version 182 | target_transform (callable, optional): A function/transform that takes in the 183 | target and transforms it. 
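# A minimal usage sketch for the Kinetics dataset above. Unlike the supervised
# HMDB51/UCF101 datasets, __getitem__ also returns the sample index, which lets the
# training loop address per-video entries in the memory bank of the local aggregation
# objective (MemoryBank is imported at the top of this file). Paths are placeholders;
# the transforms come from this package.
from src.datasets.spatial_transforms import Compose, MultiScaleRandomCrop, ToTensor

spatial = Compose([MultiScaleRandomCrop(scales=[1.0, 0.875], size=112), ToTensor(255)])
dataset = Kinetics(
    root_path='/path/to/kinetics_jpeg',        # hypothetical path to extracted frames
    annotation_path='/path/to/kinetics.json',  # hypothetical annotation file
    train=True,
    spatial_transform=spatial,
    temporal_transform=None,                   # None keeps the full frame range of the video
    sample_duration=16)

index, clip, target = dataset[0]
print(index, clip.shape)  # clip is (3, n_frames, 112, 112) after the permute above;
                          # index is what the objective uses to look up this video's embedding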
184 | loader (callable, optional): A function to load an video given its path and frame indices. 185 | Attributes: 186 | classes (list): List of the class names. 187 | class_to_idx (dict): Dict with items (class_name, class_index). 188 | imgs (list): List of (image path, class_index) tuples 189 | """ 190 | 191 | def __init__(self, 192 | root_path=None, 193 | annotation_path=None, 194 | train=True, 195 | n_samples_for_each_video=1, 196 | spatial_transform=None, 197 | temporal_transform=None, 198 | target_transform=None, 199 | sample_duration=16, 200 | get_loader=get_default_video_loader, 201 | load_fvs=False, 202 | fv_path=None): 203 | 204 | subset = 'validation' 205 | if train: 206 | subset = 'training' 207 | 208 | self.data, self.class_names, fvs = make_dataset( 209 | root_path, annotation_path, subset, n_samples_for_each_video, 210 | sample_duration, load_fvs=load_fvs, fv_path=fv_path) 211 | 212 | self.spatial_transform = spatial_transform 213 | self.temporal_transform = temporal_transform 214 | self.target_transform = target_transform 215 | self.loader = get_loader() 216 | 217 | self.fvs = fvs 218 | 219 | def __getitem__(self, index): 220 | """ 221 | Args: 222 | index (int): Index 223 | Returns: 224 | tuple: (image, target) where target is class_index of the target class. 225 | """ 226 | path = self.data[index]['video'] 227 | 228 | frame_indices = self.data[index]['frame_indices'] 229 | 230 | if self.temporal_transform is not None: 231 | frame_indices = self.temporal_transform(frame_indices) 232 | 233 | clip = self.loader(path, frame_indices) 234 | if len(clip) == 0: 235 | print(path) 236 | print(frame_indices) 237 | 238 | if self.spatial_transform is not None: 239 | self.spatial_transform.randomize_parameters() 240 | clip = [self.spatial_transform(img) for img in clip] 241 | clip = torch.stack(clip, 0) 242 | clip = clip.permute(1, 0, 2, 3) 243 | 244 | target = self.data[index] 245 | if self.target_transform is not None: 246 | target = self.target_transform(target) 247 | 248 | return index, clip, target 249 | 250 | def __len__(self): 251 | return len(self.data) 252 | -------------------------------------------------------------------------------- /3D-ResNet/nohup.out: -------------------------------------------------------------------------------- 1 | ucf_fromkin_ours_64fr_5scale_split3 2 | Namespace(annotation_path='/data2/ptokmako/ucfTrainTestlist/ucf101_03.json', arch='resnet-18', batch_size=128, begin_epoch=1, checkpoint=10, crop_position_in_test='c', dampening=0.9, dataset='ucf101', ft_begin_index=4, gpu=['0,1,8,9'], initial_scale=1.0, label_folder='', learning_rate=0.001, lr_patience=5, lr_threshold=0.0001, manual_seed=1, mean=[114.7748, 107.7354, 99.475], mean_dataset='activitynet', model='resnet', model_depth=18, momentum=0.9, n_classes=400, n_epochs=200, n_finetune_classes=101, n_scales=5, n_threads=8, n_val_samples=3, nesterov=False, no_cuda=False, no_hflip=False, no_mean_norm=False, no_softmax_in_test=False, no_train=False, no_val=False, norm_value=1, optimizer='sgd', pretrain_path='/data2/ptokmako/results/kinetics_64fr/save_200.pth', resnet_shortcut='B', result_path='/data2/ptokmako/results/ucf_fromkin_ours_64fr_5scale_split3', resume_path='', root_path='/root/data/ActivityNet', sample_duration=64, sample_size=112, scale_in_test=1.0, scale_step=0.84081289641525, scales=[1.0, 0.84081289641525, 0.706966326778202, 0.5944264048864302, 0.4998013871982635], shot=-1, std=[38.7568578, 37.88248729, 40.02898126], std_norm=False, temp_stride=1, test=False, test_subset='val', 
train_crop='random', video_path='/scratch/ptokmako/UCF_jpeg/', weight_decay=5e-05) 3 | loading pretrained model /data2/ptokmako/results/kinetics_64fr/save_200.pth 4 | freezing at 0 5 | freezing at 1 6 | freezing at 2 7 | freezing at 3 8 | DataParallel( 9 | (module): ResNet( 10 | (conv1): Conv3d(3, 64, kernel_size=(7, 7, 7), stride=(1, 2, 2), padding=(3, 3, 3), bias=False) 11 | (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 12 | (relu): ReLU(inplace=True) 13 | (maxpool): MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1, dilation=1, ceil_mode=False) 14 | (layer1): Sequential( 15 | (0): BasicBlock( 16 | (conv1): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 17 | (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 18 | (relu): ReLU(inplace=True) 19 | (conv2): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 20 | (bn2): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 21 | ) 22 | (1): BasicBlock( 23 | (conv1): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 24 | (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 25 | (relu): ReLU(inplace=True) 26 | (conv2): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 27 | (bn2): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 28 | ) 29 | ) 30 | (layer2): Sequential( 31 | (0): BasicBlock( 32 | (conv1): Conv3d(64, 128, kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1), bias=False) 33 | (bn1): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 34 | (relu): ReLU(inplace=True) 35 | (conv2): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 36 | (bn2): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 37 | (downsample): Sequential( 38 | (0): Conv3d(64, 128, kernel_size=(1, 1, 1), stride=(2, 2, 2), bias=False) 39 | (1): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 40 | ) 41 | ) 42 | (1): BasicBlock( 43 | (conv1): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 44 | (bn1): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 45 | (relu): ReLU(inplace=True) 46 | (conv2): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 47 | (bn2): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 48 | ) 49 | ) 50 | (layer3): Sequential( 51 | (0): BasicBlock( 52 | (conv1): Conv3d(128, 256, kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1), bias=False) 53 | (bn1): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 54 | (relu): ReLU(inplace=True) 55 | (conv2): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 56 | (bn2): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 57 | (downsample): Sequential( 58 | (0): Conv3d(128, 256, kernel_size=(1, 1, 1), stride=(2, 2, 2), bias=False) 59 | (1): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 60 | ) 61 | ) 62 | (1): BasicBlock( 63 | (conv1): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 64 | (bn1): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, 
track_running_stats=True) 65 | (relu): ReLU(inplace=True) 66 | (conv2): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 67 | (bn2): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 68 | ) 69 | ) 70 | (layer4): Sequential( 71 | (0): BasicBlock( 72 | (conv1): Conv3d(256, 512, kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1), bias=False) 73 | (bn1): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 74 | (relu): ReLU(inplace=True) 75 | (conv2): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 76 | (bn2): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 77 | (downsample): Sequential( 78 | (0): Conv3d(256, 512, kernel_size=(1, 1, 1), stride=(2, 2, 2), bias=False) 79 | (1): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 80 | ) 81 | ) 82 | (1): BasicBlock( 83 | (conv1): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 84 | (bn1): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 85 | (relu): ReLU(inplace=True) 86 | (conv2): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 87 | (bn2): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 88 | ) 89 | ) 90 | (avgpool): AvgPool3d(kernel_size=(4, 4, 4), stride=1, padding=0) 91 | (fc): Linear(in_features=512, out_features=101, bias=True) 92 | ) 93 | ) 94 | dataset loading [0/9624] 95 | dataset loading [1000/9624] 96 | dataset loading [2000/9624] 97 | dataset loading [3000/9624] 98 | dataset loading [4000/9624] 99 | dataset loading [5000/9624] 100 | dataset loading [6000/9624] 101 | dataset loading [7000/9624] 102 | dataset loading [8000/9624] 103 | dataset loading [9000/9624] 104 | 9624 105 | dataset loading [0/3696] 106 | dataset loading [1000/3696] 107 | dataset loading [2000/3696] 108 | dataset loading [3000/3696] 109 | 3791 110 | run 111 | train at epoch 1 112 | THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1573049301898/work/aten/src/THC/THCCachingHostAllocator.cpp line=278 error=2 : out of memory 113 | Traceback (most recent call last): 114 | File "main.py", line 169, in 115 | train_logger, train_batch_logger, writer) 116 | File "/home/ptokmako/src/video_cluster/3D-ResNet/train.py", line 36, in train_epoch 117 | outputs = model(inputs.cuda()) 118 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__ 119 | result = self.forward(*input, **kwargs) 120 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 152, in forward 121 | outputs = self.parallel_apply(replicas, inputs, kwargs) 122 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 162, in parallel_apply 123 | return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) 124 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply 125 | output.reraise() 126 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/_utils.py", line 385, in reraise 127 | raise self.exc_type(msg) 128 | RuntimeError: Caught RuntimeError in replica 0 on device 0. 
129 | Original Traceback (most recent call last): 130 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker 131 | output = module(*input, **kwargs) 132 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__ 133 | result = self.forward(*input, **kwargs) 134 | File "/home/ptokmako/src/video_cluster/3D-ResNet/models/resnet.py", line 195, in forward 135 | x = self.conv1(x) 136 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__ 137 | result = self.forward(*input, **kwargs) 138 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 480, in forward 139 | self.padding, self.dilation, self.groups) 140 | RuntimeError: CUDA error: invalid argument 141 | 142 | -------------------------------------------------------------------------------- /LocalAggregation/src/datasets/spatial_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | from PIL import Image, ImageOps 3 | import torchvision.transforms.functional as F 4 | from torchvision.transforms.transforms import Lambda 5 | import numbers 6 | import torch 7 | import torch 8 | import numpy as np 9 | try: 10 | import accimage 11 | except ImportError: 12 | accimage = None 13 | 14 | 15 | class MultiScaleRandomCrop(object): 16 | 17 | def __init__(self, scales, size, interpolation=Image.BILINEAR): 18 | self.scales = scales 19 | self.size = size 20 | self.interpolation = interpolation 21 | 22 | def __call__(self, img): 23 | min_length = min(img.size[0], img.size[1]) 24 | crop_size = int(min_length * self.scale) 25 | 26 | image_width = img.size[0] 27 | image_height = img.size[1] 28 | 29 | x1 = self.tl_x * (image_width - crop_size) 30 | y1 = self.tl_y * (image_height - crop_size) 31 | x2 = x1 + crop_size 32 | y2 = y1 + crop_size 33 | 34 | img = img.crop((x1, y1, x2, y2)) 35 | 36 | return img.resize((self.size, self.size), self.interpolation) 37 | 38 | def randomize_parameters(self): 39 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 40 | self.tl_x = random.random() 41 | self.tl_y = random.random() 42 | 43 | 44 | class RandomGrayscale(object): 45 | """Randomly convert image to grayscale with a probability of p (default 0.1). 46 | 47 | Args: 48 | p (float): probability that image should be converted to grayscale. 49 | 50 | Returns: 51 | PIL Image: Grayscale version of the input image with probability p and unchanged 52 | with probability (1-p). 53 | - If input image is 1 channel: grayscale version is 1 channel 54 | - If input image is 3 channel: grayscale version is 3 channel with r == g == b 55 | 56 | """ 57 | 58 | def __init__(self, p=0.1): 59 | self.p = p 60 | 61 | def __call__(self, img): 62 | """ 63 | Args: 64 | img (PIL Image): Image to be converted to grayscale. 65 | 66 | Returns: 67 | PIL Image: Randomly grayscaled image. 68 | """ 69 | num_output_channels = 1 if img.mode == 'L' else 3 70 | if self.sample < self.p: 71 | return F.to_grayscale(img, num_output_channels=num_output_channels) 72 | return img 73 | 74 | def randomize_parameters(self): 75 | self.sample = random.random() 76 | 77 | def __repr__(self): 78 | return self.__class__.__name__ + '(p={0})'.format(self.p) 79 | 80 | 81 | class ColorJitter(object): 82 | """Randomly change the brightness, contrast and saturation of an image. 
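# These transforms separate drawing the random parameters (randomize_parameters) from
# applying them (__call__), so a dataset can sample one crop per clip and apply it
# identically to every frame, which is what the __getitem__ methods in this repository
# do. A small sketch with placeholder frames:
from PIL import Image

frames = [Image.new('RGB', (160, 120)) for _ in range(16)]   # stand-in for decoded frames
crop = MultiScaleRandomCrop(scales=[1.0, 0.875, 0.75], size=112)

crop.randomize_parameters()           # draw the scale and corner once for the whole clip
clip = [crop(img) for img in frames]  # every frame gets the same crop window
print(len(clip), clip[0].size)        # 16 (112, 112)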
83 | 84 | Args: 85 | brightness (float or tuple of float (min, max)): How much to jitter brightness. 86 | brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness] 87 | or the given [min, max]. Should be non negative numbers. 88 | contrast (float or tuple of float (min, max)): How much to jitter contrast. 89 | contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast] 90 | or the given [min, max]. Should be non negative numbers. 91 | saturation (float or tuple of float (min, max)): How much to jitter saturation. 92 | saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation] 93 | or the given [min, max]. Should be non negative numbers. 94 | hue (float or tuple of float (min, max)): How much to jitter hue. 95 | hue_factor is chosen uniformly from [-hue, hue] or the given [min, max]. 96 | Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5. 97 | """ 98 | def __init__(self, brightness=0, contrast=0, saturation=0, hue=0): 99 | self.brightness = self._check_input(brightness, 'brightness') 100 | self.contrast = self._check_input(contrast, 'contrast') 101 | self.saturation = self._check_input(saturation, 'saturation') 102 | self.hue = self._check_input(hue, 'hue', center=0, bound=(-0.5, 0.5), 103 | clip_first_on_zero=False) 104 | 105 | def _check_input(self, value, name, center=1, bound=(0, float('inf')), clip_first_on_zero=True): 106 | if isinstance(value, numbers.Number): 107 | if value < 0: 108 | raise ValueError("If {} is a single number, it must be non negative.".format(name)) 109 | value = [center - value, center + value] 110 | if clip_first_on_zero: 111 | value[0] = max(value[0], 0) 112 | elif isinstance(value, (tuple, list)) and len(value) == 2: 113 | if not bound[0] <= value[0] <= value[1] <= bound[1]: 114 | raise ValueError("{} values should be between {}".format(name, bound)) 115 | else: 116 | raise TypeError("{} should be a single number or a list/tuple with lenght 2.".format(name)) 117 | 118 | # if value is 0 or (1., 1.) for brightness/contrast/saturation 119 | # or (0., 0.) for hue, do nothing 120 | if value[0] == value[1] == center: 121 | value = None 122 | return value 123 | 124 | def randomize_parameters(self): 125 | transforms = [] 126 | 127 | if self.brightness is not None: 128 | brightness_factor = random.uniform(self.brightness[0], self.brightness[1]) 129 | transforms.append(Lambda(lambda img: F.adjust_brightness(img, brightness_factor))) 130 | 131 | if self.contrast is not None: 132 | contrast_factor = random.uniform(self.contrast[0], self.contrast[1]) 133 | transforms.append(Lambda(lambda img: F.adjust_contrast(img, contrast_factor))) 134 | 135 | if self.saturation is not None: 136 | saturation_factor = random.uniform(self.saturation[0], self.saturation[1]) 137 | transforms.append(Lambda(lambda img: F.adjust_saturation(img, saturation_factor))) 138 | 139 | if self.hue is not None: 140 | hue_factor = random.uniform(self.hue[0], self.hue[1]) 141 | transforms.append(Lambda(lambda img: F.adjust_hue(img, hue_factor))) 142 | 143 | random.shuffle(transforms) 144 | self.transform = Compose(transforms) 145 | 146 | def get_params(self): 147 | """Get a randomized transform to be applied on image. 148 | 149 | Arguments are same as that of __init__. 150 | 151 | Returns: 152 | Transform which randomly adjusts brightness, contrast and 153 | saturation in a random order. 154 | """ 155 | 156 | 157 | return self.transform 158 | 159 | def __call__(self, img): 160 | """ 161 | Args: 162 | img (PIL Image): Input image. 
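# _check_input above turns a scalar jitter strength into a sampling interval around the
# identity (e.g. brightness=0.4 -> [0.6, 1.4], hue=0.1 -> [-0.1, 0.1]), and
# randomize_parameters then draws one factor per clip from each interval:
jitter = ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1)
print(jitter.brightness)  # [0.6, 1.4] (clipped at 0 from below)
print(jitter.hue)         # [-0.1, 0.1]

jitter.randomize_parameters()   # builds self.transform with freshly drawn factors
# out = jitter(some_pil_image)  # applying it would jitter a frame with those fixed factors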
163 | 164 | Returns: 165 | PIL Image: Color jittered image. 166 | """ 167 | transform = self.get_params() 168 | return transform(img) 169 | 170 | def __repr__(self): 171 | format_string = self.__class__.__name__ + '(' 172 | format_string += 'brightness={0}'.format(self.brightness) 173 | format_string += ', contrast={0}'.format(self.contrast) 174 | format_string += ', saturation={0}'.format(self.saturation) 175 | format_string += ', hue={0})'.format(self.hue) 176 | return format_string 177 | 178 | 179 | class RandomHorizontalFlip(object): 180 | """Horizontally flip the given PIL.Image randomly with a probability of 0.5.""" 181 | 182 | def __call__(self, img): 183 | """ 184 | Args: 185 | img (PIL.Image): Image to be flipped. 186 | Returns: 187 | PIL.Image: Randomly flipped image. 188 | """ 189 | if self.p < 0.5: 190 | return img.transpose(Image.FLIP_LEFT_RIGHT) 191 | return img 192 | 193 | def randomize_parameters(self): 194 | self.p = random.random() 195 | 196 | 197 | class ToTensor(object): 198 | """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor. 199 | Converts a PIL.Image or numpy.ndarray (H x W x C) in the range 200 | [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]. 201 | """ 202 | 203 | def __init__(self, norm_value=255, flow_range=None): 204 | self.norm_value = norm_value 205 | self.flow_range = flow_range 206 | 207 | def __call__(self, pic): 208 | """ 209 | Args: 210 | pic (PIL.Image or numpy.ndarray): Image to be converted to tensor. 211 | Returns: 212 | Tensor: Converted image. 213 | """ 214 | if isinstance(pic, np.ndarray): 215 | # handle numpy array 216 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 217 | # backward compatibility 218 | return img.float().div(self.norm_value) 219 | 220 | if accimage is not None and isinstance(pic, accimage.Image): 221 | nppic = np.zeros( 222 | [pic.channels, pic.height, pic.width], dtype=np.float32) 223 | pic.copyto(nppic) 224 | return torch.from_numpy(nppic) 225 | 226 | # handle PIL Image 227 | if pic.mode == 'I': 228 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 229 | elif pic.mode == 'I;16': 230 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 231 | else: 232 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) 233 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 234 | if pic.mode == 'YCbCr': 235 | nchannel = 3 236 | elif pic.mode == 'I;16': 237 | nchannel = 1 238 | else: 239 | nchannel = len(pic.mode) 240 | img = img.view(pic.size[1], pic.size[0], nchannel) 241 | # put it from HWC to CHW format 242 | # yikes, this transpose takes 80% of the loading time/CPU 243 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 244 | if isinstance(img, torch.ByteTensor): 245 | img = img.float().div(self.norm_value) 246 | 247 | if self.flow_range: 248 | img = img[0:2, :, :] 249 | img = img * 40 - 20 250 | # img = img - 0.5 251 | 252 | return img 253 | 254 | def randomize_parameters(self): 255 | pass 256 | 257 | 258 | class Compose(object): 259 | """Composes several transforms together. 260 | Args: 261 | transforms (list of ``Transform`` objects): list of transforms to compose. 
262 | Example: 263 | >>> transforms.Compose([ 264 | >>> transforms.CenterCrop(10), 265 | >>> transforms.ToTensor(), 266 | >>> ]) 267 | """ 268 | 269 | def __init__(self, transforms): 270 | self.transforms = transforms 271 | 272 | def __call__(self, img): 273 | for t in self.transforms: 274 | img = t(img) 275 | return img 276 | 277 | def randomize_parameters(self): 278 | for t in self.transforms: 279 | meth = getattr(t, "randomize_parameters", None) 280 | if callable(meth): 281 | t.randomize_parameters() -------------------------------------------------------------------------------- /3D-ResNet/spatial_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import numbers 4 | import collections 5 | import numpy as np 6 | import torch 7 | from PIL import Image, ImageOps 8 | try: 9 | import accimage 10 | except ImportError: 11 | accimage = None 12 | 13 | 14 | class Compose(object): 15 | """Composes several transforms together. 16 | Args: 17 | transforms (list of ``Transform`` objects): list of transforms to compose. 18 | Example: 19 | >>> transforms.Compose([ 20 | >>> transforms.CenterCrop(10), 21 | >>> transforms.ToTensor(), 22 | >>> ]) 23 | """ 24 | 25 | def __init__(self, transforms): 26 | self.transforms = transforms 27 | 28 | def __call__(self, img): 29 | for t in self.transforms: 30 | img = t(img) 31 | return img 32 | 33 | def randomize_parameters(self): 34 | for t in self.transforms: 35 | t.randomize_parameters() 36 | 37 | 38 | class ToTensor(object): 39 | """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor. 40 | Converts a PIL.Image or numpy.ndarray (H x W x C) in the range 41 | [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]. 42 | """ 43 | 44 | def __init__(self, norm_value=255): 45 | self.norm_value = norm_value 46 | 47 | def __call__(self, pic): 48 | """ 49 | Args: 50 | pic (PIL.Image or numpy.ndarray): Image to be converted to tensor. 51 | Returns: 52 | Tensor: Converted image. 53 | """ 54 | if isinstance(pic, np.ndarray): 55 | # handle numpy array 56 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 57 | # backward compatibility 58 | return img.float().div(self.norm_value) 59 | 60 | if accimage is not None and isinstance(pic, accimage.Image): 61 | nppic = np.zeros( 62 | [pic.channels, pic.height, pic.width], dtype=np.float32) 63 | pic.copyto(nppic) 64 | return torch.from_numpy(nppic) 65 | 66 | # handle PIL Image 67 | if pic.mode == 'I': 68 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 69 | elif pic.mode == 'I;16': 70 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 71 | else: 72 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) 73 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 74 | if pic.mode == 'YCbCr': 75 | nchannel = 3 76 | elif pic.mode == 'I;16': 77 | nchannel = 1 78 | else: 79 | nchannel = len(pic.mode) 80 | img = img.view(pic.size[1], pic.size[0], nchannel) 81 | # put it from HWC to CHW format 82 | # yikes, this transpose takes 80% of the loading time/CPU 83 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 84 | if isinstance(img, torch.ByteTensor): 85 | img = img.float().div(self.norm_value) 86 | 87 | return img 88 | 89 | def randomize_parameters(self): 90 | pass 91 | 92 | 93 | class Normalize(object): 94 | """Normalize an tensor image with mean and standard deviation. 95 | Given mean: (R, G, B) and std: (R, G, B), 96 | will normalize each channel of the torch.*Tensor, i.e. 
97 |     channel = (channel - mean) / std
98 |     Args:
99 |         mean (sequence): Sequence of means for R, G, B channels respectively.
100 |         std (sequence): Sequence of standard deviations for R, G, B channels
101 |             respectively.
102 |     """
103 | 
104 |     def __init__(self, mean, std):
105 |         self.mean = mean
106 |         self.std = std
107 | 
108 |     def __call__(self, tensor):
109 |         """
110 |         Args:
111 |             tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
112 |         Returns:
113 |             Tensor: Normalized image.
114 |         """
115 |         # TODO: make efficient
116 |         for t, m, s in zip(tensor, self.mean, self.std):
117 |             t.sub_(m).div_(s)
118 |         return tensor
119 | 
120 |     def randomize_parameters(self):
121 |         pass
122 | 
123 | 
124 | class Scale(object):
125 |     """Rescale the input PIL.Image to the given size.
126 |     Args:
127 |         size (sequence or int): Desired output size. If size is a sequence like
128 |             (w, h), output size will be matched to this. If size is an int,
129 |             smaller edge of the image will be matched to this number.
130 |             i.e., if height > width, then image will be rescaled to
131 |             (size * height / width, size)
132 |         interpolation (int, optional): Desired interpolation. Default is
133 |             ``PIL.Image.BILINEAR``
134 |     """
135 | 
136 |     def __init__(self, size, interpolation=Image.BILINEAR):
137 |         assert isinstance(size,
138 |                           int) or (isinstance(size, collections.Iterable) and
139 |                                    len(size) == 2)
140 |         self.size = size
141 |         self.interpolation = interpolation
142 | 
143 |     def __call__(self, img):
144 |         """
145 |         Args:
146 |             img (PIL.Image): Image to be scaled.
147 |         Returns:
148 |             PIL.Image: Rescaled image.
149 |         """
150 |         if isinstance(self.size, int):
151 |             w, h = img.size
152 |             if (w <= h and w == self.size) or (h <= w and h == self.size):
153 |                 return img
154 |             if w < h:
155 |                 ow = self.size
156 |                 oh = int(self.size * h / w)
157 |                 return img.resize((ow, oh), self.interpolation)
158 |             else:
159 |                 oh = self.size
160 |                 ow = int(self.size * w / h)
161 |                 return img.resize((ow, oh), self.interpolation)
162 |         else:
163 |             return img.resize(self.size, self.interpolation)
164 | 
165 |     def randomize_parameters(self):
166 |         pass
167 | 
168 | 
169 | class CenterCrop(object):
170 |     """Crops the given PIL.Image at the center.
171 |     Args:
172 |         size (sequence or int): Desired output size of the crop. If size is an
173 |             int instead of sequence like (h, w), a square crop (size, size) is
174 |             made.
175 |     """
176 | 
177 |     def __init__(self, size):
178 |         if isinstance(size, numbers.Number):
179 |             self.size = (int(size), int(size))
180 |         else:
181 |             self.size = size
182 | 
183 |     def __call__(self, img):
184 |         """
185 |         Args:
186 |             img (PIL.Image): Image to be cropped.
187 |         Returns:
188 |             PIL.Image: Cropped image.
189 | """ 190 | w, h = img.size 191 | th, tw = self.size 192 | x1 = int(round((w - tw) / 2.)) 193 | y1 = int(round((h - th) / 2.)) 194 | return img.crop((x1, y1, x1 + tw, y1 + th)) 195 | 196 | def randomize_parameters(self): 197 | pass 198 | 199 | 200 | class CornerCrop(object): 201 | 202 | def __init__(self, size, crop_position=None): 203 | self.size = size 204 | if crop_position is None: 205 | self.randomize = True 206 | else: 207 | self.randomize = False 208 | self.crop_position = crop_position 209 | self.crop_positions = ['c', 'tl', 'tr', 'bl', 'br'] 210 | 211 | def __call__(self, img): 212 | image_width = img.size[0] 213 | image_height = img.size[1] 214 | 215 | if self.crop_position == 'c': 216 | th, tw = (self.size, self.size) 217 | x1 = int(round((image_width - tw) / 2.)) 218 | y1 = int(round((image_height - th) / 2.)) 219 | x2 = x1 + tw 220 | y2 = y1 + th 221 | elif self.crop_position == 'tl': 222 | x1 = 0 223 | y1 = 0 224 | x2 = self.size 225 | y2 = self.size 226 | elif self.crop_position == 'tr': 227 | x1 = image_width - self.size 228 | y1 = 0 229 | x2 = image_width 230 | y2 = self.size 231 | elif self.crop_position == 'bl': 232 | x1 = 0 233 | y1 = image_height - self.size 234 | x2 = self.size 235 | y2 = image_height 236 | elif self.crop_position == 'br': 237 | x1 = image_width - self.size 238 | y1 = image_height - self.size 239 | x2 = image_width 240 | y2 = image_height 241 | 242 | img = img.crop((x1, y1, x2, y2)) 243 | 244 | return img 245 | 246 | def randomize_parameters(self): 247 | if self.randomize: 248 | self.crop_position = self.crop_positions[random.randint( 249 | 0, 250 | len(self.crop_positions) - 1)] 251 | 252 | 253 | class RandomHorizontalFlip(object): 254 | """Horizontally flip the given PIL.Image randomly with a probability of 0.5.""" 255 | 256 | def __call__(self, img): 257 | """ 258 | Args: 259 | img (PIL.Image): Image to be flipped. 260 | Returns: 261 | PIL.Image: Randomly flipped image. 262 | """ 263 | if self.p < 0.5: 264 | return img.transpose(Image.FLIP_LEFT_RIGHT) 265 | return img 266 | 267 | def randomize_parameters(self): 268 | self.p = random.random() 269 | 270 | 271 | class MultiScaleCornerCrop(object): 272 | """Crop the given PIL.Image to randomly selected size. 273 | A crop of size is selected from scales of the original size. 274 | A position of cropping is randomly selected from 4 corners and 1 center. 275 | This crop is finally resized to given size. 
276 | Args: 277 | scales: cropping scales of the original size 278 | size: size of the smaller edge 279 | interpolation: Default: PIL.Image.BILINEAR 280 | """ 281 | 282 | def __init__(self, 283 | scales, 284 | size, 285 | interpolation=Image.BILINEAR, 286 | crop_positions=['c', 'tl', 'tr', 'bl', 'br']): 287 | self.scales = scales 288 | self.size = size 289 | self.interpolation = interpolation 290 | 291 | self.crop_positions = crop_positions 292 | 293 | def __call__(self, img): 294 | min_length = min(img.size[0], img.size[1]) 295 | crop_size = int(min_length * self.scale) 296 | 297 | image_width = img.size[0] 298 | image_height = img.size[1] 299 | 300 | if self.crop_position == 'c': 301 | center_x = image_width // 2 302 | center_y = image_height // 2 303 | box_half = crop_size // 2 304 | x1 = center_x - box_half 305 | y1 = center_y - box_half 306 | x2 = center_x + box_half 307 | y2 = center_y + box_half 308 | elif self.crop_position == 'tl': 309 | x1 = 0 310 | y1 = 0 311 | x2 = crop_size 312 | y2 = crop_size 313 | elif self.crop_position == 'tr': 314 | x1 = image_width - crop_size 315 | y1 = 0 316 | x2 = image_width 317 | y2 = crop_size 318 | elif self.crop_position == 'bl': 319 | x1 = 0 320 | y1 = image_height - crop_size 321 | x2 = crop_size 322 | y2 = image_height 323 | elif self.crop_position == 'br': 324 | x1 = image_width - crop_size 325 | y1 = image_height - crop_size 326 | x2 = image_width 327 | y2 = image_height 328 | 329 | img = img.crop((x1, y1, x2, y2)) 330 | 331 | return img.resize((self.size, self.size), self.interpolation) 332 | 333 | def randomize_parameters(self): 334 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 335 | self.crop_position = self.crop_positions[random.randint( 336 | 0, 337 | len(self.crop_positions) - 1)] 338 | 339 | 340 | class MultiScaleRandomCrop(object): 341 | 342 | def __init__(self, scales, size, interpolation=Image.BILINEAR): 343 | self.scales = scales 344 | self.size = size 345 | self.interpolation = interpolation 346 | 347 | def __call__(self, img): 348 | min_length = min(img.size[0], img.size[1]) 349 | crop_size = int(min_length * self.scale) 350 | 351 | image_width = img.size[0] 352 | image_height = img.size[1] 353 | 354 | x1 = self.tl_x * (image_width - crop_size) 355 | y1 = self.tl_y * (image_height - crop_size) 356 | x2 = x1 + crop_size 357 | y2 = y1 + crop_size 358 | 359 | img = img.crop((x1, y1, x2, y2)) 360 | 361 | return img.resize((self.size, self.size), self.interpolation) 362 | 363 | def randomize_parameters(self): 364 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 365 | self.tl_x = random.random() 366 | self.tl_y = random.random() 367 | -------------------------------------------------------------------------------- /LocalAggregation/src/objectives/localagg.py: -------------------------------------------------------------------------------- 1 | """ 2 | Local Aggregation Objective as defined in 3 | https://arxiv.org/abs/1903.12355 4 | 5 | Code is based on Tensorflow implementation: 6 | https://github.com/neuroailab/LocalAggregation 7 | """ 8 | 9 | import faiss 10 | import torch 11 | 12 | import numpy as np 13 | import time 14 | from termcolor import colored 15 | 16 | from src.utils.tensor import repeat_1d_tensor, l2_normalize 17 | 18 | DEFAULT_KMEANS_SEED = 1234 19 | 20 | class LocalAggregationLossModule(torch.nn.Module): 21 | 22 | def __init__(self, memory_bank_broadcast, cluster_label_broadcast, k=4096, t=0.07, m=0.5): 23 | super(LocalAggregationLossModule, self).__init__() 24 | self.k, self.t, 
self.m = k, t, m 25 | 26 | self.indices = None 27 | self.outputs = None 28 | self._bank = None # pass in via forward function 29 | self._cluster_labels = None 30 | self.memory_bank_broadcast = memory_bank_broadcast 31 | self.cluster_label_broadcast = cluster_label_broadcast 32 | self.data_len = memory_bank_broadcast[0].size(0) 33 | 34 | def _softmax(self, dot_prods): 35 | Z = 2876934.2 36 | return torch.exp(dot_prods / self.t) / Z 37 | 38 | def updated_new_data_memory(self, indices, outputs): 39 | outputs = l2_normalize(outputs) 40 | data_memory = torch.index_select(self._bank, 0, indices) 41 | new_data_memory = data_memory * self.m + (1 - self.m) * outputs 42 | return l2_normalize(new_data_memory, dim=1) 43 | 44 | def synchronization_check(self): 45 | for i in range(len(self.memory_bank_broadcast)): 46 | if i == 0: 47 | device = self.memory_bank_broadcast[0].device 48 | else: 49 | assert torch.equal(self.memory_bank_broadcast[0], self.memory_bank_broadcast[i].to(device)) 50 | 51 | def _get_all_dot_products(self, vec): 52 | assert len(vec.size()) == 2 53 | return torch.matmul(vec, torch.transpose(self._bank, 1, 0)) 54 | 55 | def __get_close_nei_in_back(self, each_k_idx, cluster_labels, 56 | back_nei_idxs, k): 57 | # get which neighbors are close in the background set 58 | batch_labels = cluster_labels[each_k_idx][self.indices] 59 | top_cluster_labels = cluster_labels[each_k_idx][back_nei_idxs] 60 | batch_labels = repeat_1d_tensor(batch_labels, k) 61 | 62 | curr_close_nei = torch.eq(batch_labels, top_cluster_labels) 63 | return curr_close_nei.byte() 64 | 65 | def __get_relative_prob(self, all_close_nei, back_nei_probs): 66 | relative_probs = torch.sum( 67 | torch.where( 68 | all_close_nei, 69 | back_nei_probs, 70 | torch.zeros_like(back_nei_probs), 71 | ), dim=1) 72 | # normalize probs 73 | relative_probs = relative_probs / torch.sum(back_nei_probs, dim=1, keepdim=True) 74 | return relative_probs 75 | 76 | def __get_close_nei(self, each_k_idx, cluster_labels, indices): 77 | batch_size = self.indices.size(0) 78 | dtype = torch.int32 # convert to 32-bit integer to save memory consumption 79 | batch_labels = cluster_labels[each_k_idx][indices].to(dtype) 80 | _cluster_labels = cluster_labels[each_k_idx].to(dtype).unsqueeze(0).expand(batch_size, -1) 81 | batch_labels = repeat_1d_tensor(batch_labels, _cluster_labels.size(1)) 82 | curr_close_nei = torch.eq(batch_labels, _cluster_labels) 83 | return curr_close_nei.byte() 84 | 85 | def forward(self, indices, outputs, gpu_idx): 86 | """ 87 | :param back_nei_idxs: shape (batch_size, 4096) 88 | :param all_close_nei: shape (batch_size, _size_of_dataset) in byte 89 | """ 90 | self.indices = indices.detach() 91 | self.outputs = l2_normalize(outputs, dim=1) 92 | self._bank = self.memory_bank_broadcast[gpu_idx] # select a mem bank based on gpu device 93 | self._cluster_labels = self.cluster_label_broadcast[gpu_idx] 94 | 95 | k = self.k 96 | 97 | all_dps = self._get_all_dot_products(self.outputs) 98 | back_nei_dps, back_nei_idxs = torch.topk(all_dps, k=k, sorted=False, dim=1) 99 | back_nei_probs = self._softmax(back_nei_dps) 100 | 101 | all_close_nei_in_back = None 102 | no_kmeans = self._cluster_labels.size(0) 103 | with torch.no_grad(): 104 | for each_k_idx in range(no_kmeans): 105 | curr_close_nei = self.__get_close_nei_in_back( 106 | each_k_idx, self._cluster_labels, back_nei_idxs, k) 107 | 108 | if all_close_nei_in_back is None: 109 | all_close_nei_in_back = curr_close_nei 110 | else: 111 | # assuming all_close_nei and curr_close_nei are byte tensors 
112 | all_close_nei_in_back = all_close_nei_in_back | curr_close_nei 113 | 114 | relative_probs = self.__get_relative_prob(all_close_nei_in_back, back_nei_probs) 115 | loss = -torch.mean(torch.log(relative_probs + 1e-7)).unsqueeze(0) 116 | 117 | # compute new data memory 118 | new_data_memory = self.updated_new_data_memory(self.indices, self.outputs) 119 | 120 | return loss, new_data_memory 121 | 122 | 123 | class MemoryBank(object): 124 | """For efficiently computing the background vectors.""" 125 | 126 | def __init__(self, size, dim, device_ids, bank=None): 127 | self.size = size 128 | self.dim = dim 129 | self.device = torch.device("cuda:{}".format(device_ids[0])) 130 | if bank is not None: 131 | self._bank = bank 132 | else: 133 | self._bank = self._create() 134 | self.bank_broadcast = torch.cuda.comm.broadcast(self._bank, device_ids) 135 | self.device = [_bank.device for _bank in self.bank_broadcast] 136 | self.num_device = len(self.device) 137 | del self._bank 138 | # print(colored('Warning: using in-place scatter in memory bank update function', 'red')) 139 | 140 | def _create(self): 141 | # initialize random weights 142 | mb_init = torch.rand(self.size, self.dim, device=self.device) 143 | std_dev = 1. / np.sqrt(self.dim / 3) 144 | mb_init = mb_init * (2 * std_dev) - std_dev 145 | # L2 normalise so that the norm is 1 146 | mb_init = l2_normalize(mb_init, dim=1) 147 | return mb_init.detach() # detach so its not trainable 148 | 149 | def as_tensor(self): 150 | return self.bank_broadcast[0] 151 | 152 | def at_idxs(self, idxs): 153 | return torch.index_select(self.bank_broadcast[0], 0, idxs) 154 | 155 | def get_all_dot_products(self, vec): 156 | # [bs, dim] 157 | assert len(vec.size()) == 2 158 | return torch.matmul(vec, torch.transpose(self.bank_broadcast[0], 1, 0)) 159 | 160 | def get_dot_products(self, vec, idxs): 161 | vec_shape = list(vec.size()) # [bs, dim] 162 | idxs_shape = list(idxs.size()) # [bs, ...] 
163 | 164 | assert len(idxs_shape) in [1, 2] 165 | assert len(vec_shape) == 2 166 | assert vec_shape[0] == idxs_shape[0] 167 | 168 | if len(idxs_shape) == 1: 169 | with torch.no_grad(): 170 | memory_vecs = torch.index_select(self._bank, 0, idxs) 171 | memory_vecs_shape = list(memory_vecs.size()) 172 | assert memory_vecs_shape[0] == idxs_shape[0] 173 | else: # len(idxs_shape) == 2 174 | with torch.no_grad(): 175 | batch_size, k_dim = idxs.size(0), idxs.size(1) 176 | flat_idxs = idxs.view(-1) 177 | memory_vecs = torch.index_select(self._bank, 0, flat_idxs) 178 | memory_vecs = memory_vecs.view(batch_size, k_dim, self._bank.size(1)) 179 | memory_vecs_shape = list(memory_vecs.size()) 180 | 181 | vec_shape[1:1] = [1] * (len(idxs_shape) - 1) 182 | vec = vec.view(vec_shape) # [bs, 1, dim] 183 | 184 | prods = memory_vecs * vec 185 | assert list(prods.size()) == memory_vecs_shape 186 | 187 | return torch.sum(prods, dim=-1) 188 | 189 | def update(self, indices, data_memory): 190 | # in lieu of scatter-update operation 191 | data_dim = data_memory.size(1) 192 | data_memory = data_memory.detach() 193 | indices = indices.unsqueeze(1).repeat(1, data_dim) 194 | 195 | for i in range(self.num_device): 196 | if i > 0: 197 | # start.record() 198 | device = self.device[i] 199 | indices = indices.to(device) 200 | data_memory = data_memory.to(device) 201 | self.bank_broadcast[i] = self.bank_broadcast[i].scatter_(0, indices, data_memory) 202 | 203 | def synchronization_check(self): 204 | for i in range(len(self.bank_broadcast)): 205 | if i == 0: 206 | device = self.bank_broadcast[0].device 207 | else: 208 | assert torch.equal(self.bank_broadcast[0], self.bank_broadcast[i].to(device)) 209 | 210 | 211 | def run_kmeans(x, nmb_clusters, verbose=False, 212 | seed=DEFAULT_KMEANS_SEED, gpu_device=0): 213 | """ 214 | Runs kmeans on 1 GPU. 215 | 216 | Args: 217 | ----- 218 | x: data 219 | nmb_clusters (int): number of clusters 220 | 221 | Returns: 222 | -------- 223 | list: ids of data in each cluster 224 | """ 225 | n_data, d = x.shape 226 | 227 | # faiss implementation of k-means 228 | clus = faiss.Clustering(d, nmb_clusters) 229 | clus.niter = 20 230 | clus.max_points_per_centroid = 10000000 231 | clus.seed = seed 232 | res = faiss.StandardGpuResources() 233 | flat_config = faiss.GpuIndexFlatConfig() 234 | flat_config.useFloat16 = False 235 | flat_config.device = gpu_device 236 | 237 | index = faiss.GpuIndexFlatL2(res, d, flat_config) 238 | 239 | # perform the training 240 | clus.train(x, index) 241 | _, I = index.search(x, 1) 242 | losses = faiss.vector_to_array(clus.obj) 243 | if verbose: 244 | print('k-means loss evolution: {0}'.format(losses)) 245 | 246 | return [int(n[0]) for n in I], losses[-1] 247 | 248 | 249 | def run_kmeans_multi_gpu(x, nmb_clusters, verbose=False, 250 | seed=DEFAULT_KMEANS_SEED, gpu_device=0): 251 | 252 | """ 253 | Runs kmeans on multi GPUs. 
254 | 255 | Args: 256 | ----- 257 | x: data 258 | nmb_clusters (int): number of clusters 259 | 260 | Returns: 261 | -------- 262 | list: ids of data in each cluster 263 | """ 264 | n_data, d = x.shape 265 | ngpus = len(gpu_device) 266 | assert ngpus > 1 267 | 268 | # faiss implementation of k-means 269 | clus = faiss.Clustering(d, nmb_clusters) 270 | clus.niter = 20 271 | clus.max_points_per_centroid = 10000000 272 | clus.seed = seed 273 | res = [faiss.StandardGpuResources() for i in range(ngpus)] 274 | flat_config = [] 275 | for i in gpu_device: 276 | cfg = faiss.GpuIndexFlatConfig() 277 | cfg.useFloat16 = False 278 | cfg.device = i 279 | flat_config.append(cfg) 280 | 281 | indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i]) for i in range(ngpus)] 282 | index = faiss.IndexReplicas() 283 | for sub_index in indexes: 284 | index.addIndex(sub_index) 285 | 286 | # perform the training 287 | clus.train(x, index) 288 | _, I = index.search(x, 1) 289 | losses = faiss.vector_to_array(clus.obj) 290 | if verbose: 291 | print('k-means loss evolution: {0}'.format(losses)) 292 | 293 | return [int(n[0]) for n in I], losses[-1] 294 | 295 | 296 | class Kmeans(object): 297 | """ 298 | Train different k-means clusterings with different 299 | random seeds. These will be used to compute close neighbors 300 | for a given encoding. 301 | """ 302 | def __init__(self, k, memory_bank, gpu_device=0, fvs=None): 303 | super().__init__() 304 | self.k = k 305 | self.memory_bank = memory_bank 306 | self.gpu_device = gpu_device 307 | self.fvs = fvs 308 | 309 | def compute_clusters(self): 310 | """ 311 | Performs many k-means clustering. 312 | 313 | Args: 314 | x_data (np.array N * dim): data to cluster 315 | """ 316 | data = self.memory_bank.as_tensor() 317 | if self.fvs is not None and len(self.fvs) != 0: 318 | data = torch.cat((data.cpu().float(), self.fvs.float()), 1) 319 | data_npy = data.detach().numpy() 320 | clusters = self._compute_clusters(data_npy) 321 | return clusters 322 | 323 | def _compute_clusters(self, data): 324 | pred_labels = [] 325 | for k_idx, each_k in enumerate(self.k): 326 | # cluster the data 327 | 328 | if len(self.gpu_device) == 1: # single gpu 329 | I, _ = run_kmeans(data, each_k, seed=k_idx + DEFAULT_KMEANS_SEED, 330 | gpu_device=self.gpu_device[0]) 331 | else: # multigpu 332 | I, _ = run_kmeans_multi_gpu(data, each_k, seed=k_idx + DEFAULT_KMEANS_SEED, 333 | gpu_device=self.gpu_device) 334 | 335 | clust_labels = np.asarray(I) 336 | pred_labels.append(clust_labels) 337 | pred_labels = np.stack(pred_labels, axis=0) 338 | pred_labels = torch.from_numpy(pred_labels).long() 339 | 340 | return pred_labels 341 | --------------------------------------------------------------------------------
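The three classes defined in localagg.py above work in combination: MemoryBank stores one running, L2-normalised embedding per video (replicated per GPU), Kmeans produces several clusterings of the bank (optionally concatenated with precomputed IDT Fisher vectors) that define the close-neighbour sets, and LocalAggregationLossModule pulls each new embedding towards its close neighbours among its top-k background neighbours, returning the momentum-updated embeddings to write back into the bank. The sketch below shows one plausible way to wire them together on a single GPU; it is illustrative only, and the encoder, optimizer, loader, the Fisher-vector matrix fvs, and all sizes (dataset size, embedding and FV dimensions, number and size of the clusterings) are placeholders rather than values taken from this repository.

import torch

from src.objectives.localagg import LocalAggregationLossModule, MemoryBank, Kmeans

device_ids = [0]                                  # single GPU for this sketch
n_videos, feat_dim, fv_dim = 100000, 128, 256     # placeholder sizes

# Memory bank: one L2-normalised embedding per video, replicated per device.
memory_bank = MemoryBank(n_videos, feat_dim, device_ids)

# Placeholder for precomputed IDT Fisher vectors (see src/idt/); they are
# concatenated to the bank embeddings inside Kmeans.compute_clusters().
fvs = torch.zeros(n_videos, fv_dim)

# Several k-means clusterings with different seeds (placeholder: 6 x 1000 clusters).
kmeans = Kmeans(k=[1000] * 6, memory_bank=memory_bank, gpu_device=device_ids, fvs=fvs)
cluster_labels = kmeans.compute_clusters()                        # shape (6, n_videos)
cluster_label_broadcast = torch.cuda.comm.broadcast(cluster_labels, device_ids)

criterion = LocalAggregationLossModule(memory_bank.bank_broadcast,
                                       cluster_label_broadcast,
                                       k=4096, t=0.07, m=0.5)

# Placeholder encoder, optimizer and data; in the repo these would come from
# src/models/resnet3d.py and src/datasets/kinetics.py instead.
encoder = torch.nn.Sequential(torch.nn.Flatten(),
                              torch.nn.Linear(3 * 16 * 32 * 32, feat_dim)).cuda()
optimizer = torch.optim.SGD(encoder.parameters(), lr=0.03)
loader = [(torch.randn(8, 3, 16, 32, 32), torch.randint(0, n_videos, (8,)))]

# Schematic training step over (clip, dataset-index) batches.
for clips, indices in loader:
    embeddings = encoder(clips.cuda())                            # (batch, feat_dim)
    loss, new_data_memory = criterion(indices.cuda(), embeddings, gpu_idx=0)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Write the momentum-updated embeddings back into every bank replica.
    memory_bank.update(indices.cuda(), new_data_memory)

Writing new_data_memory back after every step is what keeps the background-neighbour probabilities computed in forward consistent with the encoder as it trains, and the clusterings would typically be recomputed periodically so that the close-neighbour sets track the evolving embeddings. Note that the sketch passes an fvs matrix: as compute_clusters is written above, the fvs branch also moves the bank to the CPU before the .numpy() call, whereas with fvs=None the bank tensor would have to be moved off the GPU by the caller.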