├── LocalAggregation
├── src
│   ├── __init__.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── setup.pyc
│   │   ├── utils.pyc
│   │   ├── __init__.pyc
│   │   ├── constants.py
│   │   ├── tensor.py
│   │   ├── utils.py
│   │   └── setup.py
│   ├── agents
│   │   ├── __init__.py
│   │   ├── agents.pyc
│   │   └── __init__.pyc
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── target_transforms.py
│   │   ├── temporal_transforms.py
│   │   ├── kinetics.py
│   │   └── spatial_transforms.py
│   ├── models
│   │   ├── __init__.py
│   │   └── resnet3d.py
│   ├── objectives
│   │   ├── __init__.py
│   │   ├── __init__.pyc
│   │   ├── localagg.pyc
│   │   ├── instance.py
│   │   └── localagg.py
│   ├── __init__.pyc
│   └── idt
│   │   ├── idt.sh
│   │   ├── sketching.py
│   │   ├── compute_fv_models.py
│   │   ├── cluster_fv.py
│   │   └── extract_idt.py
├── init_env.sh
├── scripts
│   ├── instance.py
│   └── localagg.py
└── config
│   ├── kinetics_ir.json
│   ├── kinetics_la_tune.json
│   └── kinetics_la.json
├── 3D-ResNet
├── models
│   ├── __pycache__
│   │   └── resnet.cpython-36.pyc
│   └── resnet.py
├── target_transforms.py
├── mean.py
├── utils
│   ├── n_frames_ucf101_hmdb51.py
│   ├── video_jpg.py
│   ├── n_frames_kinetics.py
│   ├── fps.py
│   ├── video_jpg_kinetics.py
│   ├── video_jpg_ucf101_hmdb51.py
│   ├── kinetics_json.py
│   ├── hmdb51_json.py
│   └── ucf101_json.py
├── results
│   └── opts.json
├── utils.py
├── train.py
├── test.py
├── temporal_transforms.py
├── validation.py
├── dataset.py
├── model.py
├── main.py
├── datasets
│   ├── kinetics.py
│   ├── hmdb51.py
│   └── ucf101.py
├── opts.py
├── nohup.out
└── spatial_transforms.py
├── LICENSE
└── README.md
/LocalAggregation/src/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/LocalAggregation/src/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/LocalAggregation/src/agents/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/LocalAggregation/src/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/LocalAggregation/src/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/LocalAggregation/src/objectives/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/LocalAggregation/init_env.sh:
--------------------------------------------------------------------------------
1 | export PYTHONPATH=${PYTHONPATH}:$(pwd)
--------------------------------------------------------------------------------
/LocalAggregation/src/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/LocalAggregation/src/__init__.pyc
--------------------------------------------------------------------------------
/LocalAggregation/src/utils/setup.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/LocalAggregation/src/utils/setup.pyc
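Note on environment setup: init_env.sh (listed above) only appends the current working directory to PYTHONPATH so that the `src.*` imports used by the LocalAggregation agents and scripts resolve. A minimal setup sketch, assuming the commands are run from the LocalAggregation directory (the shell session below is an illustration, not part of the repository):

    cd LocalAggregation
    source init_env.sh   # adds $(pwd) to PYTHONPATH; run the training scripts from this directory afterwards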
-------------------------------------------------------------------------------- /LocalAggregation/src/utils/utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/LocalAggregation/src/utils/utils.pyc -------------------------------------------------------------------------------- /LocalAggregation/src/agents/agents.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/LocalAggregation/src/agents/agents.pyc -------------------------------------------------------------------------------- /LocalAggregation/src/utils/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/LocalAggregation/src/utils/__init__.pyc -------------------------------------------------------------------------------- /LocalAggregation/src/agents/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/LocalAggregation/src/agents/__init__.pyc -------------------------------------------------------------------------------- /LocalAggregation/src/objectives/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/LocalAggregation/src/objectives/__init__.pyc -------------------------------------------------------------------------------- /LocalAggregation/src/objectives/localagg.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/LocalAggregation/src/objectives/localagg.pyc -------------------------------------------------------------------------------- /3D-ResNet/models/__pycache__/resnet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pvtokmakov/video_cluster/HEAD/3D-ResNet/models/__pycache__/resnet.cpython-36.pyc -------------------------------------------------------------------------------- /LocalAggregation/src/datasets/target_transforms.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class ClassLabel(object): 4 | 5 | def __call__(self, target): 6 | return target['label'] 7 | -------------------------------------------------------------------------------- /LocalAggregation/src/utils/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | ROOT_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), '../../')) 3 | SRC_DIR = os.path.join(ROOT_DIR, 'src') 4 | SCRIPTS_DIR = os.path.join(ROOT_DIR, 'scripts') 5 | EVAL_DIR = os.path.join(SRC_DIR, 'evaluation') 6 | -------------------------------------------------------------------------------- /LocalAggregation/src/utils/tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def l2_normalize(x, dim=1): 5 | return x / torch.sqrt(torch.sum(x**2, dim=dim).unsqueeze(dim)) 6 | 7 | 8 | def repeat_1d_tensor(t, num_reps): 9 | return t.unsqueeze(1).expand(-1, num_reps) 10 | 11 | -------------------------------------------------------------------------------- /3D-ResNet/target_transforms.py: 
-------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import torch 4 | 5 | 6 | class Compose(object): 7 | 8 | def __init__(self, transforms): 9 | self.transforms = transforms 10 | 11 | def __call__(self, target): 12 | dst = [] 13 | for t in self.transforms: 14 | dst.append(t(target)) 15 | return dst 16 | 17 | 18 | class ClassLabel(object): 19 | 20 | def __call__(self, target): 21 | return target['label'] 22 | 23 | 24 | class VideoID(object): 25 | 26 | def __call__(self, target): 27 | return target['video_id'] 28 | 29 | -------------------------------------------------------------------------------- /3D-ResNet/mean.py: -------------------------------------------------------------------------------- 1 | def get_mean(norm_value=255, dataset='activitynet'): 2 | assert dataset in ['activitynet', 'kinetics'] 3 | 4 | if dataset == 'activitynet': 5 | return [ 6 | 114.7748 / norm_value, 107.7354 / norm_value, 99.4750 / norm_value 7 | ] 8 | elif dataset == 'kinetics': 9 | # Kinetics (10 videos for each class) 10 | return [ 11 | 110.63666788 / norm_value, 103.16065604 / norm_value, 12 | 96.29023126 / norm_value 13 | ] 14 | 15 | 16 | def get_std(norm_value=255): 17 | # Kinetics (10 videos for each class) 18 | return [ 19 | 38.7568578 / norm_value, 37.88248729 / norm_value, 20 | 40.02898126 / norm_value 21 | ] 22 | -------------------------------------------------------------------------------- /LocalAggregation/src/idt/idt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PATH=/home/ptokmako/src/opencv/lib:/home/ptokmako/miniconda2/bin:/home/ptokmako/src/ffmpeg/lib:/home/ptokmako/src/ffmpeg/include:/home/ptokmako/miniconda2/envs/idt/lib:/home/ptokmako/miniconda2/envs/idt/include:/opt/cuda/9.1/bin:/home/ptokmako/torch/install/bin:/home/ptokmako/miniconda2/envs/idt/bin:/home/ptokmako/miniconda2/condabin:/opt/gcc49/bin:/opt/openmpi/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/opt/ganglia/bin:/opt/ganglia/sbin:/opt/rocks/bin:/opt/rocks/sbin:/home/ptokmako/bin 4 | export LD_LIBRARY_PATH=/home/ptokmako/src/opencv/lib/:/home/ptokmako/src/ffmpeg/lib/:/home/ptokmako/src/ffmpeg/include/:/home/ptokmako/miniconda2/envs/idt/lib:/home/ptokmako/miniconda2/envs/idt/include:/opt/cuda/9.1/lib64:/opt/cuda/9.1/lib:/home/ptokmako/torch/install/lib:/opt/openmpi/lib:/home/ptokmako/local/readline-8.0/lib 5 | 6 | source ~/miniconda2/etc/profile.d/conda.sh 7 | 8 | conda activate idt 9 | 10 | /home/ptokmako/src/improved_trajectory_release/release/DenseTrackStab $1 -H $2 | gzip > $4/$3.gz 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Pavel Tokmakov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /3D-ResNet/utils/n_frames_ucf101_hmdb51.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def class_process(dir_path, class_name): 7 | class_path = os.path.join(dir_path, class_name) 8 | if not os.path.isdir(class_path): 9 | return 10 | 11 | for file_name in os.listdir(class_path): 12 | video_dir_path = os.path.join(class_path, file_name) 13 | image_indices = [] 14 | for image_file_name in os.listdir(video_dir_path): 15 | if 'image' not in image_file_name: 16 | continue 17 | image_indices.append(int(image_file_name[6:11])) 18 | 19 | if len(image_indices) == 0: 20 | print('no image files', video_dir_path) 21 | n_frames = 0 22 | else: 23 | image_indices.sort(reverse=True) 24 | n_frames = image_indices[0] 25 | print(video_dir_path, n_frames) 26 | with open(os.path.join(video_dir_path, 'n_frames'), 'w') as dst_file: 27 | dst_file.write(str(n_frames)) 28 | 29 | 30 | if __name__=="__main__": 31 | dir_path = sys.argv[1] 32 | for class_name in os.listdir(dir_path): 33 | class_process(dir_path, class_name) 34 | -------------------------------------------------------------------------------- /3D-ResNet/utils/video_jpg.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | 7 | if __name__=="__main__": 8 | dir_path = sys.argv[1] 9 | dst_dir_path = sys.argv[2] 10 | 11 | for file_name in os.listdir(dir_path): 12 | if '.mp4' not in file_name: 13 | continue 14 | name, ext = os.path.splitext(file_name) 15 | dst_directory_path = os.path.join(dst_dir_path, name) 16 | 17 | video_file_path = os.path.join(dir_path, file_name) 18 | try: 19 | if os.path.exists(dst_directory_path): 20 | if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')): 21 | subprocess.call('rm -r {}'.format(dst_directory_path), shell=True) 22 | print('remove {}'.format(dst_directory_path)) 23 | os.mkdir(dst_directory_path) 24 | else: 25 | continue 26 | else: 27 | os.mkdir(dst_directory_path) 28 | except: 29 | print(dst_directory_path) 30 | continue 31 | cmd = 'ffmpeg -i {} -vf scale=-1:360 {}/image_%05d.jpg'.format(video_file_path, dst_directory_path) 32 | print(cmd) 33 | subprocess.call(cmd, shell=True) 34 | print('\n') 35 | -------------------------------------------------------------------------------- /3D-ResNet/utils/n_frames_kinetics.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def class_process(dir_path, class_name): 7 | class_path = os.path.join(dir_path, class_name) 8 | if not os.path.isdir(class_path): 9 | print(class_path) 10 | return 11 | 12 | print(class_name) 13 | 14 | for file_name in os.listdir(class_path): 15 | video_dir_path = 
os.path.join(class_path, file_name) 16 | image_indices = [] 17 | for image_file_name in os.listdir(video_dir_path): 18 | if 'frame' not in image_file_name or "n_frames" in image_file_name: 19 | continue 20 | frame_ind = int(image_file_name.split("frame")[1].split(".")[0]) 21 | image_indices.append(frame_ind) 22 | 23 | if len(image_indices) == 0: 24 | print('no image files', video_dir_path) 25 | n_frames = 0 26 | else: 27 | image_indices.sort(reverse=True) 28 | n_frames = image_indices[0] 29 | print(video_dir_path, n_frames) 30 | with open(os.path.join(video_dir_path, 'n_frames'), 'w') as dst_file: 31 | dst_file.write(str(n_frames)) 32 | 33 | 34 | if __name__=="__main__": 35 | dir_path = sys.argv[1] 36 | for class_name in os.listdir(dir_path): 37 | class_process(dir_path, class_name) 38 | 39 | class_name = 'test' 40 | class_process(dir_path, class_name) 41 | -------------------------------------------------------------------------------- /3D-ResNet/results/opts.json: -------------------------------------------------------------------------------- 1 | {"root_path": "/root/data/ActivityNet", "video_path": "/scratch/ptokmako/hmdb_jpeg/", "annotation_path": "/scratch/ptokmako/testTrainMulti_7030_splits/hmdb51_1.json", "result_path": "results", "dataset": "hmdb51", "n_classes": 101, "n_finetune_classes": 400, "sample_size": 112, "sample_duration": 16, "temp_stride": 1, "initial_scale": 1.0, "n_scales": 5, "scale_step": 0.84081289641525, "train_crop": "corner", "learning_rate": 0.1, "momentum": 0.9, "dampening": 0.9, "weight_decay": 0.001, "mean_dataset": "activitynet", "no_mean_norm": false, "std_norm": false, "nesterov": false, "optimizer": "sgd", "lr_patience": 10, "batch_size": 128, "n_epochs": 200, "begin_epoch": 1, "lr_threshold": 0.0001, "n_val_samples": 3, "resume_path": "/data2/ptokmako/results/hmdb_debug2/save_20.pth", "label_folder": "", "pretrain_path": "", "ft_begin_index": 0, "no_train": true, "no_val": true, "test": true, "test_subset": "val", "scale_in_test": 1.0, "crop_position_in_test": "c", "no_softmax_in_test": false, "no_cuda": false, "n_threads": 16, "checkpoint": 10, "no_hflip": false, "norm_value": 1, "model": "resnet", "model_depth": 18, "resnet_shortcut": "B", "manual_seed": 1, "gpu": ["2"], "shot": -1, "scales": [1.0, 0.84081289641525, 0.706966326778202, 0.5944264048864302, 0.4998013871982635], "arch": "resnet-18", "mean": [114.7748, 107.7354, 99.475], "std": [38.7568578, 37.88248729, 40.02898126]} -------------------------------------------------------------------------------- /LocalAggregation/scripts/instance.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from src.agents.agents import * 3 | from src.utils.setup import process_config 4 | from src.utils.utils import load_json 5 | import os 6 | 7 | 8 | def run(config_path, pre_checkpoint_dir): 9 | config = process_config(config_path) 10 | AgentClass = globals()[config.agent] 11 | agent = AgentClass(config) 12 | 13 | if pre_checkpoint_dir is not None: 14 | # this will load both the weights and memory bank 15 | agent.load_checkpoint('checkpoint.pth.tar', pre_checkpoint_dir, load_memory_bank=True, 16 | load_model=True, load_optim=True, load_epoch=True) 17 | 18 | try: 19 | agent.run() 20 | agent.finalise() 21 | except KeyboardInterrupt: 22 | pass 23 | 24 | 25 | if __name__ == "__main__": 26 | import argparse 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('config', type=str, default='path to config file') 29 | args = parser.parse_args() 
30 | 31 | config_json = load_json(args.config) 32 | 33 | pre_checkpoint_dir = None 34 | if config_json['pretrained_exp_dir'] is not None: 35 | print("NOTE: found pretrained model...continue training") 36 | pre_checkpoint_dir = os.path.join(config_json['pretrained_exp_dir'], 'checkpoints') 37 | 38 | run(args.config, pre_checkpoint_dir) 39 | 40 | -------------------------------------------------------------------------------- /3D-ResNet/utils/fps.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | 7 | if __name__=="__main__": 8 | dir_path = sys.argv[1] 9 | dst_dir_path = sys.argv[2] 10 | 11 | for file_name in os.listdir(dir_path): 12 | if '.mp4' not in file_name: 13 | continue 14 | name, ext = os.path.splitext(file_name) 15 | dst_directory_path = os.path.join(dst_dir_path, name) 16 | 17 | video_file_path = os.path.join(dir_path, file_name) 18 | p = subprocess.Popen('ffprobe {}'.format(video_file_path), 19 | shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 20 | _, res = p.communicate() 21 | res = res.decode('utf-8') 22 | 23 | duration_index = res.find('Duration:') 24 | duration_str = res[(duration_index + 10):(duration_index + 21)] 25 | hour = float(duration_str[0:2]) 26 | minute = float(duration_str[3:5]) 27 | sec = float(duration_str[6:10]) 28 | total_sec = hour * 3600 + minute * 60 + sec 29 | 30 | n_frames = len(os.listdir(dst_directory_path)) 31 | if os.path.exists(os.path.join(dst_directory_path, 'fps')): 32 | n_frames -= 1 33 | 34 | fps = round(n_frames / total_sec, 2) 35 | 36 | print(video_file_path, os.path.exists(video_file_path), fps) 37 | with open(os.path.join(dst_directory_path, 'fps'), 'w') as fps_file: 38 | fps_file.write('{}\n'.format(fps)) 39 | -------------------------------------------------------------------------------- /3D-ResNet/utils/video_jpg_kinetics.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def class_process(dir_path, dst_dir_path, class_name): 7 | class_path = os.path.join(dir_path, class_name) 8 | if not os.path.isdir(class_path): 9 | return 10 | 11 | dst_class_path = os.path.join(dst_dir_path, class_name) 12 | if not os.path.exists(dst_class_path): 13 | os.mkdir(dst_class_path) 14 | 15 | for file_name in os.listdir(class_path): 16 | if '.mp4.' not in file_name: 17 | continue 18 | name, ext = os.path.splitext(file_name) 19 | if "." 
in name: 20 | name, ext = os.path.splitext(name) 21 | 22 | dst_directory_path = os.path.join(dst_class_path, name) 23 | 24 | video_file_path = os.path.join(class_path, file_name) 25 | try: 26 | if os.path.exists(dst_directory_path): 27 | continue 28 | else: 29 | os.mkdir(dst_directory_path) 30 | except: 31 | print(dst_directory_path) 32 | continue 33 | cmd = 'ffmpeg -i \"{}\" -vf scale=-1:240 \"{}/image_%05d.jpg\"'.format(video_file_path, dst_directory_path) 34 | print(cmd) 35 | subprocess.call(cmd, shell=True) 36 | print('\n') 37 | 38 | if __name__=="__main__": 39 | dir_path = sys.argv[1] 40 | dst_dir_path = sys.argv[2] 41 | 42 | for class_name in os.listdir(dir_path): 43 | class_process(dir_path, dst_dir_path, class_name) 44 | 45 | class_name = 'test' 46 | class_process(dir_path, dst_dir_path, class_name) 47 | -------------------------------------------------------------------------------- /LocalAggregation/config/kinetics_ir.json: -------------------------------------------------------------------------------- 1 | { 2 | "exp_base": "/data/ptokmako/local_agg/experiments_data", 3 | "debug": false, 4 | "exp_name": "kinetics_ir", 5 | "exp_id": "res18_IR", 6 | "agent": "KineticsAgent", 7 | "cuda": true, 8 | "gpu_device": [0], 9 | "seed": 1337, 10 | "data_loader_workers": 16, 11 | "data_params": { 12 | "image_size": 112, 13 | "sample_duration": 16, 14 | "load_fvs": false, 15 | "fv_path": "/data/ptokmako/kinetics_fv_nobox_nosketch/", 16 | "dataset_path": "/ssd1/ptokmako/kinetics_jpg/", 17 | "annotation_path": "splits.json" 18 | }, 19 | "model_params": { 20 | "embedding_dim": 128, 21 | "hidden_dim": 256, 22 | "n_filters": 64, 23 | "out_dim": 128, 24 | "resnet": true, 25 | "resnet_version": "resnet18" 26 | }, 27 | "loss_params": { 28 | "k": 4096, 29 | "t": 0.07, 30 | "m": 0.5, 31 | "kmeans_k": 30000, 32 | "n_kmeans": 10, 33 | "kmeans_freq": null, 34 | "loss": "InstanceDiscriminationLossModule" 35 | }, 36 | "optim_params": { 37 | "batch_size": 128, 38 | "learning_rate": 0.03, 39 | "lr_decay_schedule": [160, 190], 40 | "lr_decay_rate": 0.1, 41 | "momentum": 0.9, 42 | "weight_decay": 1e-4, 43 | "validate_freq": 1 44 | }, 45 | "num_epochs": 40, 46 | "validate": true, 47 | "copy_checkpoint_freq": null, 48 | "pretrained_exp_dir": null 49 | } 50 | -------------------------------------------------------------------------------- /LocalAggregation/config/kinetics_la_tune.json: -------------------------------------------------------------------------------- 1 | { 2 | "exp_base": "/data/ptokmako/local_agg/experiments_data", 3 | "debug":false, 4 | "exp_name": "kinetics_la_tune", 5 | 6 | "exp_id": "res18_LA", 7 | "agent": "KineticsAgent", 8 | "cuda": true, 9 | "gpu_device": [0], 10 | "faiss_gpu_device": null, 11 | "seed": 1337, 12 | "data_loader_workers": 32, 13 | "data_params": { 14 | "image_size": 112, 15 | "sample_duration": 16, 16 | "load_fvs": true, 17 | "fv_path": "/data/ptokmako/kinetics_fv/", 18 | "dataset_path": "/ssd1/ptokmako/kinetics_jpg/", 19 | "annotation_path": "splits.json" 20 | }, 21 | "model_params": { 22 | "out_dim": 128, 23 | "resnet_version": "resnet18" 24 | }, 25 | "loss_params": { 26 | "k": 4096, 27 | "t": 0.07, 28 | "m": 0.5, 29 | "kmeans_k": 6000, 30 | "n_kmeans": 3, 31 | "kmeans_freq": null, 32 | "loss": "LocalAggregationLossModule" 33 | }, 34 | "optim_params": { 35 | "batch_size": 256, 36 | "learning_rate": 0.03, 37 | "lr_decay_schedule": [10, 20], 38 | "lr_decay_rate": 0.1, 39 | "momentum": 0.9, 40 | "weight_decay": 1e-4, 41 | "validate_freq": 1 42 | }, 43 | "num_epochs": 25, 44 | 
"validate": false, 45 | "copy_checkpoint_freq": null, 46 | "instance_exp_dir": "/data/ptokmako/local_agg/experiments_data/experiments/kinetics_la/res18_LA", 47 | "pretrained_exp_dir": null, 48 | "cluster_checkpoint_dir": null 49 | } 50 | -------------------------------------------------------------------------------- /LocalAggregation/config/kinetics_la.json: -------------------------------------------------------------------------------- 1 | { 2 | "exp_base": "/data/ptokmako/local_agg/experiments_data", 3 | "debug":false, 4 | "exp_name": "kinetics_la_idt", 5 | 6 | "exp_id": "res18_LA", 7 | "agent": "KineticsAgent", 8 | "cuda": true, 9 | "gpu_device": [0], 10 | "faiss_gpu_device": null, 11 | "seed": 1337, 12 | "data_loader_workers": 32, 13 | "data_params": { 14 | "image_size": 112, 15 | "sample_duration": 16, 16 | "load_fvs": false, 17 | "fv_path": "/data/ptokmako/kinetics_fv_nobox_nosketch/", 18 | "dataset_path": "/ssd1/ptokmako/kinetics_jpg/", 19 | "annotation_path": "splits.json" 20 | }, 21 | "model_params": { 22 | "out_dim": 128, 23 | "resnet_version": "resnet18" 24 | }, 25 | "loss_params": { 26 | "k": 4096, 27 | "t": 0.07, 28 | "m": 0.5, 29 | "kmeans_k": 6000, 30 | "n_kmeans": 3, 31 | "kmeans_freq": null, 32 | "loss": "LocalAggregationLossModule" 33 | }, 34 | "optim_params": { 35 | "batch_size": 256, 36 | "learning_rate": 0.03, 37 | "lr_decay_schedule": [160, 190], 38 | "lr_decay_rate": 0.1, 39 | "momentum": 0.9, 40 | "weight_decay": 1e-4, 41 | "validate_freq": 1 42 | }, 43 | "num_epochs": 200, 44 | "validate": true, 45 | "copy_checkpoint_freq": null, 46 | "instance_exp_dir": "/data/ptokmako/local_agg/experiments_data/experiments/kinetics_ir/res18_IR", 47 | "pretrained_exp_dir": null, 48 | "cluster_checkpoint_dir": "models/idt_clusters" 49 | } 50 | -------------------------------------------------------------------------------- /3D-ResNet/utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | 4 | class AverageMeter(object): 5 | """Computes and stores the average and current value""" 6 | 7 | def __init__(self): 8 | self.reset() 9 | 10 | def reset(self): 11 | self.val = 0 12 | self.avg = 0 13 | self.sum = 0 14 | self.count = 0 15 | 16 | def update(self, val, n=1): 17 | self.val = val 18 | self.sum += val * n 19 | self.count += n 20 | self.avg = self.sum / self.count 21 | 22 | 23 | class Logger(object): 24 | 25 | def __init__(self, path, header): 26 | self.log_file = open(path, 'w') 27 | self.logger = csv.writer(self.log_file, delimiter='\t') 28 | 29 | self.logger.writerow(header) 30 | self.header = header 31 | 32 | def __del(self): 33 | self.log_file.close() 34 | 35 | def log(self, values): 36 | write_values = [] 37 | for col in self.header: 38 | assert col in values 39 | write_values.append(values[col]) 40 | 41 | self.logger.writerow(write_values) 42 | self.log_file.flush() 43 | 44 | 45 | def load_value_file(file_path): 46 | with open(file_path, 'r') as input_file: 47 | value = float(input_file.read().rstrip('\n\r')) 48 | 49 | return value 50 | 51 | 52 | def calculate_accuracy(outputs, targets, k=1): 53 | batch_size = targets.size(0) 54 | 55 | _, pred = outputs.topk(k, 1, True) 56 | pred = pred.t() 57 | correct = pred.eq(targets.view(1, -1)) 58 | n_correct_elems = correct.float().sum().item() 59 | 60 | return n_correct_elems / batch_size 61 | -------------------------------------------------------------------------------- /3D-ResNet/utils/video_jpg_ucf101_hmdb51.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def class_process(dir_path, dst_dir_path, class_name): 7 | class_path = os.path.join(dir_path, class_name) 8 | if not os.path.isdir(class_path): 9 | return 10 | 11 | dst_class_path = os.path.join(dst_dir_path, class_name) 12 | if not os.path.exists(dst_class_path): 13 | os.mkdir(dst_class_path) 14 | 15 | for file_name in os.listdir(class_path): 16 | if '.avi' not in file_name: 17 | continue 18 | name, ext = os.path.splitext(file_name) 19 | dst_directory_path = os.path.join(dst_class_path, name) 20 | 21 | video_file_path = os.path.join(class_path, file_name) 22 | try: 23 | if os.path.exists(dst_directory_path): 24 | if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')): 25 | subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True) 26 | print('remove {}'.format(dst_directory_path)) 27 | os.mkdir(dst_directory_path) 28 | else: 29 | continue 30 | else: 31 | os.mkdir(dst_directory_path) 32 | except: 33 | print(dst_directory_path) 34 | continue 35 | cmd = 'ffmpeg -i \"{}\" -vf scale=-1:240 \"{}/image_%05d.jpg\"'.format(video_file_path, dst_directory_path) 36 | print(cmd) 37 | subprocess.call(cmd, shell=True) 38 | print('\n') 39 | 40 | if __name__=="__main__": 41 | dir_path = sys.argv[1] 42 | dst_dir_path = sys.argv[2] 43 | 44 | for class_name in os.listdir(dir_path): 45 | class_process(dir_path, dst_dir_path, class_name) 46 | -------------------------------------------------------------------------------- /LocalAggregation/src/idt/sketching.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def sketch_batch(inp, sd1, nSketchDimC, nFeatDimC, nIsFloatC): 5 | inp = inp.t() 6 | sd1_t = torch.from_numpy(sd1) 7 | sd1 = sd1_t 8 | # print('sd1: ', sd1.shape) 9 | # print(inp.shape) 10 | if nIsFloatC == 1: 11 | inp = inp.type(torch.FloatTensor) 12 | else: 13 | inp = inp.type(torch.DoubleTensor) 14 | 15 | inp = inp.t() 16 | # print(inp.shape) 17 | out1 = inp.mm(sd1) 18 | 19 | res = out1 20 | 21 | if nIsFloatC != 1: 22 | res = res.type(torch.FloatTensor) 23 | 24 | return res 25 | 26 | def choose_h_sk_mat(nSketchDimC, nFeatDimC): 27 | nRep=int( np.ceil(nFeatDimC / nSketchDimC) ) 28 | 29 | rand_array=np.array([]).astype(int) 30 | for i in range(nRep): 31 | rand_array_i = np.random.permutation(int(nSketchDimC)) 32 | rand_array = np.concatenate( (rand_array, rand_array_i), axis=0 ) 33 | 34 | return rand_array[0:nFeatDimC] 35 | 36 | def choose_s_sk_mat(nSketchDimC, nFeatDimC): 37 | nRep = int( np.ceil(nFeatDimC / nSketchDimC) ) 38 | 39 | rand_array = np.array([]).astype(int) 40 | for i in range(nRep): 41 | rand_array_i = np.array([-1, 1]).astype(int) 42 | rand_array = np.concatenate( (rand_array, rand_array_i), axis=0 ) 43 | 44 | rand_array = np.random.permutation(rand_array) 45 | 46 | return rand_array[0: nFeatDimC] 47 | 48 | def create_s_dense(hi, si): 49 | c = np.max(hi) + 1 50 | d = len(hi) 51 | out = np.zeros((d, c)) # in*out 52 | for i in range(d): 53 | out[i, hi[i]] = si[i] 54 | return out 55 | 56 | -------------------------------------------------------------------------------- /LocalAggregation/src/datasets/temporal_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | class TemporalRandomCrop(object): 5 | """Temporally crop the 
given frame indices at a random location. 6 | 7 | If the number of frames is less than the size, 8 | loop the indices as many times as necessary to satisfy the size. 9 | 10 | Args: 11 | size (int): Desired output size of the crop. 12 | """ 13 | 14 | def __init__(self, size): 15 | self.size = size 16 | 17 | def __call__(self, frame_indices): 18 | """ 19 | Args: 20 | frame_indices (list): frame indices to be cropped. 21 | Returns: 22 | list: Cropped frame indices. 23 | """ 24 | 25 | rand_end = max(0, len(frame_indices) - self.size - 1) 26 | begin_index = random.randint(0, rand_end) 27 | end_index = min(begin_index + self.size, len(frame_indices)) 28 | 29 | out = frame_indices[begin_index:end_index] 30 | 31 | for index in out: 32 | if len(out) >= self.size: 33 | break 34 | out.append(index) 35 | 36 | return out 37 | 38 | 39 | class TemporalCenterCrop(object): 40 | """Temporally crop the given frame indices at a center. 41 | 42 | If the number of frames is less than the size, 43 | loop the indices as many times as necessary to satisfy the size. 44 | 45 | Args: 46 | size (int): Desired output size of the crop. 47 | """ 48 | 49 | def __init__(self, size): 50 | self.size = size 51 | 52 | def __call__(self, frame_indices): 53 | """ 54 | Args: 55 | frame_indices (list): frame indices to be cropped. 56 | Returns: 57 | list: Cropped frame indices. 58 | """ 59 | 60 | center_index = len(frame_indices) // 2 61 | begin_index = max(0, center_index - (self.size // 2)) 62 | end_index = min(begin_index + self.size, len(frame_indices)) 63 | 64 | out = frame_indices[begin_index:end_index] 65 | 66 | for index in out: 67 | if len(out) >= self.size: 68 | break 69 | out.append(index) 70 | 71 | return out 72 | -------------------------------------------------------------------------------- /3D-ResNet/utils/kinetics_json.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | def convert_csv_to_dict(csv_path, subset): 8 | data = pd.read_csv(csv_path) 9 | keys = [] 10 | key_labels = [] 11 | for i in range(data.shape[0]): 12 | row = data.ix[i, :] 13 | basename = '%s_%s_%s' % (row['youtube_id'], 14 | '%06d' % row['time_start'], 15 | '%06d' % row['time_end']) 16 | keys.append(basename) 17 | if subset != 'testing': 18 | key_labels.append(row['label']) 19 | 20 | database = {} 21 | for i in range(len(keys)): 22 | key = keys[i] 23 | database[key] = {} 24 | database[key]['subset'] = subset 25 | if subset != 'testing': 26 | label = key_labels[i] 27 | database[key]['annotations'] = {'label': label} 28 | else: 29 | database[key]['annotations'] = {} 30 | 31 | return database 32 | 33 | def load_labels(train_csv_path): 34 | data = pd.read_csv(train_csv_path) 35 | return data['label'].unique().tolist() 36 | 37 | def convert_kinetics_csv_to_activitynet_json(train_csv_path, val_csv_path, test_csv_path, dst_json_path): 38 | labels = load_labels(train_csv_path) 39 | train_database = convert_csv_to_dict(train_csv_path, 'training') 40 | print(len(train_database)) 41 | val_database = convert_csv_to_dict(val_csv_path, 'validation') 42 | test_database = convert_csv_to_dict(test_csv_path, 'testing') 43 | 44 | dst_data = {} 45 | dst_data['labels'] = labels 46 | dst_data['database'] = {} 47 | dst_data['database'].update(train_database) 48 | dst_data['database'].update(val_database) 49 | dst_data['database'].update(test_database) 50 | 51 | with open(dst_json_path, 'w') as dst_file: 52 | 
json.dump(dst_data, dst_file) 53 | 54 | if __name__=="__main__": 55 | train_csv_path = sys.argv[1] 56 | val_csv_path = sys.argv[2] 57 | test_csv_path = sys.argv[3] 58 | dst_json_path = sys.argv[4] 59 | 60 | convert_kinetics_csv_to_activitynet_json( 61 | train_csv_path, val_csv_path, test_csv_path, dst_json_path) 62 | -------------------------------------------------------------------------------- /3D-ResNet/utils/hmdb51_json.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | def convert_csv_to_dict(csv_dir_path, split_index): 8 | database = {} 9 | for filename in os.listdir(csv_dir_path): 10 | if 'split{}'.format(split_index) not in filename: 11 | continue 12 | 13 | data = pd.read_csv(os.path.join(csv_dir_path, filename), 14 | delimiter=' ', header=None) 15 | keys = [] 16 | subsets = [] 17 | for i in range(data.shape[0]): 18 | row = data.ix[i, :] 19 | if row[1] == 0: 20 | continue 21 | elif row[1] == 1: 22 | subset = 'training' 23 | elif row[1] == 2: 24 | subset = 'validation' 25 | 26 | keys.append(row[0].split('.')[0]) 27 | subsets.append(subset) 28 | 29 | for i in range(len(keys)): 30 | key = keys[i] 31 | database[key] = {} 32 | database[key]['subset'] = subsets[i] 33 | label = '_'.join(filename.split('_')[:-2]) 34 | database[key]['annotations'] = {'label': label} 35 | 36 | return database 37 | 38 | def get_labels(csv_dir_path): 39 | labels = [] 40 | for name in os.listdir(csv_dir_path): 41 | labels.append('_'.join(name.split('_')[:-2])) 42 | return sorted(list(set(labels))) 43 | 44 | def convert_hmdb51_csv_to_activitynet_json(csv_dir_path, split_index, dst_json_path): 45 | labels = get_labels(csv_dir_path) 46 | database = convert_csv_to_dict(csv_dir_path, split_index) 47 | 48 | dst_data = {} 49 | dst_data['labels'] = labels 50 | dst_data['database'] = {} 51 | dst_data['database'].update(database) 52 | 53 | with open(dst_json_path, 'w') as dst_file: 54 | json.dump(dst_data, dst_file) 55 | 56 | if __name__ == '__main__': 57 | csv_dir_path = sys.argv[1] 58 | 59 | for split_index in range(1, 4): 60 | dst_json_path = os.path.join(csv_dir_path, 'hmdb51_{}.json'.format(split_index)) 61 | convert_hmdb51_csv_to_activitynet_json(csv_dir_path, split_index, dst_json_path) -------------------------------------------------------------------------------- /3D-ResNet/utils/ucf101_json.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | def convert_csv_to_dict(csv_path, subset): 8 | data = pd.read_csv(csv_path, delimiter=' ', header=None) 9 | keys = [] 10 | key_labels = [] 11 | for i in range(data.shape[0]): 12 | row = data.ix[i, :] 13 | slash_rows = data.ix[i, 0].split('/') 14 | class_name = slash_rows[0] 15 | basename = slash_rows[1].split('.')[0] 16 | 17 | keys.append(basename) 18 | key_labels.append(class_name) 19 | 20 | database = {} 21 | for i in range(len(keys)): 22 | key = keys[i] 23 | database[key] = {} 24 | database[key]['subset'] = subset 25 | label = key_labels[i] 26 | database[key]['annotations'] = {'label': label} 27 | 28 | return database 29 | 30 | def load_labels(label_csv_path): 31 | data = pd.read_csv(label_csv_path, delimiter=' ', header=None) 32 | labels = [] 33 | for i in range(data.shape[0]): 34 | labels.append(data.ix[i, 1]) 35 | return labels 36 | 37 | def 
convert_ucf101_csv_to_activitynet_json(label_csv_path, train_csv_path, 38 | val_csv_path, dst_json_path): 39 | labels = load_labels(label_csv_path) 40 | train_database = convert_csv_to_dict(train_csv_path, 'training') 41 | val_database = convert_csv_to_dict(val_csv_path, 'validation') 42 | 43 | dst_data = {} 44 | dst_data['labels'] = labels 45 | dst_data['database'] = {} 46 | dst_data['database'].update(train_database) 47 | dst_data['database'].update(val_database) 48 | 49 | with open(dst_json_path, 'w') as dst_file: 50 | json.dump(dst_data, dst_file) 51 | 52 | if __name__ == '__main__': 53 | csv_dir_path = sys.argv[1] 54 | 55 | for split_index in range(1, 4): 56 | label_csv_path = os.path.join(csv_dir_path, 'classInd.txt') 57 | train_csv_path = os.path.join(csv_dir_path, 'trainlist0{}.txt'.format(split_index)) 58 | val_csv_path = os.path.join(csv_dir_path, 'testlist0{}.txt'.format(split_index)) 59 | dst_json_path = os.path.join(csv_dir_path, 'ucf101_0{}.json'.format(split_index)) 60 | 61 | convert_ucf101_csv_to_activitynet_json(label_csv_path, train_csv_path, 62 | val_csv_path, dst_json_path) 63 | -------------------------------------------------------------------------------- /LocalAggregation/scripts/localagg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from copy import deepcopy 4 | from src.agents.agents import * 5 | from src.utils.setup import process_config, _process_config 6 | from src.utils.utils import load_json 7 | 8 | 9 | def run(config_path, ir_checkpoint_dir=None, pre_checkpoint_dir=None, cluster_checkpoint_dir=None): 10 | config = process_config(config_path) 11 | AgentClass = globals()[config.agent] 12 | agent = AgentClass(config) 13 | 14 | if ir_checkpoint_dir is not None: 15 | agent.load_checkpoint('checkpoint.pth.tar', ir_checkpoint_dir, load_memory_bank=True, 16 | load_model=True, load_optim=False, load_epoch=False, 17 | cluster_label_dir=cluster_checkpoint_dir) 18 | 19 | if pre_checkpoint_dir is not None: 20 | agent.load_checkpoint('checkpoint.pth.tar', pre_checkpoint_dir, load_memory_bank=True, 21 | load_model=True, load_optim=True, load_epoch=True, 22 | cluster_label_dir=cluster_checkpoint_dir) 23 | 24 | try: 25 | agent.run() 26 | agent.finalise() 27 | except KeyboardInterrupt: 28 | pass 29 | 30 | 31 | if __name__ == "__main__": 32 | import argparse 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('config', type=str, default='path to config file') 35 | args = parser.parse_args() 36 | 37 | config_json = load_json(args.config) 38 | 39 | ir_checkpoint_dir = None 40 | pre_checkpoint_dir = None 41 | cluster_checkpoint_dir = None 42 | 43 | if config_json['pretrained_exp_dir'] is not None: 44 | print("NOTE: found pretrained model...Continue training") 45 | pre_checkpoint_dir = os.path.join(config_json['pretrained_exp_dir'], 'checkpoints') 46 | 47 | if config_json['cluster_checkpoint_dir'] is not None: 48 | print("NOTE: loading cluster assignment") 49 | cluster_checkpoint_dir = os.path.join(config_json['cluster_checkpoint_dir'], 'checkpoints') 50 | 51 | # If pre_checkpoint_dir already exits, ignore ir_checkpoint_dir 52 | if pre_checkpoint_dir is None and config_json['instance_exp_dir'] is not None: 53 | print("NOTE: found IR model...") 54 | ir_checkpoint_dir = os.path.join(config_json['instance_exp_dir'], 'checkpoints') 55 | 56 | run(args.config, ir_checkpoint_dir=ir_checkpoint_dir, pre_checkpoint_dir=pre_checkpoint_dir, 57 | cluster_checkpoint_dir=cluster_checkpoint_dir) 58 | 
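Usage sketch for the two LocalAggregation entry points above (scripts/instance.py and scripts/localagg.py). This is a hypothetical invocation, not a command taken from the repository: it assumes init_env.sh has been sourced, PyTorch and faiss are installed, and the dataset/experiment paths inside the JSON configs have been edited for the local machine.

    source init_env.sh
    python scripts/instance.py config/kinetics_ir.json   # stage 1: instance-recognition (IR) pretraining
    python scripts/localagg.py config/kinetics_la.json    # stage 2: local aggregation, starting from the IR checkpoint

The order mirrors the checkpoint-loading logic in localagg.py: if "pretrained_exp_dir" is set, LA training is resumed from that experiment; otherwise the script falls back to the IR checkpoint referenced by "instance_exp_dir" (and, if given, the cluster assignments in "cluster_checkpoint_dir").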
-------------------------------------------------------------------------------- /3D-ResNet/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import os 5 | import sys 6 | import numpy as np 7 | 8 | from utils import AverageMeter, calculate_accuracy 9 | 10 | 11 | def train_epoch(epoch, data_loader, model, criterion, optimizer, opt, 12 | epoch_logger, batch_logger, writer): 13 | print('train at epoch {}'.format(epoch)) 14 | 15 | model.train() 16 | 17 | batch_time = AverageMeter() 18 | data_time = AverageMeter() 19 | losses = AverageMeter() 20 | accuracies = AverageMeter() 21 | 22 | dataset = "ucf" 23 | if opt.dataset == "hmdb51": 24 | dataset = "hmdb" 25 | 26 | end_time = time.time() 27 | for i, (inputs, targets) in enumerate(data_loader): 28 | acc = 0 29 | data_time.update(time.time() - end_time) 30 | 31 | if not opt.no_cuda: 32 | targets = targets.cuda(async=True) 33 | if opt.dataset == "hmdb51": 34 | targets -= 1 35 | 36 | outputs = model(inputs.cuda()) 37 | loss = criterion(outputs, targets) 38 | 39 | losses.update(loss.item(), inputs.size(0)) 40 | 41 | acc = calculate_accuracy(outputs, targets) 42 | accuracies.update(acc, inputs.size(0)) 43 | 44 | optimizer.zero_grad() 45 | loss.backward() 46 | optimizer.step() 47 | 48 | batch_time.update(time.time() - end_time) 49 | end_time = time.time() 50 | 51 | writer.add_scalar('%s/train_loss' % dataset, losses.val, (epoch - 1) * len(data_loader) + (i + 1)) 52 | 53 | print('Epoch: [{0}][{1}/{2}]\t' 54 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 55 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 56 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 57 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 58 | epoch, 59 | i + 1, 60 | len(data_loader), 61 | batch_time=batch_time, 62 | data_time=data_time, 63 | loss=losses, 64 | acc=accuracies)) 65 | 66 | if epoch % opt.checkpoint == 0: 67 | save_file_path = os.path.join(opt.result_path, 68 | 'save_{}.pth'.format(epoch)) 69 | states = { 70 | 'epoch': epoch + 1, 71 | 'arch': opt.arch, 72 | 'state_dict': model.state_dict(), 73 | 'optimizer': optimizer.state_dict(), 74 | } 75 | torch.save(states, save_file_path) 76 | -------------------------------------------------------------------------------- /3D-ResNet/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import torch.nn.functional as F 4 | import time 5 | import os 6 | import sys 7 | import json 8 | 9 | from utils import AverageMeter 10 | 11 | 12 | def calculate_video_results(output_buffer, video_id, test_results, class_names): 13 | video_outputs = torch.stack(output_buffer) 14 | average_scores = torch.mean(video_outputs, dim=0) 15 | sorted_scores, locs = torch.topk(average_scores, k=10) 16 | 17 | video_results = [] 18 | for i in range(sorted_scores.size(0)): 19 | video_results.append({ 20 | 'label': class_names[locs[i].item()], 21 | 'score': sorted_scores[i].item() 22 | }) 23 | 24 | test_results['results'][video_id] = video_results 25 | 26 | 27 | def test(data_loader, model, opt, class_names): 28 | print('test') 29 | 30 | model.eval() 31 | 32 | batch_time = AverageMeter() 33 | data_time = AverageMeter() 34 | 35 | end_time = time.time() 36 | output_buffer = [] 37 | previous_video_id = '' 38 | test_results = {'results': {}} 39 | with torch.no_grad(): 40 | for i, (inputs, targets) in enumerate(data_loader): 41 | 
data_time.update(time.time() - end_time) 42 | 43 | inputs = Variable(inputs) 44 | outputs = model(inputs) 45 | if not opt.no_softmax_in_test: 46 | outputs = F.softmax(outputs) 47 | 48 | for j in range(outputs.size(0)): 49 | if not (i == 0 and j == 0) and targets[j] != previous_video_id: 50 | calculate_video_results(output_buffer, previous_video_id, 51 | test_results, class_names) 52 | output_buffer = [] 53 | output_buffer.append(outputs[j].data.cpu()) 54 | previous_video_id = targets[j] 55 | 56 | if (i % 100) == 0: 57 | with open( 58 | os.path.join(opt.result_path, '{}.json'.format( 59 | opt.test_subset)), 'w') as f: 60 | json.dump(test_results, f) 61 | 62 | batch_time.update(time.time() - end_time) 63 | end_time = time.time() 64 | 65 | print('[{}/{}]\t' 66 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 67 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format( 68 | i + 1, 69 | len(data_loader), 70 | batch_time=batch_time, 71 | data_time=data_time)) 72 | 73 | with open( 74 | os.path.join(opt.result_path, '{}.json'.format(opt.test_subset)), 75 | 'w') as f: 76 | json.dump(test_results, f) 77 | -------------------------------------------------------------------------------- /LocalAggregation/src/idt/compute_fv_models.py: -------------------------------------------------------------------------------- 1 | import shutil, random, os 2 | import pandas as pd 3 | import numpy as np 4 | import gzip 5 | from sklearn import decomposition 6 | import pickle 7 | from sklearn.mixture import GaussianMixture 8 | from sketching import * 9 | from extract_idt import fisher_vector 10 | import argparse 11 | from os.path import join 12 | 13 | 14 | def get_parser(): 15 | parser = argparse.ArgumentParser(description="FV model estimation") 16 | parser.add_argument("--idt_path", help="Path to precomputed IDTs.") 17 | return parser 18 | 19 | 20 | if __name__ == "__main__": 21 | # define an empty numpy array for concatenating features 22 | all_features = np.array([]) 23 | 24 | args = get_parser().parse_args() 25 | dirpath = args.idt_path 26 | 27 | counter = 0 28 | num_feats = 500 29 | 30 | filenames = os.listdir(dirpath) 31 | for fname in filenames: 32 | if fname.endswith(".gz"): 33 | srcpath = os.path.join(dirpath, fname) 34 | print("concatenating features from: ", srcpath) 35 | counter = counter + 1 36 | df = pd.read_table(gzip.open(srcpath), sep='\s+', header=None) 37 | 38 | # turn pandas dataframe into array 39 | df_array = np.round(df.values, decimals=3) 40 | 41 | array_sum = np.sum(df_array) 42 | array_has_nan = np.isnan(array_sum) 43 | if (array_has_nan): 44 | continue 45 | 46 | if df_array.shape[0] < num_feats: 47 | print('less than %d' % num_feats) 48 | else: 49 | idx = np.random.randint(df_array.shape[0], size=num_feats) 50 | df_array = df_array[idx, :] 51 | 52 | # concatenate all the features 53 | print('stack feature vectors...', counter) 54 | all_features = np.vstack([all_features, df_array]) if all_features.size else df_array 55 | print('Done!-----------------------------') 56 | 57 | features = all_features[:, 10:436] 58 | 59 | trajectories = pd.DataFrame(features) 60 | print('The feature dimension after random sampling is: ', trajectories.shape) 61 | print(trajectories.describe()) 62 | 63 | pca = decomposition.PCA(0.90) 64 | pca_features = pca.fit_transform(trajectories) 65 | 66 | print(pca_features.shape) 67 | 68 | filename = join(dirpath, 'pca_model.sav') 69 | pickle.dump(pca, open(filename, 'wb')) 70 | 71 | K = 256 72 | gmm = GaussianMixture(n_components=K, covariance_type='diag', 
n_init=2, max_iter=200) 73 | 74 | print("Start the GMM estimation...") 75 | gmm.fit(pca_features) 76 | print("A GMM estimation has been finished!") 77 | 78 | filename = join(dirpath, 'gmm_diag_model.sav') 79 | pickle.dump(gmm, open(filename, 'wb')) 80 | 81 | SKETCH_DIM_feat = 2000 82 | fv = fisher_vector(pca_features[0, :], gmm) 83 | print(fv.shape) 84 | 85 | d_fv = fv.shape[0] 86 | h_fv = choose_h_sk_mat(SKETCH_DIM_feat, d_fv) 87 | s_fv = choose_s_sk_mat(2, d_fv) 88 | sdense_fv = create_s_dense(h_fv, s_fv) 89 | 90 | filename = join(dirpath, 'sketching_proj.sav') 91 | pickle.dump(sdense_fv, open(filename, 'wb')) 92 | -------------------------------------------------------------------------------- /3D-ResNet/temporal_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | 5 | class LoopPadding(object): 6 | 7 | def __init__(self, size): 8 | self.size = size 9 | 10 | def __call__(self, frame_indices): 11 | out = frame_indices 12 | 13 | for index in out: 14 | if len(out) >= self.size: 15 | break 16 | out.append(index) 17 | 18 | return out 19 | 20 | 21 | class TemporalBeginCrop(object): 22 | """Temporally crop the given frame indices at a beginning. 23 | 24 | If the number of frames is less than the size, 25 | loop the indices as many times as necessary to satisfy the size. 26 | 27 | Args: 28 | size (int): Desired output size of the crop. 29 | """ 30 | 31 | def __init__(self, size): 32 | self.size = size 33 | 34 | def __call__(self, frame_indices): 35 | out = frame_indices[:self.size] 36 | 37 | for index in out: 38 | if len(out) >= self.size: 39 | break 40 | out.append(index) 41 | 42 | return out 43 | 44 | 45 | class TemporalCenterCrop(object): 46 | """Temporally crop the given frame indices at a center. 47 | 48 | If the number of frames is less than the size, 49 | loop the indices as many times as necessary to satisfy the size. 50 | 51 | Args: 52 | size (int): Desired output size of the crop. 53 | """ 54 | 55 | def __init__(self, size): 56 | self.size = size 57 | 58 | def __call__(self, frame_indices): 59 | """ 60 | Args: 61 | frame_indices (list): frame indices to be cropped. 62 | Returns: 63 | list: Cropped frame indices. 64 | """ 65 | 66 | center_index = len(frame_indices) // 2 67 | begin_index = max(0, center_index - (self.size // 2)) 68 | end_index = min(begin_index + self.size, len(frame_indices)) 69 | 70 | out = frame_indices[begin_index:end_index] 71 | 72 | for index in out: 73 | if len(out) >= self.size: 74 | break 75 | out.append(index) 76 | 77 | return out 78 | 79 | 80 | class TemporalStride(object): 81 | 82 | def __init__(self, stride): 83 | self.stride = stride 84 | 85 | def __call__(self, frame_indices): 86 | out = frame_indices[0::self.stride] 87 | 88 | return out 89 | 90 | 91 | class TemporalRandomCrop(object): 92 | """Temporally crop the given frame indices at a random location. 93 | 94 | If the number of frames is less than the size, 95 | loop the indices as many times as necessary to satisfy the size. 96 | 97 | Args: 98 | size (int): Desired output size of the crop. 99 | """ 100 | 101 | def __init__(self, size): 102 | self.size = size 103 | 104 | def __call__(self, frame_indices): 105 | """ 106 | Args: 107 | frame_indices (list): frame indices to be cropped. 108 | Returns: 109 | list: Cropped frame indices. 
110 | """ 111 | 112 | rand_end = max(0, len(frame_indices) - self.size - 1) 113 | begin_index = random.randint(0, rand_end) 114 | end_index = min(begin_index + self.size, len(frame_indices)) 115 | 116 | out = frame_indices[begin_index:end_index] 117 | 118 | for index in out: 119 | if len(out) >= self.size: 120 | break 121 | out.append(index) 122 | 123 | return out 124 | -------------------------------------------------------------------------------- /LocalAggregation/src/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import shutil 4 | import torch 5 | import numpy as np 6 | from collections import Counter, OrderedDict 7 | 8 | 9 | class AverageMeter(object): 10 | """Computes and stores the average and current value""" 11 | def __init__(self): 12 | self.reset() 13 | 14 | def reset(self): 15 | self.val = 0 16 | self.avg = 0 17 | self.sum = 0 18 | self.count = 0 19 | 20 | def update(self, val, n=1): 21 | self.val = val 22 | self.sum += val * n 23 | self.count += n 24 | self.avg = self.sum / self.count 25 | 26 | 27 | class ProgressMeter(object): 28 | def __init__(self, num_batches, meters, prefix=""): 29 | self.batch_fmtstr = self._get_batch_fmtstr(num_batches) 30 | self.meters = meters 31 | self.prefix = prefix 32 | 33 | def display(self, batch): 34 | entries = [self.prefix + self.batch_fmtstr.format(batch)] 35 | entries += [str(meter) for meter in self.meters] 36 | print('\t'.join(entries)) 37 | 38 | def _get_batch_fmtstr(self, num_batches): 39 | num_digits = len(str(num_batches // 1)) 40 | fmt = '{:' + str(num_digits) + 'd}' 41 | return '[' + fmt + '/' + fmt.format(num_batches) + ']' 42 | 43 | def copy_checkpoint(folder='./', filename='checkpoint.pth.tar', 44 | copyname='copy.pth.tar'): 45 | shutil.copyfile(os.path.join(folder, filename), 46 | os.path.join(folder, copyname)) 47 | 48 | 49 | def save_checkpoint(state, is_best, folder='./', filename='checkpoint.pth.tar'): 50 | if not os.path.isdir(folder): 51 | os.mkdir(folder) 52 | torch.save(state, os.path.join(folder, filename)) 53 | if is_best: 54 | shutil.copyfile(os.path.join(folder, filename), 55 | os.path.join(folder, 'model_best.pth.tar')) 56 | 57 | 58 | def load_json(f_path): 59 | with open(f_path, 'r') as f: 60 | return json.load(f) 61 | 62 | 63 | def save_json(obj, f_path): 64 | with open(f_path, 'w') as f: 65 | json.dump(obj, f, ensure_ascii=False) 66 | 67 | 68 | class OrderedCounter(Counter, OrderedDict): 69 | """Counter that remembers the order elements are first encountered""" 70 | 71 | def __repr__(self): 72 | return '%s(%r)' % (self.__class__.__name__, OrderedDict(self)) 73 | 74 | def __reduce__(self): 75 | return self.__class__, (OrderedDict(self),) 76 | 77 | 78 | def adjust_learning_rate(epoch, opt_params, optimizer): 79 | if opt_params.lr_decay_schedule is not None: 80 | steps = np.sum(epoch > np.asarray(opt_params.lr_decay_schedule)) 81 | assert isinstance(opt_params.lr_decay_rate, float) 82 | if steps > 0: 83 | new_lr = opt_params.learning_rate * (opt_params.lr_decay_rate ** steps) 84 | for param_group in optimizer.param_groups: 85 | param_group['lr'] = new_lr 86 | 87 | 88 | def exclude_bn_weight_bias_from_weight_decay(model, weight_decay): 89 | decay = [] 90 | no_decay = [] 91 | for name, param in model.named_parameters(): 92 | if not param.requires_grad: 93 | continue 94 | # if len(param.shape) == 1 or name in skip_list: 95 | if 'bn' in name: 96 | no_decay.append(param) 97 | else: 98 | decay.append(param) 99 | return [ 100 | {'params': 
no_decay, 'weight_decay': 0.}, 101 | {'params': decay, 'weight_decay': weight_decay} 102 | ] -------------------------------------------------------------------------------- /3D-ResNet/validation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import sys 5 | import numpy as np 6 | 7 | from utils import AverageMeter, calculate_accuracy 8 | 9 | 10 | def val_epoch(epoch, data_loader, model, criterion, opt, logger, writer): 11 | print('validation at epoch {}'.format(epoch)) 12 | 13 | model.eval() 14 | 15 | batch_time = AverageMeter() 16 | data_time = AverageMeter() 17 | losses = AverageMeter() 18 | accuracies1 = AverageMeter() 19 | accuracies5 = AverageMeter() 20 | 21 | end_time = time.time() 22 | with torch.no_grad(): 23 | for i, (inputs, targets) in enumerate(data_loader): 24 | acc1 = 0 25 | data_time.update(time.time() - end_time) 26 | 27 | if not opt.no_cuda: 28 | targets = targets.cuda(async=True) 29 | if opt.dataset == "hmdb51": 30 | targets -= 1 31 | 32 | outputs = model(inputs.cuda()) 33 | loss = criterion(outputs, targets) 34 | 35 | losses.update(loss.item(), inputs.size(0)) 36 | 37 | acc1 = calculate_accuracy(outputs, targets) 38 | acc5 = calculate_accuracy(outputs, targets, 5) 39 | accuracies5.update(acc5, inputs.size(0)) 40 | accuracies1.update(acc1, inputs.size(0)) 41 | 42 | batch_time.update(time.time() - end_time) 43 | end_time = time.time() 44 | 45 | print('Epoch: [{0}][{1}/{2}]\t' 46 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 47 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 48 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 49 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 50 | epoch, 51 | i + 1, 52 | len(data_loader), 53 | batch_time=batch_time, 54 | data_time=data_time, 55 | loss=losses, 56 | acc=accuracies1)) 57 | 58 | dataset = "ucf" 59 | if opt.dataset == "hmdb51": 60 | dataset = "hmdb" 61 | 62 | writer.add_scalar('%s/val_top5' % dataset, accuracies5.avg, epoch) 63 | writer.add_scalar('%s/val_top1' % dataset, accuracies1.avg, epoch) 64 | writer.add_scalar('%s/val_loss' % dataset, losses.avg, epoch) 65 | 66 | return losses.avg 67 | 68 | 69 | def val_final(data_loader, model, opt): 70 | print('Final validation') 71 | 72 | model.eval() 73 | 74 | batch_time = AverageMeter() 75 | data_time = AverageMeter() 76 | accuracies1 = AverageMeter() 77 | accuracies5 = AverageMeter() 78 | 79 | end_time = time.time() 80 | with torch.no_grad(): 81 | for i, inputs in enumerate(data_loader): 82 | data_time.update(time.time() - end_time) 83 | 84 | inputs1, inputs2, inputs3, inputs4, inputs5, targets = inputs 85 | 86 | if not opt.no_cuda: 87 | targets = targets.cuda(async=True) 88 | if opt.dataset == "hmdb51": 89 | targets -= 1 90 | 91 | outputs1 = model(inputs1) 92 | outputs2 = model(inputs2) 93 | outputs3 = model(inputs3) 94 | outputs4 = model(inputs4) 95 | outputs5 = model(inputs5) 96 | 97 | outputs = (outputs1 + outputs2 + outputs3 + outputs4 + outputs5) / 5.0 98 | 99 | acc1 = calculate_accuracy(outputs, targets) 100 | acc5 = calculate_accuracy(outputs, targets, 5) 101 | 102 | accuracies5.update(acc5, inputs1.size(0)) 103 | accuracies1.update(acc1, inputs1.size(0)) 104 | 105 | batch_time.update(time.time() - end_time) 106 | end_time = time.time() 107 | 108 | print('Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 109 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 110 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 111 | i + 1, 112 | len(data_loader), 113 | 
batch_time=batch_time, 114 | data_time=data_time, 115 | acc=accuracies1)) 116 | -------------------------------------------------------------------------------- /3D-ResNet/dataset.py: -------------------------------------------------------------------------------- 1 | from datasets.kinetics import Kinetics 2 | from datasets.ucf101 import UCF101 3 | from datasets.hmdb51 import HMDB51 4 | 5 | 6 | def get_training_set(opt, spatial_transform, temporal_transform, 7 | target_transform): 8 | assert opt.dataset in ['kinetics', 'ucf101', 'hmdb51'] 9 | 10 | if opt.dataset == 'kinetics': 11 | training_data = Kinetics( 12 | opt.video_path, 13 | opt.annotation_path, 14 | 'training', 15 | spatial_transform=spatial_transform, 16 | temporal_transform=temporal_transform, 17 | target_transform=target_transform) 18 | elif opt.dataset == 'ucf101': 19 | training_data = UCF101( 20 | opt.video_path, 21 | opt.annotation_path, 22 | 'training', 23 | spatial_transform=spatial_transform, 24 | temporal_transform=temporal_transform, 25 | target_transform=target_transform, 26 | shot=opt.shot) 27 | elif opt.dataset == 'hmdb51': 28 | training_data = HMDB51( 29 | opt.video_path, 30 | opt.annotation_path, 31 | 'training', 32 | spatial_transform=spatial_transform, 33 | temporal_transform=temporal_transform, 34 | target_transform=target_transform, 35 | shot=opt.shot) 36 | 37 | return training_data 38 | 39 | 40 | def get_validation_set(opt, spatial_transform, temporal_transform, 41 | target_transform): 42 | assert opt.dataset in ['kinetics', 'ucf101', 'hmdb51'] 43 | 44 | if opt.dataset == 'kinetics': 45 | validation_data = Kinetics( 46 | opt.video_path, 47 | opt.annotation_path, 48 | 'validation', 49 | opt.n_val_samples, 50 | spatial_transform, 51 | temporal_transform, 52 | target_transform, 53 | sample_duration=opt.sample_duration) 54 | elif opt.dataset == 'ucf101': 55 | validation_data = UCF101( 56 | opt.video_path, 57 | opt.annotation_path, 58 | 'validation', 59 | opt.n_val_samples, 60 | spatial_transform, 61 | temporal_transform, 62 | target_transform, 63 | sample_duration=opt.sample_duration) 64 | elif opt.dataset == 'hmdb51': 65 | validation_data = HMDB51( 66 | opt.video_path, 67 | opt.annotation_path, 68 | 'validation', 69 | opt.n_val_samples, 70 | spatial_transform, 71 | temporal_transform, 72 | target_transform, 73 | sample_duration=opt.sample_duration) 74 | 75 | return validation_data 76 | 77 | def get_test_set(opt, spatial_transform, temporal_transform, target_transform): 78 | assert opt.dataset in ['kinetics', 'ucf101', 'hmdb51'] 79 | assert opt.test_subset in ['val', 'test'] 80 | 81 | if opt.test_subset == 'val': 82 | subset = 'validation' 83 | elif opt.test_subset == 'test': 84 | subset = 'testing' 85 | if opt.dataset == 'kinetics': 86 | test_data = Kinetics( 87 | opt.video_path, 88 | opt.annotation_path, 89 | subset, 90 | 0, 91 | spatial_transform, 92 | temporal_transform, 93 | target_transform, 94 | sample_duration=opt.sample_duration, 95 | num_vid_samples=5) 96 | elif opt.dataset == 'ucf101': 97 | test_data = UCF101( 98 | opt.video_path, 99 | opt.annotation_path, 100 | subset, 101 | 1, 102 | spatial_transform, 103 | temporal_transform, 104 | target_transform, 105 | sample_duration=opt.sample_duration, 106 | num_vid_samples=5) 107 | elif opt.dataset == 'hmdb51': 108 | test_data = HMDB51( 109 | opt.video_path, 110 | opt.annotation_path, 111 | subset, 112 | 1, 113 | spatial_transform, 114 | temporal_transform, 115 | target_transform, 116 | sample_duration=opt.sample_duration, 117 | num_vid_samples=5) 118 | 119 | 
return test_data 120 | -------------------------------------------------------------------------------- /3D-ResNet/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from models import resnet 5 | 6 | def generate_model(opt): 7 | assert opt.model in [ 8 | 'resnet' 9 | ] 10 | 11 | if opt.model == 'resnet': 12 | assert opt.model_depth in [10, 18, 34, 50, 101, 152, 200] 13 | input_chan = 3 14 | 15 | from models.resnet import get_fine_tuning_parameters 16 | 17 | if opt.model_depth == 10: 18 | model = resnet.resnet10( 19 | num_classes=opt.n_classes, 20 | shortcut_type=opt.resnet_shortcut, 21 | sample_size=opt.sample_size, 22 | sample_duration=opt.sample_duration, \ 23 | input_chan=input_chan) 24 | elif opt.model_depth == 18: 25 | model = resnet.resnet18( 26 | num_classes=opt.n_classes, 27 | shortcut_type=opt.resnet_shortcut, 28 | sample_size=opt.sample_size, 29 | sample_duration=opt.sample_duration, \ 30 | input_chan=input_chan) 31 | elif opt.model_depth == 34: 32 | model = resnet.resnet34( 33 | num_classes=opt.n_classes, 34 | shortcut_type=opt.resnet_shortcut, 35 | sample_size=opt.sample_size, 36 | sample_duration=opt.sample_duration, \ 37 | input_chan=input_chan) 38 | elif opt.model_depth == 50: 39 | model = resnet.resnet50( 40 | num_classes=opt.n_classes, 41 | shortcut_type=opt.resnet_shortcut, 42 | sample_size=opt.sample_size, 43 | sample_duration=opt.sample_duration, \ 44 | input_chan=input_chan) 45 | elif opt.model_depth == 101: 46 | model = resnet.resnet101( 47 | num_classes=opt.n_classes, 48 | shortcut_type=opt.resnet_shortcut, 49 | sample_size=opt.sample_size, 50 | sample_duration=opt.sample_duration) 51 | elif opt.model_depth == 152: 52 | model = resnet.resnet152( 53 | num_classes=opt.n_classes, 54 | shortcut_type=opt.resnet_shortcut, 55 | sample_size=opt.sample_size, 56 | sample_duration=opt.sample_duration) 57 | elif opt.model_depth == 200: 58 | model = resnet.resnet200( 59 | num_classes=opt.n_classes, 60 | shortcut_type=opt.resnet_shortcut, 61 | sample_size=opt.sample_size, 62 | sample_duration=opt.sample_duration) 63 | 64 | if not opt.no_cuda: 65 | model = model.cuda() 66 | model = nn.DataParallel(model, device_ids=None) 67 | 68 | if opt.pretrain_path: 69 | print('loading pretrained model {}'.format(opt.pretrain_path)) 70 | pretrain = torch.load(opt.pretrain_path) 71 | if 'arch' in pretrain: 72 | assert opt.arch == pretrain['arch'] 73 | model.load_state_dict(pretrain['state_dict']) 74 | else: 75 | if "state_dict" in pretrain.keys(): 76 | model.module.load_state_dict(pretrain['state_dict']) 77 | else: 78 | model.module.fc = nn.Linear( 79 | model.module.fc.in_features, 128) 80 | model.load_state_dict(pretrain['model_state_dict']) 81 | 82 | model.module.fc = nn.Linear(model.module.fc.in_features, opt.n_finetune_classes) 83 | model.module.fc = model.module.fc.cuda() 84 | 85 | model.module.freeze_layers(opt.ft_begin_index) 86 | parameters = get_fine_tuning_parameters(model, opt.ft_begin_index) 87 | return model, parameters 88 | else: 89 | if opt.pretrain_path: 90 | print('loading pretrained model {}'.format(opt.pretrain_path)) 91 | pretrain = torch.load(opt.pretrain_path) 92 | assert opt.arch == pretrain['arch'] 93 | 94 | model.load_state_dict(pretrain['state_dict']) 95 | 96 | model.fc = nn.Linear(model.fc.in_features, opt.n_finetune_classes) 97 | 98 | parameters = get_fine_tuning_parameters(model, opt.ft_begin_index) 99 | return model, parameters 100 | 101 | return model, model.parameters() 102 | 
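# Note: the pretrained-checkpoint handling in generate_model() above appears to cover three
# layouts (inferred from the branches in the code, not from separate documentation):
#   * {'arch': ..., 'state_dict': ...}    -- checkpoints written by this 3D-ResNet training code,
#                                            loaded into the DataParallel-wrapped model as a whole;
#   * {'state_dict': ...} (no 'arch' key) -- loaded into model.module directly;
#   * {'model_state_dict': ...}           -- presumably checkpoints from the LocalAggregation
#                                            pretraining stage: fc is first replaced with a 128-d
#                                            nn.Linear (presumably the embedding head used during
#                                            unsupervised pretraining) before the weights are loaded.
# In every pretrained case the fc layer is then re-created with opt.n_finetune_classes outputs,
# earlier blocks are frozen via freeze_layers(opt.ft_begin_index), and get_fine_tuning_parameters()
# selects the parameter groups that main.py passes to the SGD optimizer.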
-------------------------------------------------------------------------------- /LocalAggregation/src/idt/cluster_fv.py: -------------------------------------------------------------------------------- 1 | from src.datasets.kinetics import load_annotation_data, get_video_names_and_annotations, load_value_file 2 | import os 3 | import torch 4 | import json 5 | import argparse 6 | from os.path import join 7 | import numpy as np 8 | from src.objectives.localagg import run_kmeans_multi_gpu, run_kmeans 9 | 10 | 11 | DEFAULT_KMEANS_SEED = 1234 12 | 13 | 14 | def get_parser(): 15 | parser = argparse.ArgumentParser(description="IDT video inference") 16 | parser.add_argument("--k", type=int, help="Number of clusters.") 17 | parser.add_argument("--num_c", type=int, help="Number of clusterings.") 18 | parser.add_argument("--frames_path", help="Path to Kinetics frames.") 19 | parser.add_argument("--annotation_path", help="Path to Kinetics annotation.") 20 | parser.add_argument("--fv_path", help="Path to Fisher vectors.") 21 | parser.add_argument("--clusters_path", help="Path to save cluster.") 22 | parser.add_argument("--processed_annotation_path", help="Path to output annotation file.") 23 | parser.add_argument('--gpu', nargs='*', help='GPU id') 24 | return parser 25 | 26 | 27 | def compute_clusters(data, k, gpu_devices): 28 | pred_labels = [] 29 | data_npy = data.cpu().detach().numpy() 30 | data_npy = np.float32(data_npy) 31 | for k_idx, each_k in enumerate(k): 32 | # cluster the data 33 | 34 | if len(gpu_devices) == 1: # single gpu 35 | I, _ = run_kmeans(data_npy, each_k, seed=k_idx + DEFAULT_KMEANS_SEED, 36 | gpu_device=gpu_devices[0]) 37 | else: # multigpu 38 | I, _ = run_kmeans_multi_gpu(data_npy, each_k, seed=k_idx + DEFAULT_KMEANS_SEED, gpu_device=gpu_devices) 39 | 40 | clust_labels = np.asarray(I) 41 | pred_labels.append(clust_labels) 42 | pred_labels = np.stack(pred_labels, axis=0) 43 | pred_labels = torch.from_numpy(pred_labels).long() 44 | 45 | return pred_labels 46 | 47 | 48 | if __name__ == "__main__": 49 | args = get_parser().parse_args() 50 | 51 | gpu_devices = [] 52 | if args.gpu: 53 | ids_list = '' 54 | for i in range(len(args.gpu)): 55 | ids_list += args.gpu[i] + ',' 56 | gpu_devices.append(int(args.gpu[i])) 57 | ids_list = ids_list[:-1] 58 | 59 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 60 | os.environ["CUDA_VISIBLE_DEVICES"] = ids_list 61 | 62 | frames_path = args.frames_path 63 | annotation_path = args.annotation_path 64 | fv_path = args.fv_path 65 | k = args.k 66 | n_clusters = args.num_c 67 | 68 | data = load_annotation_data(annotation_path) 69 | 70 | video_names, annotations = get_video_names_and_annotations(data, "training") 71 | 72 | count_valid = 0 73 | count_missing = 0 74 | fvs = [] 75 | database = {} 76 | labels = set([]) 77 | for i in range(len(video_names)): 78 | if i % 1000 == 0: 79 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 80 | 81 | vid_key = video_names[i].split("/")[-1] 82 | vid_label = video_names[i].split("/")[-2].replace("_", " ") 83 | 84 | video_path = os.path.join(frames_path, video_names[i]) 85 | if not os.path.exists(video_path): 86 | continue 87 | 88 | n_frames_file_path = os.path.join(video_path, 'n_frames') 89 | n_frames = int(load_value_file(n_frames_file_path)) 90 | if n_frames <= 0: 91 | continue 92 | 93 | count_valid += 1 94 | 95 | fv_vid_path = os.path.join(fv_path, video_names[i]) + ".dat" 96 | if not os.path.exists(fv_vid_path): 97 | count_missing += 1 98 | continue 99 | else: 100 | fv = torch.load(fv_vid_path) 101 | 102 
| value = {} 103 | value['subset'] = 'training' 104 | value['annotations'] = {} 105 | value['annotations']['label'] = vid_label 106 | database[vid_key] = value 107 | 108 | labels.add(vid_label) 109 | 110 | fvs.append(fv.cpu().squeeze()) 111 | 112 | for key, value in data['database'].items(): 113 | this_subset = value['subset'] 114 | if (this_subset == 'validation' and (value['annotations']['label'] in labels)) or this_subset == 'testing': 115 | database[key] = value 116 | 117 | print("%d missing out of %d\n" % (count_missing, count_valid)) 118 | 119 | fvs = torch.stack(fvs) 120 | 121 | k = [k for _ in range(n_clusters)] 122 | 123 | cluster_labels = compute_clusters(fvs, k, gpu_devices) 124 | 125 | os.mkdir(join(args.clusters_path, "checkpoints")) 126 | torch.save({'cluster_labels': cluster_labels}, join(args.clusters_path, "checkpoints/checkpoint.pth.tar")) 127 | 128 | out = {} 129 | out['labels'] = list(labels) 130 | out['database'] = database 131 | 132 | with open(args.processed_annotation_path, 'w') as dst_file: 133 | json.dump(out, dst_file) 134 | -------------------------------------------------------------------------------- /LocalAggregation/src/objectives/instance.py: -------------------------------------------------------------------------------- 1 | """ 2 | Non-parametric Instance Discrimination Loss 3 | https://github.com/zhirongw/lemniscate.pytorch 4 | 5 | Code is based on Tensorflow implementation: 6 | https://github.com/neuroailab/LocalAggregation 7 | 8 | This script wraps the InstanceDiscriminationLoss function as a torch.nn.Module, 9 | so that the loss can be computed parallelly across multi-gpus using Dataparallel 10 | 11 | """ 12 | import math 13 | import torch 14 | import numpy as np 15 | 16 | from src.utils.tensor import l2_normalize 17 | 18 | 19 | class InstanceDiscriminationLossModule(torch.nn.Module): 20 | def __init__(self, memory_bank_broadcast, cluster_labels_broadcast=None, k=4096, t=0.07, m=0.5): 21 | super(InstanceDiscriminationLossModule, self).__init__() 22 | self.k, self.t, self.m = k, t, m 23 | 24 | self.indices = None 25 | self.outputs = None 26 | self._bank = None # pass in via forward function 27 | self.memory_bank_broadcast = memory_bank_broadcast 28 | self.data_len = memory_bank_broadcast[0].size(0) 29 | self.Z_est = 0 30 | 31 | def _softmax(self, dot_prods): 32 | Z = 2876934.2 33 | return torch.exp(dot_prods / self.t) / Z 34 | 35 | def updated_new_data_memory(self, indices, outputs): 36 | outputs = l2_normalize(outputs) 37 | data_memory = torch.index_select(self._bank, 0, indices) 38 | new_data_memory = data_memory * self.m + (1 - self.m) * outputs 39 | return l2_normalize(new_data_memory, dim=1) 40 | 41 | def synchronization_check(self): 42 | for i in range(len(self.memory_bank_broadcast)): 43 | if i == 0: 44 | device = self.memory_bank_broadcast[0].device 45 | else: 46 | assert torch.equal(self.memory_bank_broadcast[0], self.memory_bank_broadcast[i].to(device)) 47 | 48 | def _get_dot_products(self, vec, idxs): 49 | """ 50 | This function is copied from the get_dot_products in Memory_Bank class 51 | Since we want to register self._bank as a buffer (to be broadcasted to multigpus) instead of self.memory_bank, 52 | we need to avoid calling self.memory_bank get_dot_products 53 | 54 | """ 55 | vec_shape = list(vec.size()) # [bs, dim] 56 | idxs_shape = list(idxs.size()) # [bs, ...] 
57 | 58 | assert len(idxs_shape) in [1, 2] 59 | assert len(vec_shape) == 2 60 | assert vec_shape[0] == idxs_shape[0] 61 | 62 | if len(idxs_shape) == 1: 63 | with torch.no_grad(): 64 | memory_vecs = torch.index_select(self._bank, 0, idxs) 65 | memory_vecs_shape = list(memory_vecs.size()) 66 | assert memory_vecs_shape[0] == idxs_shape[0] 67 | else: # len(idxs_shape) == 2 68 | with torch.no_grad(): 69 | batch_size, k_dim = idxs.size(0), idxs.size(1) 70 | flat_idxs = idxs.view(-1) 71 | memory_vecs = torch.index_select(self._bank, 0, flat_idxs) 72 | memory_vecs = memory_vecs.view(batch_size, k_dim, self._bank.size(1)) 73 | memory_vecs_shape = list(memory_vecs.size()) 74 | 75 | vec_shape[1:1] = [1] * (len(idxs_shape) - 1) 76 | vec = vec.view(vec_shape) # [bs, 1, dim] 77 | 78 | prods = memory_vecs * vec 79 | assert list(prods.size()) == memory_vecs_shape 80 | 81 | return torch.sum(prods, dim=-1) 82 | 83 | def compute_data_prob(self): 84 | logits = self._get_dot_products(self.outputs, self.indices) 85 | return self._softmax(logits) 86 | 87 | def compute_noise_prob(self): 88 | batch_size = self.indices.size(0) 89 | noise_indx = torch.randint(0, self.data_len, (batch_size, self.k), 90 | device=self.outputs.device) # U(0, data_len) 91 | noise_indx = noise_indx.long() 92 | logits = self._get_dot_products(self.outputs, noise_indx) 93 | noise_probs = self._softmax(logits) 94 | return noise_probs 95 | 96 | def forward(self, indices, outputs, gpu_idx): 97 | self.indices = indices.detach() 98 | self.outputs = l2_normalize(outputs, dim=1) 99 | self._bank = self.memory_bank_broadcast[gpu_idx] 100 | 101 | batch_size = self.indices.size(0) 102 | data_prob = self.compute_data_prob() 103 | noise_prob = self.compute_noise_prob() 104 | 105 | assert data_prob.size(0) == batch_size 106 | assert noise_prob.size(0) == batch_size 107 | assert noise_prob.size(1) == self.k 108 | 109 | base_prob = 1.0 / self.data_len 110 | eps = 1e-7 111 | 112 | ## Pmt 113 | data_div = data_prob + (self.k * base_prob + eps) 114 | 115 | ln_data = torch.log(data_prob) - torch.log(data_div) 116 | 117 | ## Pon 118 | noise_div = noise_prob + (self.k * base_prob + eps) 119 | ln_noise = math.log(self.k * base_prob) - torch.log(noise_div) 120 | 121 | curr_loss = -(torch.sum(ln_data) + torch.sum(ln_noise)) 122 | curr_loss = curr_loss / batch_size 123 | 124 | new_data_memory = self.updated_new_data_memory(self.indices, self.outputs) 125 | 126 | return curr_loss.unsqueeze(0), new_data_memory 127 | 128 | -------------------------------------------------------------------------------- /LocalAggregation/src/utils/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import logging 5 | import numpy as np 6 | from pprint import pprint 7 | from dotmap import DotMap 8 | from logging import Formatter 9 | from logging.handlers import RotatingFileHandler 10 | from time import strftime, localtime, time 11 | 12 | from src.utils.utils import load_json, save_json 13 | 14 | 15 | def makedirs(dir_list): 16 | for dir in dir_list: 17 | if not os.path.exists(dir): 18 | os.makedirs(dir) 19 | 20 | 21 | def process_config(config_path, override_dotmap=None): 22 | config_json = load_json(config_path) 23 | return _process_config(config_json, override_dotmap=override_dotmap) 24 | 25 | 26 | def _process_config(config_json, override_dotmap=None): 27 | """ 28 | Processes config file: 29 | 1) Converts it to a DotMap 30 | 2) Creates experiments path and required subdirs 31 | 3) Set up logging 
32 | """ 33 | config = DotMap(config_json) 34 | if override_dotmap is not None: 35 | config.update(override_dotmap) 36 | 37 | print("Loaded configuration: ") 38 | print(config) 39 | 40 | print() 41 | print(" *************************************** ") 42 | print(" Running experiment {}".format(config.exp_name)) 43 | print(" *************************************** ") 44 | print() 45 | 46 | # if config.pretrained_exp_dir is not None: 47 | # # don't make new dir more continuing training 48 | # exp_dir = config.pretrained_exp_dir 49 | # print("[INFO]: Continuing from previously finished training at %s." % exp_dir) 50 | # else: 51 | exp_base = config.exp_base 52 | 53 | if config.debug: 54 | exp_dir = os.path.join(exp_base, "experiments", 55 | config.exp_name, 'debug') 56 | else: 57 | if config.pretrained_exp_dir is not None and isinstance(config.pretrained_exp_dir, str): 58 | # don't make new dir more continuing training 59 | exp_dir = config.pretrained_exp_dir 60 | print('[INFO]: Backup previously trained model and config json') 61 | os.system("cp %s/config.json %s/prev_config.json" % (exp_dir, exp_dir)) 62 | os.system("cp %s/checkpoints/checkpoint.pth.tar %s/checkpoints/prev_checkpoint.pth.tar" % (exp_dir, exp_dir)) 63 | os.system("cp %s/checkpoints/model_best.pth.tar %s/checkpoints/prev_model_best.pth.tar" % (exp_dir, exp_dir)) 64 | elif config.continue_exp_dir is not None and isinstance(config.continue_exp_dir, str): 65 | exp_dir = config.continue_exp_dir 66 | print('[INFO]: Backup previously trained model and config json') 67 | os.system("cp %s/config.json %s/prev_config.json" % (exp_dir, exp_dir)) 68 | os.system( 69 | "cp %s/checkpoints/checkpoint.pth.tar %s/checkpoints/prev_checkpoint.pth.tar" % (exp_dir, exp_dir)) 70 | os.system( 71 | "cp %s/checkpoints/model_best.pth.tar %s/checkpoints/prev_model_best.pth.tar" % (exp_dir, exp_dir)) 72 | else: 73 | if config.exp_id is None: 74 | config.exp_id = strftime('%Y-%m-%d--%H_%M_%S', localtime()) 75 | exp_dir = os.path.join(exp_base, "experiments", 76 | config.exp_name, config.exp_id) 77 | 78 | # create some important directories to be used for the experiment. 
79 | config.summary_dir = "summaries/" 80 | config.checkpoint_dir = os.path.join(exp_dir, "checkpoints/") 81 | config.out_dir = os.path.join(exp_dir, "out/") 82 | config.log_dir = os.path.join(exp_dir, "logs/") 83 | 84 | makedirs([config.summary_dir, config.checkpoint_dir, 85 | config.out_dir, config.log_dir]) 86 | 87 | # save config to experiment dir 88 | config_out = os.path.join(exp_dir, 'config.json') 89 | save_json(config.toDict(), config_out) 90 | 91 | # setup logging in the project 92 | setup_logging(config.log_dir) 93 | 94 | logging.getLogger().info("Experiment directory is located at %s" % exp_dir) 95 | 96 | logging.getLogger().info( 97 | "Configurations and directories successfully set up.") 98 | return config 99 | 100 | 101 | def setup_logging(log_dir): 102 | log_file_format = "[%(levelname)s] - %(asctime)s - %(name)s - : %(message)s in %(pathname)s:%(lineno)d" 103 | log_console_format = "[%(levelname)s]: %(message)s" 104 | 105 | # Main logger 106 | main_logger = logging.getLogger() 107 | main_logger.setLevel(logging.INFO) 108 | 109 | console_handler = logging.StreamHandler() 110 | console_handler.setLevel(logging.INFO) 111 | console_handler.setFormatter(Formatter(log_console_format)) 112 | 113 | exp_file_handler = RotatingFileHandler( 114 | '{}exp_debug.log'.format(log_dir), maxBytes=10**6, backupCount=5) 115 | exp_file_handler.setLevel(logging.DEBUG) 116 | exp_file_handler.setFormatter(Formatter(log_file_format)) 117 | 118 | exp_errors_file_handler = RotatingFileHandler( 119 | '{}exp_error.log'.format(log_dir), maxBytes=10**6, backupCount=5) 120 | exp_errors_file_handler.setLevel(logging.WARNING) 121 | exp_errors_file_handler.setFormatter(Formatter(log_file_format)) 122 | 123 | main_logger.addHandler(console_handler) 124 | main_logger.addHandler(exp_file_handler) 125 | main_logger.addHandler(exp_errors_file_handler) 126 | 127 | 128 | def print_cuda_statistics(): 129 | logger = logging.getLogger("Cuda Statistics") 130 | import sys 131 | from subprocess import call 132 | import torch 133 | logger.info('__Python VERSION: {}'.format(sys.version)) 134 | logger.info('__pyTorch VERSION: {}'.format(torch.__version__)) 135 | logger.info('__CUDA VERSION') 136 | try: 137 | call(["nvcc", "--version"]) 138 | except: 139 | pass 140 | logger.info('__CUDNN VERSION: {}'.format(torch.backends.cudnn.version())) 141 | logger.info('__Number CUDA Devices: {}'.format(torch.cuda.device_count())) 142 | logger.info('__Devices') 143 | call(["nvidia-smi", "--format=csv", 144 | "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"]) 145 | logger.info('Active CUDA Device: GPU {}'.format(torch.cuda.current_device())) 146 | logger.info('Available devices {}'.format(torch.cuda.device_count())) 147 | logger.info('Current cuda device {}'.format(torch.cuda.current_device())) 148 | -------------------------------------------------------------------------------- /LocalAggregation/src/idt/extract_idt.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import gzip 3 | import numpy as np 4 | import pickle 5 | import warnings 6 | import glob 7 | from sketching import * 8 | from os.path import isfile, join 9 | import os 10 | import torch 11 | import argparse 12 | 13 | 14 | def get_parser(): 15 | parser = argparse.ArgumentParser(description="IDT video inference") 16 | parser.add_argument("--category", help="Category to process.") 17 | parser.add_argument("--model_path", help="Path to FV models.") 18 | parser.add_argument("--videos_path", 
help="Path to videos.") 19 | parser.add_argument("--boxes_path", help="Path to boxes.") 20 | parser.add_argument("--out_path", help="Output path.") 21 | return parser 22 | 23 | 24 | def fisher_vector(xx, gmm): 25 | """Computes the Fisher vector on a set of descriptors/features. 26 | 27 | Parameters 28 | ---------- 29 | xx: array_like, shape (N, D) or (D, ) 30 | The set of descriptors 31 | 32 | gmm: instance of sklearn mixture.GMM object 33 | Gauassian mixture model of the descriptors. 34 | 35 | Returns 36 | ------- 37 | fv: array_like, shape (K + 2 * D * K, ) 38 | Fisher vector (derivatives with respect to the mixing weights, means 39 | and variances) of the given descriptors. 40 | 41 | Reference 42 | --------- 43 | J. Krapac, J. Verbeek, F. Jurie. Modeling Spatial Layout with Fisher 44 | Vectors for Image Categorization. In ICCV, 2011. 45 | http://hal.inria.fr/docs/00/61/94/03/PDF/final.r1.pdf 46 | """ 47 | 48 | xx = np.atleast_2d(xx) 49 | N = xx.shape[0] 50 | 51 | # Compute posterior probabilities. 52 | Q = gmm.predict_proba(xx) # NxK 53 | 54 | # Compute the sufficient statistics of descriptors. 55 | Q_sum = np.sum(Q, 0)[:, np.newaxis] / N 56 | Q_xx = np.dot(Q.T, xx) / N 57 | Q_xx_2 = np.dot(Q.T, xx ** 2) / N 58 | 59 | # Compute derivatives with respect to mixing weights, means and variances. 60 | d_pi = Q_sum.squeeze() - gmm.weights_ 61 | d_mu = Q_xx - Q_sum * gmm.means_ 62 | d_sigma = ( 63 | - Q_xx_2 64 | - Q_sum * gmm.means_ ** 2 65 | + Q_sum * gmm.covariances_ + 2 * Q_xx * gmm.means_) 66 | 67 | # Merge derivatives into a vector. 68 | return np.hstack((d_mu.flatten(), d_sigma.flatten())) 69 | 70 | 71 | def power_normalize(xx, alpha=0.5): 72 | """Computes a alpha-power normalization for the matrix xx.""" 73 | return np.sign(xx) * np.abs(xx) ** alpha 74 | 75 | 76 | def L2_normalize(xx): 77 | """L2-normalizes each row of the data xx.""" 78 | Zx = np.sum(xx * xx, 1) 79 | xx_norm = np.divide(xx, np.sqrt(Zx[:, np.newaxis])) 80 | xx_norm[np.isnan(xx_norm)] = 0 81 | return xx_norm 82 | 83 | 84 | def compute_fv(filename, gmm, pca, sdense_fv): 85 | # use '\s+' for the \t and \n 86 | df = pd.read_table(gzip.open(filename), sep='\s+', header=None) 87 | df = df.iloc[:, 10:436] 88 | 89 | # turn pandas dataframe into array 90 | df_array = df.values 91 | if np.any(np.isnan(df_array)): 92 | return None 93 | # use stored PCA 94 | 95 | if df_array.shape[0] > 3000000: 96 | print("Dropping tracks to save memory") 97 | df_array = df_array[:3000000] 98 | 99 | df_array = pca.transform(df_array) 100 | # get the fisher vector for each video sequence 101 | fv = fisher_vector(df_array, gmm) 102 | 103 | fv = power_normalize(fv, alpha=0.5) 104 | fv = np.expand_dims(fv, axis=0) 105 | fv = L2_normalize(fv) 106 | 107 | SKETCH_FLOAT = 0 108 | SKETCH_DIM_feat = 2000 109 | 110 | FEAT_DIM_fv = fv.shape[1] 111 | 112 | fv = sketch_batch(torch.from_numpy(fv), sdense_fv, SKETCH_DIM_feat, FEAT_DIM_fv, SKETCH_FLOAT) 113 | 114 | return fv 115 | 116 | 117 | def process_category(model_path, videos_path, boxes_path, out_path, category): 118 | print(category) 119 | pca = pickle.load(open(join(model_path, "pca_model.sav"), 'rb')) 120 | gmm = pickle.load(open(join(model_path, "gmm_diag_model.sav"), 'rb')) 121 | sdense_fv = pickle.load(open(join(model_path, "sketching_proj.sav"), 'rb')) 122 | 123 | box_path = join(boxes_path, category) 124 | boxes = [f for f in os.listdir(box_path) if isfile(join(box_path, f))] 125 | 126 | vid_path = join(videos_path, category) 127 | vids = [f for f in os.listdir(vid_path) if isfile(join(vid_path, 
f))] 128 | 129 | out_path = join(out_path, category) 130 | 131 | if not os.path.exists(out_path): 132 | os.makedirs(out_path) 133 | 134 | count = 0 135 | temp_path = "/scratch/ptokmako/IDT_features_temp" 136 | for vid in vids: 137 | vid_name = vid.split(".")[0] 138 | 139 | if (vid_name + ".bb") not in boxes: 140 | print("Boxes not found for %s !!!!!!!!!!" % vid_name) 141 | continue 142 | count += 1 143 | if os.path.exists(out_path + "%s.dat" % vid_name): 144 | continue 145 | print("%d/%d %s" % (count, len(vids), vid_name)) 146 | 147 | sz = 0 148 | filename = "%s/%s.gz" % (temp_path, category + "_" + vid_name) 149 | attempts = 0 150 | while sz < 100 and attempts < 5: 151 | stream = os.popen('sh src/idt/idt.sh "%s" "%s" "%s" %s' % (join(vid_path, vid), join(box_path, vid_name + ".bb"), 152 | category + "_" + vid_name, temp_path)) 153 | output = stream.read() 154 | print(output) 155 | sz = os.path.getsize(filename) 156 | attempts += 1 157 | 158 | if sz < 100: 159 | print("Could not process video!!!!!!!!!!!!!!") 160 | continue 161 | 162 | fv = compute_fv(filename, gmm, pca, sdense_fv) 163 | if fv is not None: 164 | torch.save(fv, join(out_path, vid_name + ".dat")) 165 | else: 166 | print("NaNs in IDT\n") 167 | 168 | stream = os.popen('rm -f "%s/%s.gz"' % (temp_path, category + "_" + vid_name)) 169 | output = stream.read() 170 | 171 | 172 | if __name__ == "__main__": 173 | warnings.filterwarnings("ignore") 174 | args = get_parser().parse_args() 175 | model_path = args.model_path 176 | videos_path = args.videos_path 177 | boxes_path = args.boxes_path 178 | out_path = args.out_path 179 | category = args.category 180 | 181 | process_category(model_path, videos_path, boxes_path, out_path, category) 182 | 183 | 184 | 185 | -------------------------------------------------------------------------------- /3D-ResNet/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import numpy as np 5 | import torch 6 | from torch import nn 7 | from torch import optim 8 | from torch.optim import lr_scheduler 9 | from tensorboardX import SummaryWriter 10 | 11 | from opts import parse_opts 12 | from model import generate_model 13 | from mean import get_mean, get_std 14 | from spatial_transforms import ( 15 | Compose, Normalize, Scale, CenterCrop, CornerCrop, MultiScaleCornerCrop, 16 | MultiScaleRandomCrop, RandomHorizontalFlip, ToTensor) 17 | from temporal_transforms import LoopPadding, TemporalRandomCrop, TemporalStride 18 | from target_transforms import ClassLabel, VideoID 19 | from target_transforms import Compose as TargetCompose 20 | from dataset import get_training_set, get_validation_set, get_test_set 21 | from utils import Logger 22 | from train import train_epoch 23 | from validation import val_epoch, val_final 24 | import test 25 | import warnings 26 | 27 | 28 | def get_lr(optimizer): 29 | for i, param_group in enumerate(optimizer.param_groups): 30 | if float(param_group['lr']) != 0: 31 | return float(param_group['lr']) 32 | 33 | return 0 34 | 35 | 36 | if __name__ == '__main__': 37 | opt = parse_opts() 38 | model_name = opt.result_path.split("/")[-1] 39 | print(model_name) 40 | if opt.root_path != '': 41 | if opt.resume_path: 42 | opt.resume_path = os.path.join(opt.root_path, opt.resume_path) 43 | if opt.pretrain_path: 44 | opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path) 45 | opt.scales = [opt.initial_scale] 46 | for i in range(1, opt.n_scales): 47 | opt.scales.append(opt.scales[-1] * opt.scale_step) 48 
| opt.arch = '{}-{}'.format(opt.model, opt.model_depth) 49 | opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset) 50 | opt.std = get_std(opt.norm_value) 51 | 52 | warnings.filterwarnings("ignore", category=UserWarning) 53 | 54 | if opt.gpu: 55 | ids_list = '' 56 | for i in range(len(opt.gpu)): 57 | ids_list += opt.gpu[i] + ',' 58 | ids_list = ids_list[:-1] 59 | 60 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 61 | os.environ["CUDA_VISIBLE_DEVICES"] = ids_list 62 | 63 | print(opt) 64 | if not os.path.exists(opt.result_path): 65 | os.mkdir(opt.result_path) 66 | with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file: 67 | json.dump(vars(opt), opt_file) 68 | 69 | writer = None 70 | writer = SummaryWriter(log_dir='/home/ptokmako/src/LocalAggregation-Pytorch/summaries/3DResNet/' + model_name) 71 | 72 | torch.manual_seed(opt.manual_seed) 73 | 74 | model, parameters = generate_model(opt) 75 | print(model) 76 | criterion = nn.CrossEntropyLoss() 77 | 78 | if not opt.no_cuda: 79 | criterion = criterion.cuda() 80 | 81 | if opt.no_mean_norm and not opt.std_norm: 82 | norm_method = Normalize([0, 0, 0], [1, 1, 1]) 83 | elif not opt.std_norm: 84 | norm_method = Normalize(opt.mean, [1, 1, 1]) 85 | else: 86 | norm_method = Normalize(opt.mean, opt.std) 87 | 88 | if not opt.no_train: 89 | assert opt.train_crop in ['random', 'corner', 'center'] 90 | if opt.train_crop == 'random': 91 | crop_method = MultiScaleRandomCrop(opt.scales, opt.sample_size) 92 | elif opt.train_crop == 'corner': 93 | crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size) 94 | elif opt.train_crop == 'center': 95 | crop_method = MultiScaleCornerCrop( 96 | opt.scales, opt.sample_size, crop_positions=['c']) 97 | spatial_transform = Compose([ 98 | crop_method, 99 | RandomHorizontalFlip(), 100 | ToTensor(opt.norm_value), norm_method 101 | ]) 102 | temporal_transform = Compose([ TemporalRandomCrop(opt.sample_duration)]) 103 | target_transform = ClassLabel() 104 | training_data = get_training_set(opt, spatial_transform, 105 | temporal_transform, target_transform) 106 | train_loader = torch.utils.data.DataLoader( 107 | training_data, 108 | batch_size=opt.batch_size, 109 | shuffle=True, 110 | num_workers=opt.n_threads, 111 | pin_memory=True) 112 | train_logger = Logger( 113 | os.path.join(opt.result_path, 'train.log'), 114 | ['epoch', 'loss', 'acc', 'lr']) 115 | train_batch_logger = Logger( 116 | os.path.join(opt.result_path, 'train_batch.log'), 117 | ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr']) 118 | 119 | if opt.nesterov: 120 | dampening = 0 121 | else: 122 | dampening = opt.dampening 123 | 124 | optimizer = optim.SGD( 125 | parameters, 126 | lr=opt.learning_rate, 127 | momentum=opt.momentum, 128 | dampening=dampening, 129 | weight_decay=opt.weight_decay, 130 | nesterov=opt.nesterov) 131 | 132 | scheduler = lr_scheduler.ReduceLROnPlateau( 133 | optimizer, 'min', patience=opt.lr_patience, threshold=opt.lr_threshold) 134 | if not opt.no_val: 135 | spatial_transform = Compose([ 136 | Scale(opt.sample_size), 137 | CenterCrop(opt.sample_size), 138 | ToTensor(opt.norm_value), norm_method 139 | ]) 140 | temporal_transform = Compose([TemporalRandomCrop(opt.sample_duration)]) 141 | target_transform = ClassLabel() 142 | validation_data = get_validation_set( 143 | opt, spatial_transform, temporal_transform, target_transform) 144 | val_loader = torch.utils.data.DataLoader( 145 | validation_data, 146 | batch_size=opt.batch_size, 147 | shuffle=False, 148 | num_workers=opt.n_threads, 149 | pin_memory=True) 150 | 
val_logger = Logger( 151 | os.path.join(opt.result_path, 'val.log'), ['epoch', 'loss', 'acc']) 152 | 153 | if opt.resume_path: 154 | print('loading checkpoint {}'.format(opt.resume_path)) 155 | checkpoint = torch.load(opt.resume_path) 156 | assert opt.arch == checkpoint['arch'] 157 | 158 | opt.begin_epoch = checkpoint['epoch'] 159 | model.load_state_dict(checkpoint['state_dict']) 160 | if not opt.no_train: 161 | optimizer.load_state_dict(checkpoint['optimizer']) 162 | 163 | print('run') 164 | for i in range(opt.begin_epoch, opt.n_epochs + 1): 165 | # noinspection PyInterpreter 166 | writer.add_scalar('ucf/lr', get_lr(optimizer), i) 167 | if not opt.no_train: 168 | train_epoch(i, train_loader, model, criterion, optimizer, opt, 169 | train_logger, train_batch_logger, writer) 170 | if not opt.no_val: 171 | validation_loss = val_epoch(i, val_loader, model, criterion, opt, 172 | val_logger, writer) 173 | 174 | if not opt.no_train and not opt.no_val: 175 | scheduler.step(validation_loss) 176 | 177 | if opt.test: 178 | spatial_transform = Compose([ 179 | Scale(int(opt.sample_size / opt.scale_in_test)), 180 | CenterCrop(opt.sample_size), 181 | ToTensor(opt.norm_value), norm_method 182 | ]) 183 | temporal_transform = LoopPadding(5 * opt.sample_duration) 184 | target_transform = ClassLabel() 185 | 186 | test_data = get_test_set(opt, spatial_transform, temporal_transform, 187 | target_transform) 188 | test_loader = torch.utils.data.DataLoader( 189 | test_data, 190 | batch_size=opt.batch_size, 191 | shuffle=False, 192 | num_workers=opt.n_threads, 193 | pin_memory=True) 194 | val_final(test_loader, model, opt) 195 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Unsupervised Learning of Video Representations via Dense Trajectory Clustering 2 | 3 | This is an implementation of the [Unsupervised Learning of Video Representations via Dense Trajectory Clustering](https://arxiv.org/abs/2006.15731) algorithm. 4 | 5 | The codebase is built upon [Local Aggregation](https://github.com/neuroailab/LocalAggregation-Pytorch) and [3D ResNet](https://github.com/kenshohara/3D-ResNets-PyTorch). 6 | 7 | ## Prerequisites 8 | 9 | * Linux 10 | * PyTorch 1.2.0 11 | * [Faiss](https://github.com/facebookresearch/faiss) 12 | * tqdm 13 | * dotmap 14 | * tensorboardX 15 | * sklearn 16 | * pandas 17 | 18 | ## Unsupervised representation learning 19 | 20 | ### Dataset preprocessing 21 | Training is done on the [Kinetics-400 dataset](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). Download it and preprocess as follows. 22 | ``` 23 | cd 3D-ResNet 24 | ``` 25 | 26 | * Convert from avi to jpg files using ```utils/video_jpg_kinetics.py``` 27 | 28 | ```bash 29 | python utils/video_jpg_kinetics.py AVI_VIDEO_DIRECTORY JPG_VIDEO_DIRECTORY 30 | ``` 31 | 32 | * Generate n_frames files using ```utils/n_frames_kinetics.py``` 33 | 34 | ```bash 35 | python utils/n_frames_kinetics.py JPG_VIDEO_DIRECTORY 36 | ``` 37 | 38 | * Generate annotation file in json format using ```utils/kinetics_json.py``` 39 | * The CSV files (kinetics_{train, val, test}.csv) are included in the crawler. 40 | 41 | ```bash 42 | python utils/kinetics_json.py TRAIN_CSV_PATH VAL_CSV_PATH TEST_CSV_PATH DST_JSON_PATH 43 | ``` 44 | 45 | If you want to use our precomputed IDT clusters for training, please use the Kinetics annotation file provided in this codebase (splits.json). 
If you find that some videos are missing in your local copy of Kinetics, then you'll have to recompute the clusters using the cluster_fv.py script below, otherwise the correspondence between cluster labels and videos will be broken. 46 | 47 | ### Runtime Setup 48 | ``` 49 | cd LocalAggregation 50 | source init_env.sh 51 | ``` 52 | 53 | ### Pretrained models 54 | We provide several models trained using our Video LA + IDT prior objective, as well as precomputed clusters for the training set of Kinetics-400, under this [link](https://drive.google.com/file/d/1i3Vn_85Fo94BINHgpMaLNvZOKPfS3lvf/view?usp=sharing) (for the variants trained on 370k videos we skipped the last tuning stage due to memory issues). In addition, this archive contains models finetuned on UCF101 and HMDB51, which are reported in the state-of-the-art comparison section of the paper. 55 | 56 | ### Training using precomputed IDT descriptors 57 | Begin with training a 3D ResNet with an IR objective for 40 epochs. This is done as a warmup step. Don't forget to update data and experiment paths in the config file. 58 | ``` 59 | CUDA_VISIBLE_DEVICES=0 python scripts/instance.py ./config/kinetics_ir.json 60 | ``` 61 | Then specify `instance_exp_dir` in `./config/kinetics_la.json` to point to the IR model you've just trained, and run the following command to transfer IDT representations to the 3D ResNet via non-parametric clustering: 62 | ``` 63 | CUDA_VISIBLE_DEVICES=0,1,2 python scripts/localagg.py ./config/kinetics_la.json 64 | ``` 65 | To run the final fine-tuning stage, specify `instance_exp_dir` in `./config/kinetics_la_tune.json` to point to the model trained with IDTs, and run the following command: 66 | ``` 67 | CUDA_VISIBLE_DEVICES=0,1,2 python scripts/localagg.py ./config/kinetics_la_tune.json 68 | ``` 69 | 70 | ### Recomputing and clustering IDT descriptors 71 | We provide precomputed Fisher vector-encoded IDT descriptors for the Kinetics dataset under this [link](https://drive.google.com/file/d/1I5ZWlYJfFxXhPrv6gRq1jZJah85usd1H/view?usp=sharing). 72 | 73 | If you wish to recompute them, you will need to first download and install the original [IDT implementation](https://lear.inrialpes.fr/people/wang/improved_trajectories). 74 | This code takes person detections as input. You can download the detections we used [here](https://drive.google.com/file/d/1CDX8qkhsx9ygL27VG8UQpzAipa3MeHPu/view?usp=sharing). 75 | 76 | Next, estimate the model (PCA, GMM) parameters used in Fisher vector encoding. To this end, first sample 3500 videos from Kinetics at random, and compute IDTs for them, using the script below (don't forget to update paths to the IDT implementation). 77 | ``` 78 | sh src/idt/run_idt.sh PATH_TO_VIDEO PATH_TO_BOXES OUTPUT_NAME PATH_TO_IDTS 79 | ``` 80 | Then run the following script to estimate model parameters based on the computed IDTs. The parameters will be saved to the same directory as the IDTs. 81 | ``` 82 | python src/idt/compute_fv_models.py --idt_path PATH_TO_IDTS 83 | ``` 84 | 85 | Now you can compute the Fisher vector-encoded IDT descriptors for the training set of Kinetics. The following script takes a category as input, so it can be parallelized 400-way on a CPU cluster (please update the path to a temporary folder inside the script, which is used to store raw IDTs). 86 | ``` 87 | python src/idt/extract_idt.py --category CATEGORY_NAME --model_path PATH_TO_IDTS --videos_path PATH_TO_TRAIN_VIDEOS --boxes_path PATH_TO_BOXES --out_path FV_OUTPUT_PATH 88 | ``` 89 | 90 | Finally, to cluster the descriptors, run the following script. 91 | ``` 92 | python src/idt/cluster_fv.py --k 6000 --num_c 3 --frames_path PATH_TO_FRAMES --annotation_path PATH_TO_ANNOTATIONS_JSON --fv_path FV_OUTPUT_PATH --clusters_path PATH_TO_OUTPUT_CLUSTERS_DIRECTORY --processed_annotation_path PATH_TO_OUTPUT_ANNOTATIONS_JSON --gpu 0 1 93 | ``` 94 | This script produces a clustering assignment for the training set videos, and a new annotation file. Make sure to use this file in all the config files to ensure correct correspondence between videos and cluster labels. 95 | 
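For reference, the annotation file written by `cluster_fv.py` (and read by the dataset loaders and the config files above) keeps the same layout as the original Kinetics json: a top-level `labels` list and a `database` dictionary keyed by video id, where each entry stores its `subset` and its `annotations` label. A minimal illustrative sketch is shown below; the video id and class names are placeholders, not real entries:
```
{
  "labels": ["abseiling", "zumba"],
  "database": {
    "VIDEO_ID_000010_000020": {
      "subset": "training",
      "annotations": {"label": "abseiling"}
    }
  }
}
```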
96 | ## Transfer learning 97 | ``` 98 | cd 3D-ResNet 99 | ``` 100 | 101 | ### Dataset preprocessing 102 | Download and pre-process [UCF101](http://crcv.ucf.edu/data/UCF101.php) and [HMDB51](http://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/) datasets as follows. 103 | 104 | ```bash 105 | python utils/video_jpg_ucf101_hmdb51.py AVI_VIDEO_DIRECTORY JPG_VIDEO_DIRECTORY 106 | ``` 107 | 108 | * Generate n_frames files using ```utils/n_frames_ucf101_hmdb51.py``` 109 | 110 | ```bash 111 | python utils/n_frames_ucf101_hmdb51.py JPG_VIDEO_DIRECTORY 112 | ``` 113 | 114 | * Generate annotation file in json format using ```utils/ucf101_json.py``` and ```utils/hmdb51_json.py``` 115 | 116 | ```bash 117 | python utils/ucf101_json.py ANNOTATION_DIR_PATH 118 | python utils/hmdb51_json.py ANNOTATION_DIR_PATH 119 | ``` 120 | 121 | ### Finetuning 122 | On UCF101: 123 | ```bash 124 | python main.py --video_path PATH_TO_FRAMES --annotation_path PATH_TO_ANNOTATION --result_path OUTPUT_MODEL_PATH --dataset ucf101 --n_finetune_classes 101 --model resnet --model_depth 18 --resnet_shortcut B --batch_size 128 --n_threads 16 --gpu 0 --pretrain_path PATH_TO_PRETRAINED_MODEL --checkpoint 10 --ft_begin_index 2 --n_epochs 40 --lr_patience 5 --n_scales 2 --train_crop random 125 | ``` 126 | 127 | On HMDB51: 128 | ```bash 129 | python main.py --video_path PATH_TO_FRAMES --annotation_path PATH_TO_ANNOTATION --result_path OUTPUT_MODEL_PATH --dataset hmdb51 --n_finetune_classes 101 --model resnet --model_depth 18 --resnet_shortcut B --batch_size 128 --n_threads 16 --gpu 0 --pretrain_path PATH_TO_PRETRAINED_MODEL --checkpoint 10 --ft_begin_index 3 --n_epochs 30 --lr_patience 5 --n_scales 2 --train_crop random 130 | ``` 131 | 132 | ### Evaluation 133 | On UCF101: 134 | ```bash 135 | python main.py --video_path PATH_TO_FRAMES --annotation_path PATH_TO_ANNOTATION --dataset ucf101 --n_classes 101 --model resnet --model_depth 18 --resnet_shortcut B --batch_size 128 --n_threads 16 --gpu 0 --test --no_train --no_val --resume_path OUTPUT_MODEL_PATH/save_40.pth 136 | ``` 137 | 138 | On HMDB51: 139 | ```bash 140 | python main.py --video_path PATH_TO_FRAMES --annotation_path PATH_TO_ANNOTATION --dataset hmdb51 --n_classes 101 --model resnet --model_depth 18 --resnet_shortcut B --batch_size 128 --n_threads 16 --gpu 0 --test --no_train --no_val --resume_path OUTPUT_MODEL_PATH/save_30.pth 141 | ``` 142 | -------------------------------------------------------------------------------- /3D-ResNet/datasets/kinetics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import 
copy 9 | import re 10 | import random 11 | 12 | from utils import load_value_file 13 | 14 | 15 | def pil_loader(path): 16 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 17 | with open(path, 'rb') as f: 18 | with Image.open(f) as img: 19 | return img.convert('RGB') 20 | 21 | 22 | def accimage_loader(path): 23 | try: 24 | import accimage 25 | return accimage.Image(path) 26 | except IOError: 27 | # Potentially a decoding problem, fall back to PIL.Image 28 | return pil_loader(path) 29 | 30 | 31 | def get_default_image_loader(): 32 | from torchvision import get_image_backend 33 | if get_image_backend() == 'accimage': 34 | return accimage_loader 35 | else: 36 | return pil_loader 37 | 38 | 39 | def video_loader(video_dir_path, frame_indices, image_loader): 40 | video = [] 41 | for i in frame_indices: 42 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 43 | if os.path.exists(image_path): 44 | video.append(image_loader(image_path)) 45 | else: 46 | return video 47 | 48 | return video 49 | 50 | 51 | def get_default_video_loader(): 52 | image_loader = get_default_image_loader() 53 | return functools.partial(video_loader, image_loader=image_loader) 54 | 55 | 56 | def load_annotation_data(data_file_path): 57 | with open(data_file_path, 'r') as data_file: 58 | return json.load(data_file) 59 | 60 | 61 | def get_class_labels(data): 62 | class_labels_map = {} 63 | index = 0 64 | for class_label in data['labels']: 65 | class_labels_map[class_label] = index 66 | index += 1 67 | return class_labels_map 68 | 69 | 70 | def get_video_names_and_annotations(data, subset): 71 | video_names = [] 72 | annotations = [] 73 | 74 | for key, value in data['database'].items(): 75 | this_subset = value['subset'] 76 | if this_subset == subset: 77 | if subset == 'validation': 78 | key = re.sub("_\d+", "", key) 79 | if subset == 'testing': 80 | video_names.append('test/{}'.format(key)) 81 | else: 82 | label = value['annotations']['label'] 83 | video_names.append('{}/{}/{}'.format(subset, label.replace(" ", "_"), key)) 84 | annotations.append(value['annotations']) 85 | 86 | return video_names, annotations 87 | 88 | 89 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 90 | sample_duration): 91 | data = load_annotation_data(annotation_path) 92 | 93 | video_names, annotations = get_video_names_and_annotations(data, subset) 94 | class_to_idx = get_class_labels(data) 95 | idx_to_class = {} 96 | for name, label in class_to_idx.items(): 97 | idx_to_class[label] = name 98 | 99 | dataset = [] 100 | for i in range(len(video_names)): 101 | if i % 1000 == 0: 102 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 103 | 104 | video_path = os.path.join(root_path, video_names[i]) 105 | n_frames_file_path = os.path.join(video_path, 'n_frames') 106 | if not os.path.exists(video_path) or not os.path.exists(n_frames_file_path): 107 | # print("Path not found") 108 | continue 109 | 110 | n_frames = int(load_value_file(n_frames_file_path)) 111 | if n_frames <= 0: 112 | continue 113 | 114 | begin_t = 1 115 | end_t = n_frames 116 | sample = { 117 | 'video': video_path, 118 | 'segment': [begin_t, end_t], 119 | 'n_frames': n_frames, 120 | 'video_id': video_names[i][:-14].split('/')[1] 121 | } 122 | if len(annotations) != 0: 123 | sample['label'] = class_to_idx[annotations[i]['label']] 124 | else: 125 | sample['label'] = -1 126 | 127 | if n_samples_for_each_video == 1: 128 | sample['frame_indices'] = list(range(1, n_frames + 1)) 129 | 
dataset.append(sample) 130 | else: 131 | if n_samples_for_each_video > 1: 132 | step = max(1, 133 | math.ceil((n_frames - 1 - sample_duration) / 134 | (n_samples_for_each_video - 1))) 135 | else: 136 | step = sample_duration 137 | for j in range(1, n_frames, step): 138 | sample_j = copy.deepcopy(sample) 139 | sample_j['frame_indices'] = list( 140 | range(j, min(n_frames + 1, j + sample_duration))) 141 | dataset.append(sample_j) 142 | 143 | return dataset, idx_to_class 144 | 145 | 146 | class Kinetics(data.Dataset): 147 | """ 148 | Args: 149 | root (string): Root directory path. 150 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 151 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 152 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 153 | and returns a transformed version 154 | target_transform (callable, optional): A function/transform that takes in the 155 | target and transforms it. 156 | loader (callable, optional): A function to load an video given its path and frame indices. 157 | Attributes: 158 | classes (list): List of the class names. 159 | class_to_idx (dict): Dict with items (class_name, class_index). 160 | imgs (list): List of (image path, class_index) tuples 161 | """ 162 | 163 | def __init__(self, 164 | root_path, 165 | annotation_path, 166 | subset, 167 | n_samples_for_each_video=1, 168 | spatial_transform=None, 169 | temporal_transform=None, 170 | target_transform=None, 171 | sample_duration=16, 172 | get_loader=get_default_video_loader, 173 | num_vid_samples=1): 174 | self.data, self.class_names = make_dataset( 175 | root_path, annotation_path, subset, n_samples_for_each_video, 176 | sample_duration) 177 | 178 | self.spatial_transform = spatial_transform 179 | self.temporal_transform = temporal_transform 180 | self.target_transform = target_transform 181 | self.loader = get_loader() 182 | self.num_vid_samples = num_vid_samples 183 | self.sample_duration = sample_duration 184 | 185 | def __getitem__(self, index): 186 | """ 187 | Args: 188 | index (int): Index 189 | Returns: 190 | tuple: (image, target) where target is class_index of the target class. 
191 | """ 192 | path = self.data[index]['video'] 193 | 194 | frame_indices = self.data[index]['frame_indices'] 195 | if self.temporal_transform is not None: 196 | frame_indices = self.temporal_transform(frame_indices) 197 | 198 | target = self.data[index] 199 | if self.target_transform is not None: 200 | target = self.target_transform(target) 201 | 202 | if self.num_vid_samples == 1: 203 | clip = self.loader(path, frame_indices) 204 | if self.spatial_transform is not None: 205 | self.spatial_transform.randomize_parameters() 206 | clip = [self.spatial_transform(img) for img in clip] 207 | clip = torch.stack(clip, 0) 208 | clip = clip.permute(1, 0, 2, 3) 209 | 210 | return clip, target 211 | else: 212 | clips = [] 213 | for i in range(self.num_vid_samples): 214 | start = random.randint(0, len(frame_indices) - self.sample_duration - 1) 215 | inds = frame_indices[start: start + self.sample_duration] 216 | 217 | clip = self.loader(path, inds) 218 | if self.spatial_transform is not None: 219 | self.spatial_transform.randomize_parameters() 220 | clip = [self.spatial_transform(img) for img in clip] 221 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 222 | clips.append(clip) 223 | 224 | return clips[0], clips[1], clips[2], clips[3], clips[4], target 225 | 226 | def __len__(self): 227 | return len(self.data) 228 | -------------------------------------------------------------------------------- /3D-ResNet/opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def parse_opts(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument( 7 | '--root_path', 8 | default='/root/data/ActivityNet', 9 | type=str, 10 | help='Root directory path of data') 11 | parser.add_argument( 12 | '--video_path', 13 | default='video_kinetics_jpg', 14 | type=str, 15 | help='Directory path of Videos') 16 | parser.add_argument( 17 | '--annotation_path', 18 | default='kinetics.json', 19 | type=str, 20 | help='Annotation file path') 21 | parser.add_argument( 22 | '--result_path', 23 | default='results', 24 | type=str, 25 | help='Result directory path') 26 | parser.add_argument( 27 | '--dataset', 28 | default='kinetics', 29 | type=str, 30 | help='Used dataset (kinetics | ucf101 | hmdb51)') 31 | parser.add_argument( 32 | '--n_classes', 33 | default=400, 34 | type=int, 35 | help= 36 | 'Number of classes (kinetics: 400, ucf101: 101, hmdb51: 51)' 37 | ) 38 | parser.add_argument( 39 | '--n_finetune_classes', 40 | default=400, 41 | type=int, 42 | help= 43 | 'Number of classes for fine-tuning. n_classes is set to the number when pretraining.' 44 | ) 45 | parser.add_argument( 46 | '--sample_size', 47 | default=112, 48 | type=int, 49 | help='Height and width of inputs') 50 | parser.add_argument( 51 | '--sample_duration', 52 | default=16, 53 | type=int, 54 | help='Temporal duration of inputs') 55 | parser.add_argument( 56 | '--temp_stride', 57 | default=1, 58 | type=int, 59 | help='Temporal stride') 60 | parser.add_argument( 61 | '--initial_scale', 62 | default=1.0, 63 | type=float, 64 | help='Initial scale for multiscale cropping') 65 | parser.add_argument( 66 | '--n_scales', 67 | default=5, 68 | type=int, 69 | help='Number of scales for multiscale cropping') 70 | parser.add_argument( 71 | '--scale_step', 72 | default=0.84081289641525, 73 | type=float, 74 | help='Scale step for multiscale cropping') 75 | parser.add_argument( 76 | '--train_crop', 77 | default='corner', 78 | type=str, 79 | help= 80 | 'Spatial cropping method in training. random is uniform. 
corner is selection from 4 corners and 1 center. (random | corner | center)' 81 | ) 82 | parser.add_argument( 83 | '--learning_rate', 84 | default=0.1, 85 | type=float, 86 | help= 87 | 'Initial learning rate (divided by 10 while training by lr scheduler)') 88 | parser.add_argument('--momentum', default=0.9, type=float, help='Momentum') 89 | parser.add_argument( 90 | '--dampening', default=0.9, type=float, help='dampening of SGD') 91 | parser.add_argument( 92 | '--weight_decay', default=1e-3, type=float, help='Weight Decay') 93 | parser.add_argument( 94 | '--mean_dataset', 95 | default='activitynet', 96 | type=str, 97 | help= 98 | 'dataset for mean values of mean subtraction (activitynet | kinetics)') 99 | parser.add_argument( 100 | '--no_mean_norm', 101 | action='store_true', 102 | help='If true, inputs are not normalized by mean.') 103 | parser.set_defaults(no_mean_norm=False) 104 | parser.add_argument( 105 | '--std_norm', 106 | action='store_true', 107 | help='If true, inputs are normalized by standard deviation.') 108 | parser.set_defaults(std_norm=False) 109 | parser.add_argument( 110 | '--nesterov', action='store_true', help='Nesterov momentum') 111 | parser.set_defaults(nesterov=False) 112 | parser.add_argument( 113 | '--optimizer', 114 | default='sgd', 115 | type=str, 116 | help='Currently only support SGD') 117 | parser.add_argument( 118 | '--lr_patience', 119 | default=10, 120 | type=int, 121 | help='Patience of LR scheduler. See documentation of ReduceLROnPlateau.' 122 | ) 123 | parser.add_argument( 124 | '--batch_size', default=128, type=int, help='Batch Size') 125 | parser.add_argument( 126 | '--n_epochs', 127 | default=200, 128 | type=int, 129 | help='Number of total epochs to run') 130 | parser.add_argument( 131 | '--begin_epoch', 132 | default=1, 133 | type=int, 134 | help= 135 | 'Training begins at this epoch. Previous trained model indicated by resume_path is loaded.' 
136 | ) 137 | parser.add_argument( 138 | '--lr_threshold', 139 | default=0.0001, 140 | type=float, 141 | help='LR scheduler threshold') 142 | parser.add_argument( 143 | '--n_val_samples', 144 | default=3, 145 | type=int, 146 | help='Number of validation samples for each activity') 147 | parser.add_argument( 148 | '--resume_path', 149 | default='', 150 | type=str, 151 | help='Save data (.pth) of previous training') 152 | parser.add_argument( 153 | '--label_folder', 154 | default='', 155 | type=str, 156 | help='Folder that stores label encodings for the movies') 157 | parser.add_argument( 158 | '--pretrain_path', default='', type=str, help='Pretrained model (.pth)') 159 | parser.add_argument( 160 | '--ft_begin_index', 161 | default=0, 162 | type=int, 163 | help='Begin block index of fine-tuning') 164 | parser.add_argument( 165 | '--no_train', 166 | action='store_true', 167 | help='If true, training is not performed.') 168 | parser.set_defaults(no_train=False) 169 | parser.add_argument( 170 | '--no_val', 171 | action='store_true', 172 | help='If true, validation is not performed.') 173 | parser.set_defaults(no_val=False) 174 | parser.add_argument( 175 | '--test', action='store_true', help='If true, test is performed.') 176 | parser.set_defaults(test=False) 177 | parser.add_argument( 178 | '--test_subset', 179 | default='val', 180 | type=str, 181 | help='Used subset in test (val | test)') 182 | parser.add_argument( 183 | '--scale_in_test', 184 | default=1.0, 185 | type=float, 186 | help='Spatial scale in test') 187 | parser.add_argument( 188 | '--crop_position_in_test', 189 | default='c', 190 | type=str, 191 | help='Cropping method (c | tl | tr | bl | br) in test') 192 | parser.add_argument( 193 | '--no_softmax_in_test', 194 | action='store_true', 195 | help='If true, output for each clip is not normalized using softmax.') 196 | parser.set_defaults(no_softmax_in_test=False) 197 | parser.add_argument( 198 | '--no_cuda', action='store_true', help='If true, cuda is not used.') 199 | parser.set_defaults(no_cuda=False) 200 | parser.add_argument( 201 | '--n_threads', 202 | default=4, 203 | type=int, 204 | help='Number of threads for multi-thread loading') 205 | parser.add_argument( 206 | '--checkpoint', 207 | default=10, 208 | type=int, 209 | help='Trained model is saved at every this epochs.') 210 | parser.add_argument( 211 | '--no_hflip', 212 | action='store_true', 213 | help='If true holizontal flipping is not performed.') 214 | parser.set_defaults(no_hflip=False) 215 | parser.add_argument( 216 | '--norm_value', 217 | default=1, 218 | type=int, 219 | help= 220 | 'If 1, range of inputs is [0-255]. 
If 255, range of inputs is [0-1].') 221 | parser.add_argument( 222 | '--model', 223 | default='resnet', 224 | type=str, 225 | help='(resnet') 226 | parser.add_argument( 227 | '--model_depth', 228 | default=18, 229 | type=int, 230 | help='Depth of resnet (10 | 18 | 34 | 50 | 101)') 231 | parser.add_argument( 232 | '--resnet_shortcut', 233 | default='B', 234 | type=str, 235 | help='Shortcut type of resnet (A | B)') 236 | parser.add_argument( 237 | '--manual_seed', default=1, type=int, help='Manually set random seed') 238 | parser.add_argument('--gpu', nargs='*', help='GPU id') 239 | parser.add_argument( 240 | '--shot', 241 | default=-1, 242 | type=int, 243 | help='Number of training examples per category') 244 | 245 | args = parser.parse_args() 246 | 247 | return args 248 | -------------------------------------------------------------------------------- /3D-ResNet/datasets/hmdb51.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | import random 10 | import numpy as np 11 | from temporal_transforms import LoopPadding 12 | 13 | from utils import load_value_file 14 | 15 | 16 | def pil_loader(path): 17 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 18 | with open(path, 'rb') as f: 19 | with Image.open(f) as img: 20 | return img.convert('RGB') 21 | 22 | 23 | def accimage_loader(path): 24 | try: 25 | import accimage 26 | return accimage.Image(path) 27 | except IOError: 28 | # Potentially a decoding problem, fall back to PIL.Image 29 | return pil_loader(path) 30 | 31 | 32 | def get_default_image_loader(): 33 | from torchvision import get_image_backend 34 | if get_image_backend() == 'accimage': 35 | return accimage_loader 36 | else: 37 | return pil_loader 38 | 39 | 40 | def video_loader(video_dir_path, frame_indices, image_loader): 41 | video = [] 42 | for i in frame_indices: 43 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 44 | if os.path.exists(image_path): 45 | video.append(image_loader(image_path)) 46 | else: 47 | return video 48 | 49 | return video 50 | 51 | 52 | def get_default_video_loader(): 53 | image_loader = get_default_image_loader() 54 | return functools.partial(video_loader, image_loader=image_loader) 55 | 56 | 57 | def load_annotation_data(data_file_path): 58 | with open(data_file_path, 'r') as data_file: 59 | return json.load(data_file) 60 | 61 | 62 | def get_class_labels(data): 63 | class_labels_map = {} 64 | index = 0 65 | for class_label in data['labels']: 66 | class_labels_map[class_label] = index 67 | index += 1 68 | return class_labels_map 69 | 70 | 71 | def get_video_names_and_annotations(data, subset): 72 | video_names = [] 73 | annotations = [] 74 | 75 | for key, value in data['database'].items(): 76 | this_subset = value['subset'] 77 | if this_subset == subset: 78 | label = value['annotations']['label'] 79 | video_names.append('{}/{}'.format(label, key)) 80 | annotations.append(value['annotations']) 81 | 82 | return video_names, annotations 83 | 84 | 85 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 86 | sample_duration, shot=-1): 87 | data = load_annotation_data(annotation_path) 88 | video_names, annotations = get_video_names_and_annotations(data, subset) 89 | class_to_idx = get_class_labels(data) 90 | instance_count = {} 91 | idx_to_class = {} 92 | for name, label 
in class_to_idx.items(): 93 | instance_count[name] = 0 94 | idx_to_class[label] = name 95 | 96 | dataset = [] 97 | for i in range(len(video_names)): 98 | if i % 1000 == 0: 99 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 100 | 101 | video_path = os.path.join(root_path, video_names[i]) 102 | if not os.path.exists(video_path): 103 | print(video_path) 104 | continue 105 | 106 | n_frames_file_path = os.path.join(video_path, 'n_frames') 107 | n_frames = int(load_value_file(n_frames_file_path)) 108 | if n_frames <= 0: 109 | continue 110 | 111 | if (shot > 0) and (instance_count[annotations[i]['label']] == shot): 112 | continue 113 | 114 | begin_t = 1 115 | end_t = n_frames 116 | sample = { 117 | 'video': video_path, 118 | 'segment': [begin_t, end_t], 119 | 'n_frames': n_frames, 120 | 'video_id': video_names[i].split('/')[1], 121 | } 122 | if len(annotations) != 0: 123 | sample['label'] = class_to_idx[annotations[i]['label']] 124 | instance_count[annotations[i]['label']] += 1 125 | else: 126 | sample['label'] = -1 127 | 128 | if n_samples_for_each_video == 1: 129 | sample['frame_indices'] = list(range(1, n_frames + 1)) 130 | dataset.append(sample) 131 | else: 132 | if n_samples_for_each_video > 1: 133 | step = max(1, 134 | math.ceil((n_frames - 1 - sample_duration) / 135 | (n_samples_for_each_video - 1))) 136 | else: 137 | step = sample_duration 138 | for j in range(1, n_samples_for_each_video, step): 139 | sample_j = copy.deepcopy(sample) 140 | sample_j['frame_indices'] = list( 141 | range(j, min(n_frames + 1, j + sample_duration))) 142 | dataset.append(sample_j) 143 | 144 | print(len(dataset)) 145 | 146 | return dataset, idx_to_class 147 | 148 | 149 | class HMDB51(data.Dataset): 150 | """ 151 | Args: 152 | root (string): Root directory path. 153 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 154 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 155 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 156 | and returns a transformed version 157 | target_transform (callable, optional): A function/transform that takes in the 158 | target and transforms it. 159 | loader (callable, optional): A function to load an video given its path and frame indices. 160 | Attributes: 161 | classes (list): List of the class names. 162 | class_to_idx (dict): Dict with items (class_name, class_index). 163 | imgs (list): List of (image path, class_index) tuples 164 | """ 165 | 166 | def __init__(self, 167 | root_path, 168 | annotation_path, 169 | subset, 170 | n_samples_for_each_video=1, 171 | spatial_transform=None, 172 | temporal_transform=None, 173 | target_transform=None, 174 | sample_duration=16, 175 | get_loader=get_default_video_loader, 176 | num_vid_samples=1, 177 | shot=-1): 178 | self.data, self.class_names = make_dataset( 179 | root_path, annotation_path, subset, n_samples_for_each_video, 180 | sample_duration, shot=shot) 181 | 182 | self.spatial_transform = spatial_transform 183 | self.temporal_transform = temporal_transform 184 | self.target_transform = target_transform 185 | self.loader = get_loader() 186 | self.num_vid_samples = num_vid_samples 187 | self.sample_duration = sample_duration 188 | 189 | def __getitem__(self, index): 190 | """ 191 | Args: 192 | index (int): Index 193 | Returns: 194 | tuple: (image, target) where target is class_index of the target class. 
195 | """ 196 | path = self.data[index]['video'] 197 | 198 | target = self.data[index] 199 | if self.target_transform is not None: 200 | target = self.target_transform(target) 201 | 202 | frame_indices = self.data[index]['frame_indices'] 203 | if self.temporal_transform is not None: 204 | frame_indices = self.temporal_transform(frame_indices) 205 | 206 | if self.num_vid_samples == 1: 207 | clip = self.loader(path, frame_indices) 208 | if self.spatial_transform is not None: 209 | self.spatial_transform.randomize_parameters() 210 | clip = [self.spatial_transform(img) for img in clip] 211 | clip = torch.stack(clip, 0) 212 | clip = clip.permute(1, 0, 2, 3) 213 | 214 | return clip, target 215 | else: 216 | clips = [] 217 | for i in range(self.num_vid_samples): 218 | start = random.randint(0, len(frame_indices) - self.sample_duration - 1) 219 | inds = frame_indices[start: start + self.sample_duration] 220 | 221 | clip = self.loader(path, inds) 222 | if self.spatial_transform is not None: 223 | self.spatial_transform.randomize_parameters() 224 | clip = [self.spatial_transform(img) for img in clip] 225 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 226 | clips.append(clip) 227 | 228 | return clips[0], clips[1], clips[2], clips[3], clips[4], target 229 | 230 | def __len__(self): 231 | return len(self.data) 232 | -------------------------------------------------------------------------------- /3D-ResNet/datasets/ucf101.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | import random 10 | import numpy as np 11 | from temporal_transforms import LoopPadding 12 | 13 | def load_value_file(file_path): 14 | with open(file_path, 'r') as input_file: 15 | value = float(input_file.read().rstrip('\n\r')) 16 | 17 | return value 18 | 19 | 20 | def pil_loader(path): 21 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 22 | with open(path, 'rb') as f: 23 | with Image.open(f) as img: 24 | return img.convert('RGB') 25 | 26 | 27 | def accimage_loader(path): 28 | try: 29 | import accimage 30 | return accimage.Image(path) 31 | except IOError: 32 | # Potentially a decoding problem, fall back to PIL.Image 33 | return pil_loader(path) 34 | 35 | 36 | def get_default_image_loader(): 37 | from torchvision import get_image_backend 38 | if get_image_backend() == 'accimage': 39 | return accimage_loader 40 | else: 41 | return pil_loader 42 | 43 | 44 | def video_loader(video_dir_path, frame_indices, image_loader): 45 | video = [] 46 | for i in frame_indices: 47 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 48 | if os.path.exists(image_path): 49 | video.append(image_loader(image_path)) 50 | else: 51 | return video 52 | 53 | return video 54 | 55 | 56 | def get_default_video_loader(): 57 | image_loader = get_default_image_loader() 58 | return functools.partial(video_loader, image_loader=image_loader) 59 | 60 | 61 | def load_annotation_data(data_file_path): 62 | with open(data_file_path, 'r') as data_file: 63 | return json.load(data_file) 64 | 65 | 66 | def get_class_labels(data): 67 | class_labels_map = {} 68 | index = 0 69 | for class_label in data['labels']: 70 | class_labels_map[class_label] = index 71 | index += 1 72 | return class_labels_map 73 | 74 | 75 | def get_video_names_and_annotations(data, subset): 76 | video_names = [] 77 | annotations = [] 78 
| 79 | for key, value in data['database'].items(): 80 | this_subset = value['subset'] 81 | if this_subset == subset: 82 | label = value['annotations']['label'] 83 | video_names.append('{}/{}'.format(label, key)) 84 | annotations.append(value['annotations']) 85 | 86 | return video_names, annotations 87 | 88 | 89 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 90 | sample_duration, shot=-1): 91 | data = load_annotation_data(annotation_path) 92 | video_names, annotations = get_video_names_and_annotations(data, subset) 93 | class_to_idx = get_class_labels(data) 94 | instance_count = {} 95 | idx_to_class = {} 96 | for name, label in class_to_idx.items(): 97 | instance_count[name] = 0 98 | idx_to_class[label] = name 99 | 100 | dataset = [] 101 | for i in range(len(video_names)): 102 | if i % 1000 == 0: 103 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 104 | 105 | video_path = os.path.join(root_path, video_names[i]) 106 | if not os.path.exists(video_path): 107 | continue 108 | 109 | n_frames_file_path = os.path.join(video_path, 'n_frames') 110 | n_frames = int(load_value_file(n_frames_file_path)) 111 | if n_frames <= 0: 112 | continue 113 | 114 | if (shot > 0) and (instance_count[annotations[i]['label']] == shot): 115 | continue 116 | 117 | begin_t = 1 118 | end_t = n_frames 119 | sample = { 120 | 'video': video_path, 121 | 'segment': [begin_t, end_t], 122 | 'n_frames': n_frames, 123 | 'video_id': video_names[i].split('/')[1] 124 | } 125 | if len(annotations) != 0: 126 | sample['label'] = class_to_idx[annotations[i]['label']] 127 | instance_count[annotations[i]['label']] += 1 128 | else: 129 | sample['label'] = -1 130 | 131 | if n_samples_for_each_video == 1: 132 | sample['frame_indices'] = list(range(1, n_frames + 1)) 133 | dataset.append(sample) 134 | else: 135 | if n_samples_for_each_video > 1: 136 | step = max(1, 137 | math.ceil((n_frames - 1 - sample_duration) / 138 | (n_samples_for_each_video - 1))) 139 | else: 140 | step = sample_duration 141 | for j in range(1, n_samples_for_each_video, step): 142 | sample_j = copy.deepcopy(sample) 143 | sample_j['frame_indices'] = list( 144 | range(j, min(n_frames + 1, j + sample_duration))) 145 | dataset.append(sample_j) 146 | 147 | return dataset, idx_to_class 148 | 149 | 150 | class UCF101(data.Dataset): 151 | """ 152 | Args: 153 | root (string): Root directory path. 154 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 155 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 156 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 157 | and returns a transformed version 158 | target_transform (callable, optional): A function/transform that takes in the 159 | target and transforms it. 160 | loader (callable, optional): A function to load an video given its path and frame indices. 161 | Attributes: 162 | classes (list): List of the class names. 163 | class_to_idx (dict): Dict with items (class_name, class_index). 
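# The shot argument of make_dataset above caps the number of training videos kept per
# class: instance_count tracks how many videos of each label have been accepted, and
# further videos of that label are skipped once the cap is reached. A toy sketch of the
# same counting logic on placeholder labels:
labels = ['run', 'run', 'jump', 'run', 'jump', 'jump']
shot = 2
instance_count = {}
kept = []
for lbl in labels:
    if instance_count.get(lbl, 0) == shot:   # this class already has `shot` examples
        continue
    kept.append(lbl)
    instance_count[lbl] = instance_count.get(lbl, 0) + 1
print(kept)  # ['run', 'run', 'jump', 'jump'] -> at most 2 examples per class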
164 | imgs (list): List of (image path, class_index) tuples 165 | """ 166 | 167 | def __init__(self, 168 | root_path, 169 | annotation_path, 170 | subset, 171 | n_samples_for_each_video=1, 172 | spatial_transform=None, 173 | temporal_transform=None, 174 | target_transform=None, 175 | sample_duration=16, 176 | get_loader=get_default_video_loader, 177 | label_folder=None, 178 | num_vid_samples=1, 179 | shot=-1): 180 | self.data, self.class_names = make_dataset( 181 | root_path, annotation_path, subset, n_samples_for_each_video, 182 | sample_duration, shot=shot) 183 | print(len(self.data)) 184 | 185 | self.spatial_transform = spatial_transform 186 | self.temporal_transform = temporal_transform 187 | self.target_transform = target_transform 188 | self.loader = get_loader() 189 | self.sample_duration = sample_duration 190 | self.num_vid_samples = num_vid_samples 191 | 192 | def __getitem__(self, index): 193 | """ 194 | Args: 195 | index (int): Index 196 | Returns: 197 | tuple: (image, target) where target is class_index of the target class. 198 | """ 199 | path = self.data[index]['video'] 200 | 201 | frame_indices = self.data[index]['frame_indices'] 202 | if self.temporal_transform is not None: 203 | frame_indices = self.temporal_transform(frame_indices) 204 | 205 | target = self.data[index] 206 | if self.target_transform is not None: 207 | target = self.target_transform(target) 208 | 209 | if self.num_vid_samples == 1: 210 | clip = self.loader(path, frame_indices) 211 | if self.spatial_transform is not None: 212 | self.spatial_transform.randomize_parameters() 213 | clip = [self.spatial_transform(img) for img in clip] 214 | clip = torch.stack(clip, 0) 215 | clip = clip.permute(1, 0, 2, 3) 216 | 217 | return clip, target 218 | else: 219 | clips = [] 220 | for i in range(self.num_vid_samples): 221 | start = random.randint(0, len(frame_indices) - self.sample_duration - 1) 222 | inds = frame_indices[start: start + self.sample_duration] 223 | 224 | clip = self.loader(path, inds) 225 | if self.spatial_transform is not None: 226 | self.spatial_transform.randomize_parameters() 227 | clip = [self.spatial_transform(img) for img in clip] 228 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 229 | clips.append(clip) 230 | 231 | return clips[0], clips[1], clips[2], clips[3], clips[4], target 232 | 233 | def __len__(self): 234 | return len(self.data) 235 | -------------------------------------------------------------------------------- /3D-ResNet/models/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = [ 9 | 'ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 10 | 'resnet152', 'resnet200' 11 | ] 12 | 13 | 14 | def conv3x3x3(in_planes, out_planes, stride=1): 15 | # 3x3x3 convolution with padding 16 | return nn.Conv3d( 17 | in_planes, 18 | out_planes, 19 | kernel_size=3, 20 | stride=stride, 21 | padding=1, 22 | bias=False) 23 | 24 | 25 | def downsample_basic_block(x, planes, stride): 26 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 27 | zero_pads = torch.Tensor( 28 | out.size(0), planes - out.size(1), out.size(2), out.size(3), 29 | out.size(4)).zero_() 30 | if isinstance(out.data, torch.cuda.FloatTensor): 31 | zero_pads = zero_pads.cuda() 32 | 33 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 34 | 35 | return out 36 | 37 | 38 | class 
BasicBlock(nn.Module): 39 | expansion = 1 40 | 41 | def __init__(self, inplanes, planes, stride=1, downsample=None): 42 | super(BasicBlock, self).__init__() 43 | self.conv1 = conv3x3x3(inplanes, planes, stride) 44 | self.bn1 = nn.BatchNorm3d(planes) 45 | self.relu = nn.ReLU(inplace=True) 46 | self.conv2 = conv3x3x3(planes, planes) 47 | self.bn2 = nn.BatchNorm3d(planes) 48 | self.downsample = downsample 49 | self.stride = stride 50 | 51 | def forward(self, x): 52 | residual = x 53 | 54 | out = self.conv1(x) 55 | out = self.bn1(out) 56 | out = self.relu(out) 57 | 58 | out = self.conv2(out) 59 | out = self.bn2(out) 60 | 61 | if self.downsample is not None: 62 | residual = self.downsample(x) 63 | 64 | out += residual 65 | out = self.relu(out) 66 | 67 | return out 68 | 69 | 70 | class Bottleneck(nn.Module): 71 | expansion = 4 72 | 73 | def __init__(self, inplanes, planes, stride=1, downsample=None): 74 | super(Bottleneck, self).__init__() 75 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 76 | self.bn1 = nn.BatchNorm3d(planes) 77 | self.conv2 = nn.Conv3d( 78 | planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 79 | self.bn2 = nn.BatchNorm3d(planes) 80 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 81 | self.bn3 = nn.BatchNorm3d(planes * 4) 82 | self.relu = nn.ReLU(inplace=True) 83 | self.downsample = downsample 84 | self.stride = stride 85 | 86 | def forward(self, x): 87 | residual = x 88 | 89 | out = self.conv1(x) 90 | out = self.bn1(out) 91 | out = self.relu(out) 92 | 93 | out = self.conv2(out) 94 | out = self.bn2(out) 95 | out = self.relu(out) 96 | 97 | out = self.conv3(out) 98 | out = self.bn3(out) 99 | 100 | if self.downsample is not None: 101 | residual = self.downsample(x) 102 | 103 | out += residual 104 | out = self.relu(out) 105 | 106 | return out 107 | 108 | 109 | class ResNet(nn.Module): 110 | 111 | def __init__(self, 112 | block, 113 | layers, 114 | sample_size, 115 | sample_duration, 116 | shortcut_type='B', 117 | num_classes=400, 118 | input_chan=3): 119 | self.inplanes = 64 120 | super(ResNet, self).__init__() 121 | self.conv1 = nn.Conv3d( 122 | input_chan, 123 | 64, 124 | kernel_size=7, 125 | stride=(1, 2, 2), 126 | padding=(3, 3, 3), 127 | bias=False) 128 | self.bn1 = nn.BatchNorm3d(64) 129 | self.relu = nn.ReLU(inplace=True) 130 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 131 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type) 132 | self.layer2 = self._make_layer( 133 | block, 128, layers[1], shortcut_type, stride=2) 134 | self.layer3 = self._make_layer( 135 | block, 256, layers[2], shortcut_type, stride=2) 136 | self.layer4 = self._make_layer( 137 | block, 512, layers[3], shortcut_type, stride=2) 138 | last_duration = int(math.ceil(sample_duration / 16)) 139 | last_size = int(math.ceil(sample_size / 32)) 140 | self.avgpool = nn.AvgPool3d( 141 | (last_duration, last_size, last_size), stride=1) 142 | self.fc = nn.Linear(512 * block.expansion, num_classes) 143 | 144 | for m in self.modules(): 145 | if isinstance(m, nn.Conv3d): 146 | m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out') 147 | elif isinstance(m, nn.BatchNorm3d): 148 | m.weight.data.fill_(1) 149 | m.bias.data.zero_() 150 | 151 | def freeze_layers(self, freeze_at): 152 | if freeze_at < 0: 153 | return 154 | else: 155 | for p in self.bn1.parameters(): 156 | p.requires_grad = False 157 | 158 | for stage in range(freeze_at): 159 | print("freezing at %d" % stage) 160 | if stage == 0: 161 | m = 
self.conv1 162 | else: 163 | m = getattr(self, "layer" + str(stage)) 164 | for p in m.parameters(): 165 | p.requires_grad = False 166 | 167 | 168 | 169 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 170 | downsample = None 171 | if stride != 1 or self.inplanes != planes * block.expansion: 172 | if shortcut_type == 'A': 173 | downsample = partial( 174 | downsample_basic_block, 175 | planes=planes * block.expansion, 176 | stride=stride) 177 | else: 178 | downsample = nn.Sequential( 179 | nn.Conv3d( 180 | self.inplanes, 181 | planes * block.expansion, 182 | kernel_size=1, 183 | stride=stride, 184 | bias=False), nn.BatchNorm3d(planes * block.expansion)) 185 | 186 | layers = [] 187 | layers.append(block(self.inplanes, planes, stride, downsample)) 188 | self.inplanes = planes * block.expansion 189 | for i in range(1, blocks): 190 | layers.append(block(self.inplanes, planes)) 191 | 192 | return nn.Sequential(*layers) 193 | 194 | def forward(self, x): 195 | x = self.conv1(x) 196 | x = self.bn1(x) 197 | x = self.relu(x) 198 | x = self.maxpool(x) 199 | 200 | x = self.layer1(x) 201 | x = self.layer2(x) 202 | x = self.layer3(x) 203 | x = self.layer4(x) 204 | 205 | x = self.avgpool(x) 206 | 207 | x = x.view(x.size(0), -1) 208 | 209 | if self.fc is not None: 210 | x = self.fc(x) 211 | 212 | return x 213 | 214 | 215 | def get_fine_tuning_parameters(model, ft_begin_index): 216 | if ft_begin_index == 0: 217 | return model.parameters() 218 | 219 | ft_module_names = [] 220 | for i in range(ft_begin_index, 5): 221 | ft_module_names.append('layer{}'.format(i)) 222 | ft_module_names.append('fc') 223 | 224 | parameters = [] 225 | for k, v in model.named_parameters(): 226 | for ft_module in ft_module_names: 227 | if ft_module in k: 228 | parameters.append({'params': v}) 229 | break 230 | else: 231 | parameters.append({'params': v, 'lr': 0.0}) 232 | 233 | return parameters 234 | 235 | 236 | def resnet10(**kwargs): 237 | """Constructs a ResNet-18 model. 238 | """ 239 | model = ResNet(BasicBlock, [1, 1, 1, 1], **kwargs) 240 | return model 241 | 242 | 243 | def resnet18(**kwargs): 244 | """Constructs a ResNet-18 model. 245 | """ 246 | model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) 247 | return model 248 | 249 | 250 | def resnet34(**kwargs): 251 | """Constructs a ResNet-34 model. 252 | """ 253 | model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) 254 | return model 255 | 256 | 257 | def resnet50(**kwargs): 258 | """Constructs a ResNet-50 model. 259 | """ 260 | model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) 261 | return model 262 | 263 | 264 | def resnet101(**kwargs): 265 | """Constructs a ResNet-101 model. 266 | """ 267 | model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) 268 | return model 269 | 270 | 271 | def resnet152(**kwargs): 272 | """Constructs a ResNet-101 model. 273 | """ 274 | model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) 275 | return model 276 | 277 | 278 | def resnet200(**kwargs): 279 | """Constructs a ResNet-101 model. 
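# A minimal fine-tuning sketch using the factory functions above, mirroring the
# ft_begin_index=4 configuration that appears in nohup.out later in this repository:
# everything below layer4 is frozen and only layer4/fc receive a non-zero learning rate.
# The head size, clip shape and hyper-parameters here are placeholders.
import torch

model = resnet18(sample_size=112, sample_duration=16, shortcut_type='B', num_classes=400)
model.fc = torch.nn.Linear(512 * BasicBlock.expansion, 101)  # replace the 400-way Kinetics head
model.freeze_layers(4)                                       # freezes conv1/bn1 and layer1..layer3

params = get_fine_tuning_parameters(model, ft_begin_index=4) # lr=0.0 groups for frozen modules
optimizer = torch.optim.SGD(params, lr=0.001, momentum=0.9, weight_decay=5e-5)

x = torch.randn(2, 3, 16, 112, 112)  # (batch, channels, frames, height, width)
print(model(x).shape)                # torch.Size([2, 101])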
280 | """ 281 | model = ResNet(Bottleneck, [3, 24, 36, 3], **kwargs) 282 | return model 283 | -------------------------------------------------------------------------------- /LocalAggregation/src/models/resnet3d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = [ 9 | 'ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 10 | 'resnet152', 'resnet200' 11 | ] 12 | 13 | 14 | def conv3x3x3(in_planes, out_planes, stride=1): 15 | # 3x3x3 convolution with padding 16 | return nn.Conv3d( 17 | in_planes, 18 | out_planes, 19 | kernel_size=3, 20 | stride=stride, 21 | padding=1, 22 | bias=False) 23 | 24 | 25 | def downsample_basic_block(x, planes, stride): 26 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 27 | zero_pads = torch.Tensor( 28 | out.size(0), planes - out.size(1), out.size(2), out.size(3), 29 | out.size(4)).zero_() 30 | if isinstance(out.data, torch.cuda.FloatTensor): 31 | zero_pads = zero_pads.cuda() 32 | 33 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 34 | 35 | return out 36 | 37 | 38 | class BasicBlock(nn.Module): 39 | expansion = 1 40 | 41 | def __init__(self, inplanes, planes, stride=1, downsample=None): 42 | super(BasicBlock, self).__init__() 43 | self.conv1 = conv3x3x3(inplanes, planes, stride) 44 | self.bn1 = nn.BatchNorm3d(planes) 45 | self.relu = nn.ReLU(inplace=True) 46 | self.conv2 = conv3x3x3(planes, planes) 47 | self.bn2 = nn.BatchNorm3d(planes) 48 | self.downsample = downsample 49 | self.stride = stride 50 | 51 | def forward(self, x): 52 | residual = x 53 | 54 | out = self.conv1(x) 55 | out = self.bn1(out) 56 | out = self.relu(out) 57 | 58 | out = self.conv2(out) 59 | out = self.bn2(out) 60 | 61 | if self.downsample is not None: 62 | residual = self.downsample(x) 63 | 64 | out += residual 65 | out = self.relu(out) 66 | 67 | return out 68 | 69 | 70 | class Bottleneck(nn.Module): 71 | expansion = 4 72 | 73 | def __init__(self, inplanes, planes, stride=1, downsample=None): 74 | super(Bottleneck, self).__init__() 75 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 76 | self.bn1 = nn.BatchNorm3d(planes) 77 | self.conv2 = nn.Conv3d( 78 | planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 79 | self.bn2 = nn.BatchNorm3d(planes) 80 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 81 | self.bn3 = nn.BatchNorm3d(planes * 4) 82 | self.relu = nn.ReLU(inplace=True) 83 | self.downsample = downsample 84 | self.stride = stride 85 | 86 | def forward(self, x): 87 | residual = x 88 | 89 | out = self.conv1(x) 90 | out = self.bn1(out) 91 | out = self.relu(out) 92 | 93 | out = self.conv2(out) 94 | out = self.bn2(out) 95 | out = self.relu(out) 96 | 97 | out = self.conv3(out) 98 | out = self.bn3(out) 99 | 100 | if self.downsample is not None: 101 | residual = self.downsample(x) 102 | 103 | out += residual 104 | out = self.relu(out) 105 | 106 | return out 107 | 108 | 109 | class ResNet(nn.Module): 110 | 111 | def __init__(self, 112 | block, 113 | layers, 114 | sample_size, 115 | sample_duration, 116 | shortcut_type='B', 117 | num_classes=400, 118 | input_chan=3): 119 | self.inplanes = 64 120 | super(ResNet, self).__init__() 121 | self.conv1 = nn.Conv3d( 122 | input_chan, 123 | 64, 124 | kernel_size=7, 125 | stride=(1, 2, 2), 126 | padding=(3, 3, 3), 127 | bias=False) 128 | self.bn1 = 
nn.BatchNorm3d(64) 129 | self.relu = nn.ReLU(inplace=True) 130 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 131 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type) 132 | self.layer2 = self._make_layer( 133 | block, 128, layers[1], shortcut_type, stride=2) 134 | self.layer3 = self._make_layer( 135 | block, 256, layers[2], shortcut_type, stride=2) 136 | self.layer4 = self._make_layer( 137 | block, 512, layers[3], shortcut_type, stride=2) 138 | last_duration = int(math.ceil(sample_duration / 16)) 139 | last_size = int(math.ceil(sample_size / 32)) 140 | self.avgpool = nn.AvgPool3d( 141 | (last_duration, last_size, last_size), stride=1) 142 | self.fc = nn.Linear(512 * block.expansion, num_classes) 143 | 144 | for m in self.modules(): 145 | if isinstance(m, nn.Conv3d): 146 | nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu') 147 | elif isinstance(m, nn.Linear): 148 | nn.init.xavier_uniform_(m.weight) 149 | nn.init.constant_(m.bias, 0) 150 | elif isinstance(m, (nn.BatchNorm3d, nn.GroupNorm)): 151 | nn.init.constant_(m.weight, 1) 152 | nn.init.constant_(m.bias, 0) 153 | 154 | def freeze_layers(self, freeze_at): 155 | if freeze_at < 0: 156 | return 157 | else: 158 | for p in self.bn1.parameters(): 159 | p.requires_grad = False 160 | 161 | for stage in range(freeze_at): 162 | print("freezing at %d" % stage) 163 | if stage == 0: 164 | m = self.conv1 165 | else: 166 | m = getattr(self, "layer" + str(stage)) 167 | for p in m.parameters(): 168 | p.requires_grad = False 169 | 170 | 171 | 172 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 173 | downsample = None 174 | if stride != 1 or self.inplanes != planes * block.expansion: 175 | if shortcut_type == 'A': 176 | downsample = partial( 177 | downsample_basic_block, 178 | planes=planes * block.expansion, 179 | stride=stride) 180 | else: 181 | downsample = nn.Sequential( 182 | nn.Conv3d( 183 | self.inplanes, 184 | planes * block.expansion, 185 | kernel_size=1, 186 | stride=stride, 187 | bias=False), nn.BatchNorm3d(planes * block.expansion)) 188 | 189 | layers = [] 190 | layers.append(block(self.inplanes, planes, stride, downsample)) 191 | self.inplanes = planes * block.expansion 192 | for i in range(1, blocks): 193 | layers.append(block(self.inplanes, planes)) 194 | 195 | return nn.Sequential(*layers) 196 | 197 | def forward(self, x): 198 | x = self.conv1(x) 199 | x = self.bn1(x) 200 | x = self.relu(x) 201 | x = self.maxpool(x) 202 | 203 | x = self.layer1(x) 204 | x = self.layer2(x) 205 | x = self.layer3(x) 206 | x = self.layer4(x) 207 | 208 | x = self.avgpool(x) 209 | 210 | x = x.view(x.size(0), -1) 211 | 212 | if self.fc is not None: 213 | x = self.fc(x) 214 | 215 | return x 216 | 217 | 218 | def get_fine_tuning_parameters(model, ft_begin_index): 219 | if ft_begin_index == 0: 220 | return model.parameters() 221 | 222 | ft_module_names = [] 223 | for i in range(ft_begin_index, 5): 224 | ft_module_names.append('layer{}'.format(i)) 225 | ft_module_names.append('fc') 226 | 227 | parameters = [] 228 | for k, v in model.named_parameters(): 229 | for ft_module in ft_module_names: 230 | if ft_module in k: 231 | parameters.append({'params': v}) 232 | break 233 | else: 234 | parameters.append({'params': v, 'lr': 0.0}) 235 | 236 | return parameters 237 | 238 | 239 | def resnet10(**kwargs): 240 | """Constructs a ResNet-18 model. 
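# The average-pooling kernel above is derived from the clip length and crop size as
# ceil(sample_duration / 16) and ceil(sample_size / 32). For instance, the 64-frame,
# 112-pixel configuration used in nohup.out yields a (4, 4, 4) kernel:
import math

def avgpool_kernel(sample_duration, sample_size):
    last_duration = int(math.ceil(sample_duration / 16))
    last_size = int(math.ceil(sample_size / 32))
    return (last_duration, last_size, last_size)

print(avgpool_kernel(16, 112))  # (1, 4, 4)
print(avgpool_kernel(64, 112))  # (4, 4, 4) -- matches AvgPool3d(kernel_size=(4, 4, 4)) in the log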
241 | """ 242 | model = ResNet(BasicBlock, [1, 1, 1, 1], **kwargs) 243 | return model 244 | 245 | 246 | def resnet18(**kwargs): 247 | """Constructs a ResNet-18 model. 248 | """ 249 | model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) 250 | return model 251 | 252 | 253 | def resnet34(**kwargs): 254 | """Constructs a ResNet-34 model. 255 | """ 256 | model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) 257 | return model 258 | 259 | 260 | def resnet50(**kwargs): 261 | """Constructs a ResNet-50 model. 262 | """ 263 | model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) 264 | return model 265 | 266 | 267 | def resnet101(**kwargs): 268 | """Constructs a ResNet-101 model. 269 | """ 270 | model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) 271 | return model 272 | 273 | 274 | def resnet152(**kwargs): 275 | """Constructs a ResNet-101 model. 276 | """ 277 | model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) 278 | return model 279 | 280 | 281 | def resnet200(**kwargs): 282 | """Constructs a ResNet-101 model. 283 | """ 284 | model = ResNet(Bottleneck, [3, 24, 36, 3], **kwargs) 285 | return model 286 | -------------------------------------------------------------------------------- /LocalAggregation/src/datasets/kinetics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | import re 10 | import numpy as np 11 | from src.objectives.localagg import MemoryBank 12 | 13 | 14 | def load_value_file(file_path): 15 | with open(file_path, 'r') as input_file: 16 | value = float(input_file.read().rstrip('\n\r')) 17 | 18 | return value 19 | 20 | 21 | def pil_loader(path): 22 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 23 | with open(path, 'rb') as f: 24 | with Image.open(f) as img: 25 | return img.convert('RGB') 26 | 27 | 28 | def accimage_loader(path): 29 | try: 30 | import accimage 31 | return accimage.Image(path) 32 | except IOError: 33 | # Potentially a decoding problem, fall back to PIL.Image 34 | return pil_loader(path) 35 | 36 | 37 | def get_default_image_loader(): 38 | from torchvision import get_image_backend 39 | if get_image_backend() == 'accimage': 40 | return accimage_loader 41 | else: 42 | return pil_loader 43 | 44 | 45 | def video_loader(video_dir_path, frame_indices, image_loader): 46 | video = [] 47 | format = 'image_{:05d}.jpg' 48 | if not os.path.exists(os.path.join(video_dir_path, format.format(1))): 49 | format = "frame{:d}.jpg" 50 | frame_indices = [x - 1 for x in frame_indices] 51 | for i in frame_indices: 52 | image_path = os.path.join(video_dir_path, format.format(i)) 53 | if os.path.exists(image_path): 54 | video.append(image_loader(image_path)) 55 | else: 56 | return video 57 | 58 | return video 59 | 60 | 61 | def get_default_video_loader(): 62 | image_loader = get_default_image_loader() 63 | return functools.partial(video_loader, image_loader=image_loader) 64 | 65 | 66 | def load_annotation_data(data_file_path): 67 | with open(data_file_path, 'r') as data_file: 68 | return json.load(data_file) 69 | 70 | 71 | def get_class_labels(data): 72 | class_labels_map = {} 73 | index = 0 74 | for class_label in data['labels']: 75 | class_labels_map[class_label] = index 76 | index += 1 77 | return class_labels_map 78 | 79 | 80 | def get_video_names_and_annotations(data, subset): 81 | video_names = [] 82 | annotations = [] 83 | 84 | for key, value in 
data['database'].items(): 85 | this_subset = value['subset'] 86 | if this_subset == subset: 87 | if subset == 'validation': 88 | key = re.sub("_\d+", "", key) 89 | if subset == 'testing': 90 | video_names.append('test/{}'.format(key)) 91 | else: 92 | label = value['annotations']['label'] 93 | video_names.append('{}/{}/{}'.format(subset, label.replace(" ", "_"), key)) 94 | annotations.append(value['annotations']) 95 | 96 | return video_names, annotations 97 | 98 | 99 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 100 | sample_duration, load_fvs=False, fv_path=None): 101 | data = load_annotation_data(annotation_path) 102 | 103 | video_names, annotations = get_video_names_and_annotations(data, subset) 104 | class_to_idx = get_class_labels(data) 105 | idx_to_class = {} 106 | for name, label in class_to_idx.items(): 107 | idx_to_class[label] = name 108 | 109 | dataset = [] 110 | fvs = [] 111 | for i in range(len(video_names)): 112 | if i % 1000 == 0: 113 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 114 | 115 | video_path = os.path.join(root_path, video_names[i]) 116 | if not os.path.exists(video_path): 117 | video_path = video_path.replace("/training/", "/validation/") 118 | video_names[i] = video_names[i].replace("training/", "validation/") 119 | if not os.path.exists(video_path): 120 | video_path = video_path.replace("/validation/", "/test/") 121 | video_names[i] = video_names[i].replace("validation/", "test/") 122 | if not os.path.exists(video_path): 123 | continue 124 | 125 | n_frames_file_path = os.path.join(video_path, 'n_frames') 126 | n_frames = int(load_value_file(n_frames_file_path)) 127 | if n_frames <= 0: 128 | continue 129 | 130 | if load_fvs: 131 | fv_vid_path = os.path.join(fv_path, video_names[i]) + ".dat" 132 | if os.path.exists(fv_vid_path): 133 | fv = torch.load(fv_vid_path) 134 | fvs.append(fv.cpu().squeeze()) 135 | else: 136 | continue 137 | 138 | begin_t = 1 139 | end_t = n_frames 140 | sample = { 141 | 'video': video_path, 142 | 'n_frames': n_frames, 143 | } 144 | if len(annotations) != 0: 145 | sample['label'] = class_to_idx[annotations[i]['label']] 146 | else: 147 | sample['label'] = -1 148 | 149 | if n_samples_for_each_video == 1: 150 | sample['frame_indices'] = list(range(1, n_frames + 1)) 151 | dataset.append(sample) 152 | else: 153 | if n_samples_for_each_video > 1: 154 | step = max(1, 155 | math.ceil((n_frames - 1 - sample_duration) / 156 | (n_samples_for_each_video - 1))) 157 | else: 158 | step = sample_duration 159 | for j in range(1, n_frames, step): 160 | if (j + sample_duration) > n_frames: 161 | break 162 | sample_j = copy.deepcopy(sample) 163 | sample_j['frame_indices'] = list( 164 | range(j, min(n_frames + 1, j + sample_duration))) 165 | sample_j['n_frames'] = len(sample_j['frame_indices']) 166 | dataset.append(sample_j) 167 | 168 | if load_fvs: 169 | fvs = torch.stack(fvs) 170 | 171 | return dataset, idx_to_class, fvs 172 | 173 | 174 | class Kinetics(data.Dataset): 175 | """ 176 | Args: 177 | root (string): Root directory path. 178 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 179 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 180 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 181 | and returns a transformed version 182 | target_transform (callable, optional): A function/transform that takes in the 183 | target and transforms it. 
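# A minimal usage sketch for the Kinetics dataset above. Unlike the supervised
# HMDB51/UCF101 datasets, __getitem__ also returns the sample index, which lets the
# training loop address per-video entries in the memory bank of the local aggregation
# objective (MemoryBank is imported at the top of this file). Paths are placeholders;
# the transforms come from this package.
from src.datasets.spatial_transforms import Compose, MultiScaleRandomCrop, ToTensor

spatial = Compose([MultiScaleRandomCrop(scales=[1.0, 0.875], size=112), ToTensor(255)])
dataset = Kinetics(
    root_path='/path/to/kinetics_jpeg',        # hypothetical path to extracted frames
    annotation_path='/path/to/kinetics.json',  # hypothetical annotation file
    train=True,
    spatial_transform=spatial,
    temporal_transform=None,                   # None keeps the full frame range of the video
    sample_duration=16)

index, clip, target = dataset[0]
print(index, clip.shape)  # clip is (3, n_frames, 112, 112) after the permute above;
                          # index is what the objective uses to look up this video's embedding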
184 | loader (callable, optional): A function to load an video given its path and frame indices. 185 | Attributes: 186 | classes (list): List of the class names. 187 | class_to_idx (dict): Dict with items (class_name, class_index). 188 | imgs (list): List of (image path, class_index) tuples 189 | """ 190 | 191 | def __init__(self, 192 | root_path=None, 193 | annotation_path=None, 194 | train=True, 195 | n_samples_for_each_video=1, 196 | spatial_transform=None, 197 | temporal_transform=None, 198 | target_transform=None, 199 | sample_duration=16, 200 | get_loader=get_default_video_loader, 201 | load_fvs=False, 202 | fv_path=None): 203 | 204 | subset = 'validation' 205 | if train: 206 | subset = 'training' 207 | 208 | self.data, self.class_names, fvs = make_dataset( 209 | root_path, annotation_path, subset, n_samples_for_each_video, 210 | sample_duration, load_fvs=load_fvs, fv_path=fv_path) 211 | 212 | self.spatial_transform = spatial_transform 213 | self.temporal_transform = temporal_transform 214 | self.target_transform = target_transform 215 | self.loader = get_loader() 216 | 217 | self.fvs = fvs 218 | 219 | def __getitem__(self, index): 220 | """ 221 | Args: 222 | index (int): Index 223 | Returns: 224 | tuple: (image, target) where target is class_index of the target class. 225 | """ 226 | path = self.data[index]['video'] 227 | 228 | frame_indices = self.data[index]['frame_indices'] 229 | 230 | if self.temporal_transform is not None: 231 | frame_indices = self.temporal_transform(frame_indices) 232 | 233 | clip = self.loader(path, frame_indices) 234 | if len(clip) == 0: 235 | print(path) 236 | print(frame_indices) 237 | 238 | if self.spatial_transform is not None: 239 | self.spatial_transform.randomize_parameters() 240 | clip = [self.spatial_transform(img) for img in clip] 241 | clip = torch.stack(clip, 0) 242 | clip = clip.permute(1, 0, 2, 3) 243 | 244 | target = self.data[index] 245 | if self.target_transform is not None: 246 | target = self.target_transform(target) 247 | 248 | return index, clip, target 249 | 250 | def __len__(self): 251 | return len(self.data) 252 | -------------------------------------------------------------------------------- /3D-ResNet/nohup.out: -------------------------------------------------------------------------------- 1 | ucf_fromkin_ours_64fr_5scale_split3 2 | Namespace(annotation_path='/data2/ptokmako/ucfTrainTestlist/ucf101_03.json', arch='resnet-18', batch_size=128, begin_epoch=1, checkpoint=10, crop_position_in_test='c', dampening=0.9, dataset='ucf101', ft_begin_index=4, gpu=['0,1,8,9'], initial_scale=1.0, label_folder='', learning_rate=0.001, lr_patience=5, lr_threshold=0.0001, manual_seed=1, mean=[114.7748, 107.7354, 99.475], mean_dataset='activitynet', model='resnet', model_depth=18, momentum=0.9, n_classes=400, n_epochs=200, n_finetune_classes=101, n_scales=5, n_threads=8, n_val_samples=3, nesterov=False, no_cuda=False, no_hflip=False, no_mean_norm=False, no_softmax_in_test=False, no_train=False, no_val=False, norm_value=1, optimizer='sgd', pretrain_path='/data2/ptokmako/results/kinetics_64fr/save_200.pth', resnet_shortcut='B', result_path='/data2/ptokmako/results/ucf_fromkin_ours_64fr_5scale_split3', resume_path='', root_path='/root/data/ActivityNet', sample_duration=64, sample_size=112, scale_in_test=1.0, scale_step=0.84081289641525, scales=[1.0, 0.84081289641525, 0.706966326778202, 0.5944264048864302, 0.4998013871982635], shot=-1, std=[38.7568578, 37.88248729, 40.02898126], std_norm=False, temp_stride=1, test=False, test_subset='val', 
train_crop='random', video_path='/scratch/ptokmako/UCF_jpeg/', weight_decay=5e-05) 3 | loading pretrained model /data2/ptokmako/results/kinetics_64fr/save_200.pth 4 | freezing at 0 5 | freezing at 1 6 | freezing at 2 7 | freezing at 3 8 | DataParallel( 9 | (module): ResNet( 10 | (conv1): Conv3d(3, 64, kernel_size=(7, 7, 7), stride=(1, 2, 2), padding=(3, 3, 3), bias=False) 11 | (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 12 | (relu): ReLU(inplace=True) 13 | (maxpool): MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1, dilation=1, ceil_mode=False) 14 | (layer1): Sequential( 15 | (0): BasicBlock( 16 | (conv1): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 17 | (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 18 | (relu): ReLU(inplace=True) 19 | (conv2): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 20 | (bn2): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 21 | ) 22 | (1): BasicBlock( 23 | (conv1): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 24 | (bn1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 25 | (relu): ReLU(inplace=True) 26 | (conv2): Conv3d(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 27 | (bn2): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 28 | ) 29 | ) 30 | (layer2): Sequential( 31 | (0): BasicBlock( 32 | (conv1): Conv3d(64, 128, kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1), bias=False) 33 | (bn1): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 34 | (relu): ReLU(inplace=True) 35 | (conv2): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 36 | (bn2): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 37 | (downsample): Sequential( 38 | (0): Conv3d(64, 128, kernel_size=(1, 1, 1), stride=(2, 2, 2), bias=False) 39 | (1): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 40 | ) 41 | ) 42 | (1): BasicBlock( 43 | (conv1): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 44 | (bn1): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 45 | (relu): ReLU(inplace=True) 46 | (conv2): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 47 | (bn2): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 48 | ) 49 | ) 50 | (layer3): Sequential( 51 | (0): BasicBlock( 52 | (conv1): Conv3d(128, 256, kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1), bias=False) 53 | (bn1): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 54 | (relu): ReLU(inplace=True) 55 | (conv2): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 56 | (bn2): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 57 | (downsample): Sequential( 58 | (0): Conv3d(128, 256, kernel_size=(1, 1, 1), stride=(2, 2, 2), bias=False) 59 | (1): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 60 | ) 61 | ) 62 | (1): BasicBlock( 63 | (conv1): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 64 | (bn1): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, 
track_running_stats=True) 65 | (relu): ReLU(inplace=True) 66 | (conv2): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 67 | (bn2): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 68 | ) 69 | ) 70 | (layer4): Sequential( 71 | (0): BasicBlock( 72 | (conv1): Conv3d(256, 512, kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1), bias=False) 73 | (bn1): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 74 | (relu): ReLU(inplace=True) 75 | (conv2): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 76 | (bn2): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 77 | (downsample): Sequential( 78 | (0): Conv3d(256, 512, kernel_size=(1, 1, 1), stride=(2, 2, 2), bias=False) 79 | (1): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 80 | ) 81 | ) 82 | (1): BasicBlock( 83 | (conv1): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 84 | (bn1): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 85 | (relu): ReLU(inplace=True) 86 | (conv2): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False) 87 | (bn2): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 88 | ) 89 | ) 90 | (avgpool): AvgPool3d(kernel_size=(4, 4, 4), stride=1, padding=0) 91 | (fc): Linear(in_features=512, out_features=101, bias=True) 92 | ) 93 | ) 94 | dataset loading [0/9624] 95 | dataset loading [1000/9624] 96 | dataset loading [2000/9624] 97 | dataset loading [3000/9624] 98 | dataset loading [4000/9624] 99 | dataset loading [5000/9624] 100 | dataset loading [6000/9624] 101 | dataset loading [7000/9624] 102 | dataset loading [8000/9624] 103 | dataset loading [9000/9624] 104 | 9624 105 | dataset loading [0/3696] 106 | dataset loading [1000/3696] 107 | dataset loading [2000/3696] 108 | dataset loading [3000/3696] 109 | 3791 110 | run 111 | train at epoch 1 112 | THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1573049301898/work/aten/src/THC/THCCachingHostAllocator.cpp line=278 error=2 : out of memory 113 | Traceback (most recent call last): 114 | File "main.py", line 169, in 115 | train_logger, train_batch_logger, writer) 116 | File "/home/ptokmako/src/video_cluster/3D-ResNet/train.py", line 36, in train_epoch 117 | outputs = model(inputs.cuda()) 118 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__ 119 | result = self.forward(*input, **kwargs) 120 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 152, in forward 121 | outputs = self.parallel_apply(replicas, inputs, kwargs) 122 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 162, in parallel_apply 123 | return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) 124 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply 125 | output.reraise() 126 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/_utils.py", line 385, in reraise 127 | raise self.exc_type(msg) 128 | RuntimeError: Caught RuntimeError in replica 0 on device 0. 
129 | Original Traceback (most recent call last): 130 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker 131 | output = module(*input, **kwargs) 132 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__ 133 | result = self.forward(*input, **kwargs) 134 | File "/home/ptokmako/src/video_cluster/3D-ResNet/models/resnet.py", line 195, in forward 135 | x = self.conv1(x) 136 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__ 137 | result = self.forward(*input, **kwargs) 138 | File "/home/ptokmako/miniconda2/envs/tubercnn/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 480, in forward 139 | self.padding, self.dilation, self.groups) 140 | RuntimeError: CUDA error: invalid argument 141 | 142 | -------------------------------------------------------------------------------- /LocalAggregation/src/datasets/spatial_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | from PIL import Image, ImageOps 3 | import torchvision.transforms.functional as F 4 | from torchvision.transforms.transforms import Lambda 5 | import numbers 6 | import torch 7 | import torch 8 | import numpy as np 9 | try: 10 | import accimage 11 | except ImportError: 12 | accimage = None 13 | 14 | 15 | class MultiScaleRandomCrop(object): 16 | 17 | def __init__(self, scales, size, interpolation=Image.BILINEAR): 18 | self.scales = scales 19 | self.size = size 20 | self.interpolation = interpolation 21 | 22 | def __call__(self, img): 23 | min_length = min(img.size[0], img.size[1]) 24 | crop_size = int(min_length * self.scale) 25 | 26 | image_width = img.size[0] 27 | image_height = img.size[1] 28 | 29 | x1 = self.tl_x * (image_width - crop_size) 30 | y1 = self.tl_y * (image_height - crop_size) 31 | x2 = x1 + crop_size 32 | y2 = y1 + crop_size 33 | 34 | img = img.crop((x1, y1, x2, y2)) 35 | 36 | return img.resize((self.size, self.size), self.interpolation) 37 | 38 | def randomize_parameters(self): 39 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 40 | self.tl_x = random.random() 41 | self.tl_y = random.random() 42 | 43 | 44 | class RandomGrayscale(object): 45 | """Randomly convert image to grayscale with a probability of p (default 0.1). 46 | 47 | Args: 48 | p (float): probability that image should be converted to grayscale. 49 | 50 | Returns: 51 | PIL Image: Grayscale version of the input image with probability p and unchanged 52 | with probability (1-p). 53 | - If input image is 1 channel: grayscale version is 1 channel 54 | - If input image is 3 channel: grayscale version is 3 channel with r == g == b 55 | 56 | """ 57 | 58 | def __init__(self, p=0.1): 59 | self.p = p 60 | 61 | def __call__(self, img): 62 | """ 63 | Args: 64 | img (PIL Image): Image to be converted to grayscale. 65 | 66 | Returns: 67 | PIL Image: Randomly grayscaled image. 68 | """ 69 | num_output_channels = 1 if img.mode == 'L' else 3 70 | if self.sample < self.p: 71 | return F.to_grayscale(img, num_output_channels=num_output_channels) 72 | return img 73 | 74 | def randomize_parameters(self): 75 | self.sample = random.random() 76 | 77 | def __repr__(self): 78 | return self.__class__.__name__ + '(p={0})'.format(self.p) 79 | 80 | 81 | class ColorJitter(object): 82 | """Randomly change the brightness, contrast and saturation of an image. 
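# These transforms separate drawing the random parameters (randomize_parameters) from
# applying them (__call__), so a dataset can sample one crop per clip and apply it
# identically to every frame, which is what the __getitem__ methods in this repository
# do. A small sketch with placeholder frames:
from PIL import Image

frames = [Image.new('RGB', (160, 120)) for _ in range(16)]   # stand-in for decoded frames
crop = MultiScaleRandomCrop(scales=[1.0, 0.875, 0.75], size=112)

crop.randomize_parameters()           # draw the scale and corner once for the whole clip
clip = [crop(img) for img in frames]  # every frame gets the same crop window
print(len(clip), clip[0].size)        # 16 (112, 112)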
83 | 84 | Args: 85 | brightness (float or tuple of float (min, max)): How much to jitter brightness. 86 | brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness] 87 | or the given [min, max]. Should be non negative numbers. 88 | contrast (float or tuple of float (min, max)): How much to jitter contrast. 89 | contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast] 90 | or the given [min, max]. Should be non negative numbers. 91 | saturation (float or tuple of float (min, max)): How much to jitter saturation. 92 | saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation] 93 | or the given [min, max]. Should be non negative numbers. 94 | hue (float or tuple of float (min, max)): How much to jitter hue. 95 | hue_factor is chosen uniformly from [-hue, hue] or the given [min, max]. 96 | Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5. 97 | """ 98 | def __init__(self, brightness=0, contrast=0, saturation=0, hue=0): 99 | self.brightness = self._check_input(brightness, 'brightness') 100 | self.contrast = self._check_input(contrast, 'contrast') 101 | self.saturation = self._check_input(saturation, 'saturation') 102 | self.hue = self._check_input(hue, 'hue', center=0, bound=(-0.5, 0.5), 103 | clip_first_on_zero=False) 104 | 105 | def _check_input(self, value, name, center=1, bound=(0, float('inf')), clip_first_on_zero=True): 106 | if isinstance(value, numbers.Number): 107 | if value < 0: 108 | raise ValueError("If {} is a single number, it must be non negative.".format(name)) 109 | value = [center - value, center + value] 110 | if clip_first_on_zero: 111 | value[0] = max(value[0], 0) 112 | elif isinstance(value, (tuple, list)) and len(value) == 2: 113 | if not bound[0] <= value[0] <= value[1] <= bound[1]: 114 | raise ValueError("{} values should be between {}".format(name, bound)) 115 | else: 116 | raise TypeError("{} should be a single number or a list/tuple with lenght 2.".format(name)) 117 | 118 | # if value is 0 or (1., 1.) for brightness/contrast/saturation 119 | # or (0., 0.) for hue, do nothing 120 | if value[0] == value[1] == center: 121 | value = None 122 | return value 123 | 124 | def randomize_parameters(self): 125 | transforms = [] 126 | 127 | if self.brightness is not None: 128 | brightness_factor = random.uniform(self.brightness[0], self.brightness[1]) 129 | transforms.append(Lambda(lambda img: F.adjust_brightness(img, brightness_factor))) 130 | 131 | if self.contrast is not None: 132 | contrast_factor = random.uniform(self.contrast[0], self.contrast[1]) 133 | transforms.append(Lambda(lambda img: F.adjust_contrast(img, contrast_factor))) 134 | 135 | if self.saturation is not None: 136 | saturation_factor = random.uniform(self.saturation[0], self.saturation[1]) 137 | transforms.append(Lambda(lambda img: F.adjust_saturation(img, saturation_factor))) 138 | 139 | if self.hue is not None: 140 | hue_factor = random.uniform(self.hue[0], self.hue[1]) 141 | transforms.append(Lambda(lambda img: F.adjust_hue(img, hue_factor))) 142 | 143 | random.shuffle(transforms) 144 | self.transform = Compose(transforms) 145 | 146 | def get_params(self): 147 | """Get a randomized transform to be applied on image. 148 | 149 | Arguments are same as that of __init__. 150 | 151 | Returns: 152 | Transform which randomly adjusts brightness, contrast and 153 | saturation in a random order. 154 | """ 155 | 156 | 157 | return self.transform 158 | 159 | def __call__(self, img): 160 | """ 161 | Args: 162 | img (PIL Image): Input image. 
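# _check_input above turns a scalar jitter strength into a sampling interval around the
# identity (e.g. brightness=0.4 -> [0.6, 1.4], hue=0.1 -> [-0.1, 0.1]), and
# randomize_parameters then draws one factor per clip from each interval:
jitter = ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1)
print(jitter.brightness)  # [0.6, 1.4] (clipped at 0 from below)
print(jitter.hue)         # [-0.1, 0.1]

jitter.randomize_parameters()   # builds self.transform with freshly drawn factors
# out = jitter(some_pil_image)  # applying it would jitter a frame with those fixed factors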
163 | 164 | Returns: 165 | PIL Image: Color jittered image. 166 | """ 167 | transform = self.get_params() 168 | return transform(img) 169 | 170 | def __repr__(self): 171 | format_string = self.__class__.__name__ + '(' 172 | format_string += 'brightness={0}'.format(self.brightness) 173 | format_string += ', contrast={0}'.format(self.contrast) 174 | format_string += ', saturation={0}'.format(self.saturation) 175 | format_string += ', hue={0})'.format(self.hue) 176 | return format_string 177 | 178 | 179 | class RandomHorizontalFlip(object): 180 | """Horizontally flip the given PIL.Image randomly with a probability of 0.5.""" 181 | 182 | def __call__(self, img): 183 | """ 184 | Args: 185 | img (PIL.Image): Image to be flipped. 186 | Returns: 187 | PIL.Image: Randomly flipped image. 188 | """ 189 | if self.p < 0.5: 190 | return img.transpose(Image.FLIP_LEFT_RIGHT) 191 | return img 192 | 193 | def randomize_parameters(self): 194 | self.p = random.random() 195 | 196 | 197 | class ToTensor(object): 198 | """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor. 199 | Converts a PIL.Image or numpy.ndarray (H x W x C) in the range 200 | [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]. 201 | """ 202 | 203 | def __init__(self, norm_value=255, flow_range=None): 204 | self.norm_value = norm_value 205 | self.flow_range = flow_range 206 | 207 | def __call__(self, pic): 208 | """ 209 | Args: 210 | pic (PIL.Image or numpy.ndarray): Image to be converted to tensor. 211 | Returns: 212 | Tensor: Converted image. 213 | """ 214 | if isinstance(pic, np.ndarray): 215 | # handle numpy array 216 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 217 | # backward compatibility 218 | return img.float().div(self.norm_value) 219 | 220 | if accimage is not None and isinstance(pic, accimage.Image): 221 | nppic = np.zeros( 222 | [pic.channels, pic.height, pic.width], dtype=np.float32) 223 | pic.copyto(nppic) 224 | return torch.from_numpy(nppic) 225 | 226 | # handle PIL Image 227 | if pic.mode == 'I': 228 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 229 | elif pic.mode == 'I;16': 230 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 231 | else: 232 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) 233 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 234 | if pic.mode == 'YCbCr': 235 | nchannel = 3 236 | elif pic.mode == 'I;16': 237 | nchannel = 1 238 | else: 239 | nchannel = len(pic.mode) 240 | img = img.view(pic.size[1], pic.size[0], nchannel) 241 | # put it from HWC to CHW format 242 | # yikes, this transpose takes 80% of the loading time/CPU 243 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 244 | if isinstance(img, torch.ByteTensor): 245 | img = img.float().div(self.norm_value) 246 | 247 | if self.flow_range: 248 | img = img[0:2, :, :] 249 | img = img * 40 - 20 250 | # img = img - 0.5 251 | 252 | return img 253 | 254 | def randomize_parameters(self): 255 | pass 256 | 257 | 258 | class Compose(object): 259 | """Composes several transforms together. 260 | Args: 261 | transforms (list of ``Transform`` objects): list of transforms to compose. 
262 | Example: 263 | >>> transforms.Compose([ 264 | >>> transforms.CenterCrop(10), 265 | >>> transforms.ToTensor(), 266 | >>> ]) 267 | """ 268 | 269 | def __init__(self, transforms): 270 | self.transforms = transforms 271 | 272 | def __call__(self, img): 273 | for t in self.transforms: 274 | img = t(img) 275 | return img 276 | 277 | def randomize_parameters(self): 278 | for t in self.transforms: 279 | meth = getattr(t, "randomize_parameters", None) 280 | if callable(meth): 281 | t.randomize_parameters() -------------------------------------------------------------------------------- /3D-ResNet/spatial_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import numbers 4 | import collections 5 | import numpy as np 6 | import torch 7 | from PIL import Image, ImageOps 8 | try: 9 | import accimage 10 | except ImportError: 11 | accimage = None 12 | 13 | 14 | class Compose(object): 15 | """Composes several transforms together. 16 | Args: 17 | transforms (list of ``Transform`` objects): list of transforms to compose. 18 | Example: 19 | >>> transforms.Compose([ 20 | >>> transforms.CenterCrop(10), 21 | >>> transforms.ToTensor(), 22 | >>> ]) 23 | """ 24 | 25 | def __init__(self, transforms): 26 | self.transforms = transforms 27 | 28 | def __call__(self, img): 29 | for t in self.transforms: 30 | img = t(img) 31 | return img 32 | 33 | def randomize_parameters(self): 34 | for t in self.transforms: 35 | t.randomize_parameters() 36 | 37 | 38 | class ToTensor(object): 39 | """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor. 40 | Converts a PIL.Image or numpy.ndarray (H x W x C) in the range 41 | [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]. 42 | """ 43 | 44 | def __init__(self, norm_value=255): 45 | self.norm_value = norm_value 46 | 47 | def __call__(self, pic): 48 | """ 49 | Args: 50 | pic (PIL.Image or numpy.ndarray): Image to be converted to tensor. 51 | Returns: 52 | Tensor: Converted image. 53 | """ 54 | if isinstance(pic, np.ndarray): 55 | # handle numpy array 56 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 57 | # backward compatibility 58 | return img.float().div(self.norm_value) 59 | 60 | if accimage is not None and isinstance(pic, accimage.Image): 61 | nppic = np.zeros( 62 | [pic.channels, pic.height, pic.width], dtype=np.float32) 63 | pic.copyto(nppic) 64 | return torch.from_numpy(nppic) 65 | 66 | # handle PIL Image 67 | if pic.mode == 'I': 68 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 69 | elif pic.mode == 'I;16': 70 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 71 | else: 72 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) 73 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 74 | if pic.mode == 'YCbCr': 75 | nchannel = 3 76 | elif pic.mode == 'I;16': 77 | nchannel = 1 78 | else: 79 | nchannel = len(pic.mode) 80 | img = img.view(pic.size[1], pic.size[0], nchannel) 81 | # put it from HWC to CHW format 82 | # yikes, this transpose takes 80% of the loading time/CPU 83 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 84 | if isinstance(img, torch.ByteTensor): 85 | img = img.float().div(self.norm_value) 86 | 87 | return img 88 | 89 | def randomize_parameters(self): 90 | pass 91 | 92 | 93 | class Normalize(object): 94 | """Normalize an tensor image with mean and standard deviation. 95 | Given mean: (R, G, B) and std: (R, G, B), 96 | will normalize each channel of the torch.*Tensor, i.e. 
97 |     channel = (channel - mean) / std
98 |     Args:
99 |         mean (sequence): Sequence of means for R, G, B channels respectively.
100 |         std (sequence): Sequence of standard deviations for R, G, B channels
101 |             respectively.
102 |     """
103 | 
104 |     def __init__(self, mean, std):
105 |         self.mean = mean
106 |         self.std = std
107 | 
108 |     def __call__(self, tensor):
109 |         """
110 |         Args:
111 |             tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
112 |         Returns:
113 |             Tensor: Normalized image.
114 |         """
115 |         # TODO: make efficient
116 |         for t, m, s in zip(tensor, self.mean, self.std):
117 |             t.sub_(m).div_(s)
118 |         return tensor
119 | 
120 |     def randomize_parameters(self):
121 |         pass
122 | 
123 | 
124 | class Scale(object):
125 |     """Rescale the input PIL.Image to the given size.
126 |     Args:
127 |         size (sequence or int): Desired output size. If size is a sequence like
128 |             (w, h), output size will be matched to this. If size is an int,
129 |             smaller edge of the image will be matched to this number.
130 |             i.e., if height > width, then image will be rescaled to
131 |             (size * height / width, size)
132 |         interpolation (int, optional): Desired interpolation. Default is
133 |             ``PIL.Image.BILINEAR``
134 |     """
135 | 
136 |     def __init__(self, size, interpolation=Image.BILINEAR):
137 |         assert isinstance(size,
138 |                           int) or (isinstance(size, collections.Iterable) and
139 |                                    len(size) == 2)
140 |         self.size = size
141 |         self.interpolation = interpolation
142 | 
143 |     def __call__(self, img):
144 |         """
145 |         Args:
146 |             img (PIL.Image): Image to be scaled.
147 |         Returns:
148 |             PIL.Image: Rescaled image.
149 |         """
150 |         if isinstance(self.size, int):
151 |             w, h = img.size
152 |             if (w <= h and w == self.size) or (h <= w and h == self.size):
153 |                 return img
154 |             if w < h:
155 |                 ow = self.size
156 |                 oh = int(self.size * h / w)
157 |                 return img.resize((ow, oh), self.interpolation)
158 |             else:
159 |                 oh = self.size
160 |                 ow = int(self.size * w / h)
161 |                 return img.resize((ow, oh), self.interpolation)
162 |         else:
163 |             return img.resize(self.size, self.interpolation)
164 | 
165 |     def randomize_parameters(self):
166 |         pass
167 | 
168 | 
169 | class CenterCrop(object):
170 |     """Crops the given PIL.Image at the center.
171 |     Args:
172 |         size (sequence or int): Desired output size of the crop. If size is an
173 |             int instead of sequence like (h, w), a square crop (size, size) is
174 |             made.
175 |     """
176 | 
177 |     def __init__(self, size):
178 |         if isinstance(size, numbers.Number):
179 |             self.size = (int(size), int(size))
180 |         else:
181 |             self.size = size
182 | 
183 |     def __call__(self, img):
184 |         """
185 |         Args:
186 |             img (PIL.Image): Image to be cropped.
187 |         Returns:
188 |             PIL.Image: Cropped image.
189 | """ 190 | w, h = img.size 191 | th, tw = self.size 192 | x1 = int(round((w - tw) / 2.)) 193 | y1 = int(round((h - th) / 2.)) 194 | return img.crop((x1, y1, x1 + tw, y1 + th)) 195 | 196 | def randomize_parameters(self): 197 | pass 198 | 199 | 200 | class CornerCrop(object): 201 | 202 | def __init__(self, size, crop_position=None): 203 | self.size = size 204 | if crop_position is None: 205 | self.randomize = True 206 | else: 207 | self.randomize = False 208 | self.crop_position = crop_position 209 | self.crop_positions = ['c', 'tl', 'tr', 'bl', 'br'] 210 | 211 | def __call__(self, img): 212 | image_width = img.size[0] 213 | image_height = img.size[1] 214 | 215 | if self.crop_position == 'c': 216 | th, tw = (self.size, self.size) 217 | x1 = int(round((image_width - tw) / 2.)) 218 | y1 = int(round((image_height - th) / 2.)) 219 | x2 = x1 + tw 220 | y2 = y1 + th 221 | elif self.crop_position == 'tl': 222 | x1 = 0 223 | y1 = 0 224 | x2 = self.size 225 | y2 = self.size 226 | elif self.crop_position == 'tr': 227 | x1 = image_width - self.size 228 | y1 = 0 229 | x2 = image_width 230 | y2 = self.size 231 | elif self.crop_position == 'bl': 232 | x1 = 0 233 | y1 = image_height - self.size 234 | x2 = self.size 235 | y2 = image_height 236 | elif self.crop_position == 'br': 237 | x1 = image_width - self.size 238 | y1 = image_height - self.size 239 | x2 = image_width 240 | y2 = image_height 241 | 242 | img = img.crop((x1, y1, x2, y2)) 243 | 244 | return img 245 | 246 | def randomize_parameters(self): 247 | if self.randomize: 248 | self.crop_position = self.crop_positions[random.randint( 249 | 0, 250 | len(self.crop_positions) - 1)] 251 | 252 | 253 | class RandomHorizontalFlip(object): 254 | """Horizontally flip the given PIL.Image randomly with a probability of 0.5.""" 255 | 256 | def __call__(self, img): 257 | """ 258 | Args: 259 | img (PIL.Image): Image to be flipped. 260 | Returns: 261 | PIL.Image: Randomly flipped image. 262 | """ 263 | if self.p < 0.5: 264 | return img.transpose(Image.FLIP_LEFT_RIGHT) 265 | return img 266 | 267 | def randomize_parameters(self): 268 | self.p = random.random() 269 | 270 | 271 | class MultiScaleCornerCrop(object): 272 | """Crop the given PIL.Image to randomly selected size. 273 | A crop of size is selected from scales of the original size. 274 | A position of cropping is randomly selected from 4 corners and 1 center. 275 | This crop is finally resized to given size. 
276 | Args: 277 | scales: cropping scales of the original size 278 | size: size of the smaller edge 279 | interpolation: Default: PIL.Image.BILINEAR 280 | """ 281 | 282 | def __init__(self, 283 | scales, 284 | size, 285 | interpolation=Image.BILINEAR, 286 | crop_positions=['c', 'tl', 'tr', 'bl', 'br']): 287 | self.scales = scales 288 | self.size = size 289 | self.interpolation = interpolation 290 | 291 | self.crop_positions = crop_positions 292 | 293 | def __call__(self, img): 294 | min_length = min(img.size[0], img.size[1]) 295 | crop_size = int(min_length * self.scale) 296 | 297 | image_width = img.size[0] 298 | image_height = img.size[1] 299 | 300 | if self.crop_position == 'c': 301 | center_x = image_width // 2 302 | center_y = image_height // 2 303 | box_half = crop_size // 2 304 | x1 = center_x - box_half 305 | y1 = center_y - box_half 306 | x2 = center_x + box_half 307 | y2 = center_y + box_half 308 | elif self.crop_position == 'tl': 309 | x1 = 0 310 | y1 = 0 311 | x2 = crop_size 312 | y2 = crop_size 313 | elif self.crop_position == 'tr': 314 | x1 = image_width - crop_size 315 | y1 = 0 316 | x2 = image_width 317 | y2 = crop_size 318 | elif self.crop_position == 'bl': 319 | x1 = 0 320 | y1 = image_height - crop_size 321 | x2 = crop_size 322 | y2 = image_height 323 | elif self.crop_position == 'br': 324 | x1 = image_width - crop_size 325 | y1 = image_height - crop_size 326 | x2 = image_width 327 | y2 = image_height 328 | 329 | img = img.crop((x1, y1, x2, y2)) 330 | 331 | return img.resize((self.size, self.size), self.interpolation) 332 | 333 | def randomize_parameters(self): 334 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 335 | self.crop_position = self.crop_positions[random.randint( 336 | 0, 337 | len(self.crop_positions) - 1)] 338 | 339 | 340 | class MultiScaleRandomCrop(object): 341 | 342 | def __init__(self, scales, size, interpolation=Image.BILINEAR): 343 | self.scales = scales 344 | self.size = size 345 | self.interpolation = interpolation 346 | 347 | def __call__(self, img): 348 | min_length = min(img.size[0], img.size[1]) 349 | crop_size = int(min_length * self.scale) 350 | 351 | image_width = img.size[0] 352 | image_height = img.size[1] 353 | 354 | x1 = self.tl_x * (image_width - crop_size) 355 | y1 = self.tl_y * (image_height - crop_size) 356 | x2 = x1 + crop_size 357 | y2 = y1 + crop_size 358 | 359 | img = img.crop((x1, y1, x2, y2)) 360 | 361 | return img.resize((self.size, self.size), self.interpolation) 362 | 363 | def randomize_parameters(self): 364 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 365 | self.tl_x = random.random() 366 | self.tl_y = random.random() 367 | -------------------------------------------------------------------------------- /LocalAggregation/src/objectives/localagg.py: -------------------------------------------------------------------------------- 1 | """ 2 | Local Aggregation Objective as defined in 3 | https://arxiv.org/abs/1903.12355 4 | 5 | Code is based on Tensorflow implementation: 6 | https://github.com/neuroailab/LocalAggregation 7 | """ 8 | 9 | import faiss 10 | import torch 11 | 12 | import numpy as np 13 | import time 14 | from termcolor import colored 15 | 16 | from src.utils.tensor import repeat_1d_tensor, l2_normalize 17 | 18 | DEFAULT_KMEANS_SEED = 1234 19 | 20 | class LocalAggregationLossModule(torch.nn.Module): 21 | 22 | def __init__(self, memory_bank_broadcast, cluster_label_broadcast, k=4096, t=0.07, m=0.5): 23 | super(LocalAggregationLossModule, self).__init__() 24 | self.k, self.t, 
self.m = k, t, m 25 | 26 | self.indices = None 27 | self.outputs = None 28 | self._bank = None # pass in via forward function 29 | self._cluster_labels = None 30 | self.memory_bank_broadcast = memory_bank_broadcast 31 | self.cluster_label_broadcast = cluster_label_broadcast 32 | self.data_len = memory_bank_broadcast[0].size(0) 33 | 34 | def _softmax(self, dot_prods): 35 | Z = 2876934.2 36 | return torch.exp(dot_prods / self.t) / Z 37 | 38 | def updated_new_data_memory(self, indices, outputs): 39 | outputs = l2_normalize(outputs) 40 | data_memory = torch.index_select(self._bank, 0, indices) 41 | new_data_memory = data_memory * self.m + (1 - self.m) * outputs 42 | return l2_normalize(new_data_memory, dim=1) 43 | 44 | def synchronization_check(self): 45 | for i in range(len(self.memory_bank_broadcast)): 46 | if i == 0: 47 | device = self.memory_bank_broadcast[0].device 48 | else: 49 | assert torch.equal(self.memory_bank_broadcast[0], self.memory_bank_broadcast[i].to(device)) 50 | 51 | def _get_all_dot_products(self, vec): 52 | assert len(vec.size()) == 2 53 | return torch.matmul(vec, torch.transpose(self._bank, 1, 0)) 54 | 55 | def __get_close_nei_in_back(self, each_k_idx, cluster_labels, 56 | back_nei_idxs, k): 57 | # get which neighbors are close in the background set 58 | batch_labels = cluster_labels[each_k_idx][self.indices] 59 | top_cluster_labels = cluster_labels[each_k_idx][back_nei_idxs] 60 | batch_labels = repeat_1d_tensor(batch_labels, k) 61 | 62 | curr_close_nei = torch.eq(batch_labels, top_cluster_labels) 63 | return curr_close_nei.byte() 64 | 65 | def __get_relative_prob(self, all_close_nei, back_nei_probs): 66 | relative_probs = torch.sum( 67 | torch.where( 68 | all_close_nei, 69 | back_nei_probs, 70 | torch.zeros_like(back_nei_probs), 71 | ), dim=1) 72 | # normalize probs 73 | relative_probs = relative_probs / torch.sum(back_nei_probs, dim=1, keepdim=True) 74 | return relative_probs 75 | 76 | def __get_close_nei(self, each_k_idx, cluster_labels, indices): 77 | batch_size = self.indices.size(0) 78 | dtype = torch.int32 # convert to 32-bit integer to save memory consumption 79 | batch_labels = cluster_labels[each_k_idx][indices].to(dtype) 80 | _cluster_labels = cluster_labels[each_k_idx].to(dtype).unsqueeze(0).expand(batch_size, -1) 81 | batch_labels = repeat_1d_tensor(batch_labels, _cluster_labels.size(1)) 82 | curr_close_nei = torch.eq(batch_labels, _cluster_labels) 83 | return curr_close_nei.byte() 84 | 85 | def forward(self, indices, outputs, gpu_idx): 86 | """ 87 | :param back_nei_idxs: shape (batch_size, 4096) 88 | :param all_close_nei: shape (batch_size, _size_of_dataset) in byte 89 | """ 90 | self.indices = indices.detach() 91 | self.outputs = l2_normalize(outputs, dim=1) 92 | self._bank = self.memory_bank_broadcast[gpu_idx] # select a mem bank based on gpu device 93 | self._cluster_labels = self.cluster_label_broadcast[gpu_idx] 94 | 95 | k = self.k 96 | 97 | all_dps = self._get_all_dot_products(self.outputs) 98 | back_nei_dps, back_nei_idxs = torch.topk(all_dps, k=k, sorted=False, dim=1) 99 | back_nei_probs = self._softmax(back_nei_dps) 100 | 101 | all_close_nei_in_back = None 102 | no_kmeans = self._cluster_labels.size(0) 103 | with torch.no_grad(): 104 | for each_k_idx in range(no_kmeans): 105 | curr_close_nei = self.__get_close_nei_in_back( 106 | each_k_idx, self._cluster_labels, back_nei_idxs, k) 107 | 108 | if all_close_nei_in_back is None: 109 | all_close_nei_in_back = curr_close_nei 110 | else: 111 | # assuming all_close_nei and curr_close_nei are byte tensors 
112 | all_close_nei_in_back = all_close_nei_in_back | curr_close_nei 113 | 114 | relative_probs = self.__get_relative_prob(all_close_nei_in_back, back_nei_probs) 115 | loss = -torch.mean(torch.log(relative_probs + 1e-7)).unsqueeze(0) 116 | 117 | # compute new data memory 118 | new_data_memory = self.updated_new_data_memory(self.indices, self.outputs) 119 | 120 | return loss, new_data_memory 121 | 122 | 123 | class MemoryBank(object): 124 | """For efficiently computing the background vectors.""" 125 | 126 | def __init__(self, size, dim, device_ids, bank=None): 127 | self.size = size 128 | self.dim = dim 129 | self.device = torch.device("cuda:{}".format(device_ids[0])) 130 | if bank is not None: 131 | self._bank = bank 132 | else: 133 | self._bank = self._create() 134 | self.bank_broadcast = torch.cuda.comm.broadcast(self._bank, device_ids) 135 | self.device = [_bank.device for _bank in self.bank_broadcast] 136 | self.num_device = len(self.device) 137 | del self._bank 138 | # print(colored('Warning: using in-place scatter in memory bank update function', 'red')) 139 | 140 | def _create(self): 141 | # initialize random weights 142 | mb_init = torch.rand(self.size, self.dim, device=self.device) 143 | std_dev = 1. / np.sqrt(self.dim / 3) 144 | mb_init = mb_init * (2 * std_dev) - std_dev 145 | # L2 normalise so that the norm is 1 146 | mb_init = l2_normalize(mb_init, dim=1) 147 | return mb_init.detach() # detach so its not trainable 148 | 149 | def as_tensor(self): 150 | return self.bank_broadcast[0] 151 | 152 | def at_idxs(self, idxs): 153 | return torch.index_select(self.bank_broadcast[0], 0, idxs) 154 | 155 | def get_all_dot_products(self, vec): 156 | # [bs, dim] 157 | assert len(vec.size()) == 2 158 | return torch.matmul(vec, torch.transpose(self.bank_broadcast[0], 1, 0)) 159 | 160 | def get_dot_products(self, vec, idxs): 161 | vec_shape = list(vec.size()) # [bs, dim] 162 | idxs_shape = list(idxs.size()) # [bs, ...] 
163 | 164 | assert len(idxs_shape) in [1, 2] 165 | assert len(vec_shape) == 2 166 | assert vec_shape[0] == idxs_shape[0] 167 | 168 | if len(idxs_shape) == 1: 169 | with torch.no_grad(): 170 | memory_vecs = torch.index_select(self._bank, 0, idxs) 171 | memory_vecs_shape = list(memory_vecs.size()) 172 | assert memory_vecs_shape[0] == idxs_shape[0] 173 | else: # len(idxs_shape) == 2 174 | with torch.no_grad(): 175 | batch_size, k_dim = idxs.size(0), idxs.size(1) 176 | flat_idxs = idxs.view(-1) 177 | memory_vecs = torch.index_select(self._bank, 0, flat_idxs) 178 | memory_vecs = memory_vecs.view(batch_size, k_dim, self._bank.size(1)) 179 | memory_vecs_shape = list(memory_vecs.size()) 180 | 181 | vec_shape[1:1] = [1] * (len(idxs_shape) - 1) 182 | vec = vec.view(vec_shape) # [bs, 1, dim] 183 | 184 | prods = memory_vecs * vec 185 | assert list(prods.size()) == memory_vecs_shape 186 | 187 | return torch.sum(prods, dim=-1) 188 | 189 | def update(self, indices, data_memory): 190 | # in lieu of scatter-update operation 191 | data_dim = data_memory.size(1) 192 | data_memory = data_memory.detach() 193 | indices = indices.unsqueeze(1).repeat(1, data_dim) 194 | 195 | for i in range(self.num_device): 196 | if i > 0: 197 | # start.record() 198 | device = self.device[i] 199 | indices = indices.to(device) 200 | data_memory = data_memory.to(device) 201 | self.bank_broadcast[i] = self.bank_broadcast[i].scatter_(0, indices, data_memory) 202 | 203 | def synchronization_check(self): 204 | for i in range(len(self.bank_broadcast)): 205 | if i == 0: 206 | device = self.bank_broadcast[0].device 207 | else: 208 | assert torch.equal(self.bank_broadcast[0], self.bank_broadcast[i].to(device)) 209 | 210 | 211 | def run_kmeans(x, nmb_clusters, verbose=False, 212 | seed=DEFAULT_KMEANS_SEED, gpu_device=0): 213 | """ 214 | Runs kmeans on 1 GPU. 215 | 216 | Args: 217 | ----- 218 | x: data 219 | nmb_clusters (int): number of clusters 220 | 221 | Returns: 222 | -------- 223 | list: ids of data in each cluster 224 | """ 225 | n_data, d = x.shape 226 | 227 | # faiss implementation of k-means 228 | clus = faiss.Clustering(d, nmb_clusters) 229 | clus.niter = 20 230 | clus.max_points_per_centroid = 10000000 231 | clus.seed = seed 232 | res = faiss.StandardGpuResources() 233 | flat_config = faiss.GpuIndexFlatConfig() 234 | flat_config.useFloat16 = False 235 | flat_config.device = gpu_device 236 | 237 | index = faiss.GpuIndexFlatL2(res, d, flat_config) 238 | 239 | # perform the training 240 | clus.train(x, index) 241 | _, I = index.search(x, 1) 242 | losses = faiss.vector_to_array(clus.obj) 243 | if verbose: 244 | print('k-means loss evolution: {0}'.format(losses)) 245 | 246 | return [int(n[0]) for n in I], losses[-1] 247 | 248 | 249 | def run_kmeans_multi_gpu(x, nmb_clusters, verbose=False, 250 | seed=DEFAULT_KMEANS_SEED, gpu_device=0): 251 | 252 | """ 253 | Runs kmeans on multi GPUs. 
254 | 255 | Args: 256 | ----- 257 | x: data 258 | nmb_clusters (int): number of clusters 259 | 260 | Returns: 261 | -------- 262 | list: ids of data in each cluster 263 | """ 264 | n_data, d = x.shape 265 | ngpus = len(gpu_device) 266 | assert ngpus > 1 267 | 268 | # faiss implementation of k-means 269 | clus = faiss.Clustering(d, nmb_clusters) 270 | clus.niter = 20 271 | clus.max_points_per_centroid = 10000000 272 | clus.seed = seed 273 | res = [faiss.StandardGpuResources() for i in range(ngpus)] 274 | flat_config = [] 275 | for i in gpu_device: 276 | cfg = faiss.GpuIndexFlatConfig() 277 | cfg.useFloat16 = False 278 | cfg.device = i 279 | flat_config.append(cfg) 280 | 281 | indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i]) for i in range(ngpus)] 282 | index = faiss.IndexReplicas() 283 | for sub_index in indexes: 284 | index.addIndex(sub_index) 285 | 286 | # perform the training 287 | clus.train(x, index) 288 | _, I = index.search(x, 1) 289 | losses = faiss.vector_to_array(clus.obj) 290 | if verbose: 291 | print('k-means loss evolution: {0}'.format(losses)) 292 | 293 | return [int(n[0]) for n in I], losses[-1] 294 | 295 | 296 | class Kmeans(object): 297 | """ 298 | Train different k-means clusterings with different 299 | random seeds. These will be used to compute close neighbors 300 | for a given encoding. 301 | """ 302 | def __init__(self, k, memory_bank, gpu_device=0, fvs=None): 303 | super().__init__() 304 | self.k = k 305 | self.memory_bank = memory_bank 306 | self.gpu_device = gpu_device 307 | self.fvs = fvs 308 | 309 | def compute_clusters(self): 310 | """ 311 | Performs many k-means clustering. 312 | 313 | Args: 314 | x_data (np.array N * dim): data to cluster 315 | """ 316 | data = self.memory_bank.as_tensor() 317 | if self.fvs is not None and len(self.fvs) != 0: 318 | data = torch.cat((data.cpu().float(), self.fvs.float()), 1) 319 | data_npy = data.detach().numpy() 320 | clusters = self._compute_clusters(data_npy) 321 | return clusters 322 | 323 | def _compute_clusters(self, data): 324 | pred_labels = [] 325 | for k_idx, each_k in enumerate(self.k): 326 | # cluster the data 327 | 328 | if len(self.gpu_device) == 1: # single gpu 329 | I, _ = run_kmeans(data, each_k, seed=k_idx + DEFAULT_KMEANS_SEED, 330 | gpu_device=self.gpu_device[0]) 331 | else: # multigpu 332 | I, _ = run_kmeans_multi_gpu(data, each_k, seed=k_idx + DEFAULT_KMEANS_SEED, 333 | gpu_device=self.gpu_device) 334 | 335 | clust_labels = np.asarray(I) 336 | pred_labels.append(clust_labels) 337 | pred_labels = np.stack(pred_labels, axis=0) 338 | pred_labels = torch.from_numpy(pred_labels).long() 339 | 340 | return pred_labels 341 | --------------------------------------------------------------------------------
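The three classes defined in localagg.py above work in combination: MemoryBank stores one running, L2-normalised embedding per video (replicated per GPU), Kmeans produces several clusterings of the bank (optionally concatenated with precomputed IDT Fisher vectors) that define the close-neighbour sets, and LocalAggregationLossModule pulls each new embedding towards its close neighbours among its top-k background neighbours, returning the momentum-updated embeddings to write back into the bank. The sketch below shows one plausible way to wire them together on a single GPU; it is illustrative only, and the encoder, optimizer, loader, the Fisher-vector matrix fvs, and all sizes (dataset size, embedding and FV dimensions, number and size of the clusterings) are placeholders rather than values taken from this repository.

import torch

from src.objectives.localagg import LocalAggregationLossModule, MemoryBank, Kmeans

device_ids = [0]                                  # single GPU for this sketch
n_videos, feat_dim, fv_dim = 100000, 128, 256     # placeholder sizes

# Memory bank: one L2-normalised embedding per video, replicated per device.
memory_bank = MemoryBank(n_videos, feat_dim, device_ids)

# Placeholder for precomputed IDT Fisher vectors (see src/idt/); they are
# concatenated to the bank embeddings inside Kmeans.compute_clusters().
fvs = torch.zeros(n_videos, fv_dim)

# Several k-means clusterings with different seeds (placeholder: 6 x 1000 clusters).
kmeans = Kmeans(k=[1000] * 6, memory_bank=memory_bank, gpu_device=device_ids, fvs=fvs)
cluster_labels = kmeans.compute_clusters()                        # shape (6, n_videos)
cluster_label_broadcast = torch.cuda.comm.broadcast(cluster_labels, device_ids)

criterion = LocalAggregationLossModule(memory_bank.bank_broadcast,
                                       cluster_label_broadcast,
                                       k=4096, t=0.07, m=0.5)

# Placeholder encoder, optimizer and data; in the repo these would come from
# src/models/resnet3d.py and src/datasets/kinetics.py instead.
encoder = torch.nn.Sequential(torch.nn.Flatten(),
                              torch.nn.Linear(3 * 16 * 32 * 32, feat_dim)).cuda()
optimizer = torch.optim.SGD(encoder.parameters(), lr=0.03)
loader = [(torch.randn(8, 3, 16, 32, 32), torch.randint(0, n_videos, (8,)))]

# Schematic training step over (clip, dataset-index) batches.
for clips, indices in loader:
    embeddings = encoder(clips.cuda())                            # (batch, feat_dim)
    loss, new_data_memory = criterion(indices.cuda(), embeddings, gpu_idx=0)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Write the momentum-updated embeddings back into every bank replica.
    memory_bank.update(indices.cuda(), new_data_memory)

Writing new_data_memory back after every step is what keeps the background-neighbour probabilities computed in forward consistent with the encoder as it trains, and the clusterings would typically be recomputed periodically so that the close-neighbour sets track the evolving embeddings. Note that the sketch passes an fvs matrix: as compute_clusters is written above, the fvs branch also moves the bank to the CPU before the .numpy() call, whereas with fvs=None the bank tensor would have to be moved off the GPU by the caller.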