├── models
│   ├── __init__.py
│   ├── wide_resnet.py
│   ├── resnext.py
│   ├── pre_act_resnet.py
│   ├── densenet.py
│   ├── resnet.py
│   ├── resnet2p1d.py
│   └── resnet_strg.py
├── datasets
│   ├── __init__.py
│   ├── loader.py
│   ├── videodataset_multiclips.py
│   ├── videodataset.py
│   └── activitynet.py
├── util_scripts
│   ├── __init__.py
│   ├── utils.py
│   ├── remove_dataparallel.py
│   ├── add_fps_into_activitynet_json.py
│   ├── vid2img_sthv2.py
│   ├── hmdb51_json.py
│   ├── eval_accuracy.py
│   ├── ucf101_json.py
│   ├── mit_json.py
│   ├── generate_video_jpgs.py
│   ├── sthv2_json.py
│   ├── sthv1_json.py
│   ├── kinetics_json.py
│   └── generate_video_hdf5.py
├── requirements.txt
├── gen.sh
├── mean.py
├── LICENSE
├── module
│   ├── roi_graph.py
│   └── gcn.py
├── utils.py
├── inference.py
├── validation.py
├── strg.py
├── temporal_transforms.py
├── rpn.py
├── training.py
├── model.py
├── spatial_transforms.py
├── README.md
├── rgcn_models.py
├── dataset.py
├── transform.py
├── opts.py
└── main.py

/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/util_scripts/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm
2 | scikit-learn
3 | pandas
4 | numpy
5 | torch==2.2.0
6 | torchvision==0.6
7 | wandb
8 | h5py
9 | tensorboardX
10 |
--------------------------------------------------------------------------------
/gen.sh:
--------------------------------------------------------------------------------
1 | python sthv1_json.py 'data/something/v1' 'data/something/v1/img' 'data/sthv1.json'
2 | #python sthv2_json.py 'data/something/v2' 'data/something/v2/img' 'data/sthv2.json'
3 |
--------------------------------------------------------------------------------
/util_scripts/utils.py:
--------------------------------------------------------------------------------
1 | import h5py
2 |
3 |
4 | def get_n_frames(video_path):
5 |     return len([
6 |         x for x in video_path.iterdir()
7 |         if 'jpg' in x.name and x.name[0] != '.'
8 | ]) 9 | 10 | 11 | def get_n_frames_hdf5(video_path): 12 | with h5py.File(video_path, 'r') as f: 13 | video_data = f['video'] 14 | return len(video_data) 15 | -------------------------------------------------------------------------------- /mean.py: -------------------------------------------------------------------------------- 1 | def get_mean_std(value_scale, dataset): 2 | assert dataset in ['activitynet', 'kinetics', '0.5'] 3 | 4 | if dataset == 'activitynet': 5 | mean = [0.4477, 0.4209, 0.3906] 6 | std = [0.2767, 0.2695, 0.2714] 7 | elif dataset == 'kinetics': 8 | mean = [0.4345, 0.4051, 0.3775] 9 | std = [0.2768, 0.2713, 0.2737] 10 | elif dataset == '0.5': 11 | mean = [0.5, 0.5, 0.5] 12 | std = [0.5, 0.5, 0.5] 13 | 14 | mean = [x * value_scale for x in mean] 15 | std = [x * value_scale for x in std] 16 | 17 | return mean, std -------------------------------------------------------------------------------- /util_scripts/remove_dataparallel.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import OrderedDict 3 | 4 | import torch 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('file_path', type=str) 8 | parser.add_argument('--dst_file_path', default=None, type=str) 9 | args = parser.parse_args() 10 | 11 | if args.dst_file_path is None: 12 | args.dst_file_path = args.file_path 13 | 14 | x = torch.load(args.file_path) 15 | state_dict = x['state_dict'] 16 | new_state_dict = OrderedDict() 17 | 18 | for k, v in state_dict.items(): 19 | new_k = '.'.join(k.split('.')[1:]) 20 | new_state_dict[new_k] = v 21 | 22 | x['state_dict'] = new_state_dict 23 | 24 | torch.save(x, args.dst_file_path) -------------------------------------------------------------------------------- /models/wide_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from . 
import resnet 6 | 7 | 8 | class WideBottleneck(resnet.Bottleneck): 9 | expansion = 2 10 | 11 | 12 | def generate_model(model_depth, k, **kwargs): 13 | assert model_depth in [50, 101, 152, 200] 14 | 15 | inplanes = [x * k for x in resnet.get_inplanes()] 16 | if model_depth == 50: 17 | model = resnet.ResNet(WideBottleneck, [3, 4, 6, 3], inplanes, **kwargs) 18 | elif model_depth == 101: 19 | model = resnet.ResNet(WideBottleneck, [3, 4, 23, 3], inplanes, **kwargs) 20 | elif model_depth == 152: 21 | model = resnet.ResNet(WideBottleneck, [3, 8, 36, 3], inplanes, **kwargs) 22 | elif model_depth == 200: 23 | model = resnet.ResNet(WideBottleneck, [3, 24, 36, 3], inplanes, 24 | **kwargs) 25 | 26 | return model 27 | -------------------------------------------------------------------------------- /util_scripts/add_fps_into_activitynet_json.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import subprocess 4 | from pathlib import Path 5 | 6 | if __name__ == '__main__': 7 | video_dir_path = Path(sys.argv[1]) 8 | json_path = Path(sys.argv[2]) 9 | if len(sys.argv) > 3: 10 | dst_json_path = Path(sys.argv[3]) 11 | else: 12 | dst_json_path = json_path 13 | 14 | with json_path.open('r') as f: 15 | json_data = json.load(f) 16 | 17 | for video_file_path in sorted(video_dir_path.iterdir()): 18 | file_name = video_file_path.name 19 | if '.mp4' not in file_name: 20 | continue 21 | name = video_file_path.stem 22 | 23 | ffprobe_cmd = ['ffprobe', str(video_file_path)] 24 | p = subprocess.Popen( 25 | ffprobe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 26 | res = p.communicate()[1].decode('utf-8') 27 | 28 | fps = float([x for x in res.split(',') if 'fps' in x][0].rstrip('fps')) 29 | json_data['database'][name[2:]]['fps'] = fps 30 | 31 | with dst_json_path.open('w') as f: 32 | json.dump(json_data, f) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Kensho Hara 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 |
--------------------------------------------------------------------------------
/util_scripts/vid2img_sthv2.py:
--------------------------------------------------------------------------------
1 | # Code for "TSM: Temporal Shift Module for Efficient Video Understanding"
2 | # arXiv:1811.08383
3 | # Ji Lin*, Chuang Gan, Song Han
4 | # {jilin, songhan}@mit.edu, ganchuang@csail.mit.edu
5 |
6 | import os
7 | import pdb
8 |
9 | import threading
10 |
11 | NUM_THREADS = 1000
12 | VIDEO_ROOT = 'data/something/v2/video'
13 | FRAME_ROOT = 'data/something/v2/img'
14 |
15 |
16 | def split(l, n):
17 |     """Yield successive n-sized chunks from l."""
18 |     for i in range(0, len(l), n):
19 |         yield l[i:i + n]
20 |
21 |
22 | def extract(video, tmpl='%06d.jpg'):
23 |     # os.system(f'ffmpeg -i {VIDEO_ROOT}/{video} -vf -threads 1 -vf scale=-1:256 -q:v 0 '
24 |     #           f'{FRAME_ROOT}/{video[:-5]}/{tmpl}')
25 |     cmd = 'ffmpeg -i \"{}/{}\" -threads 1 -vf scale=-1:256 -q:v 0 \"{}/{}/%06d.jpg\"'.format(VIDEO_ROOT, video,
26 |                                                                                              FRAME_ROOT, video[:-5])
27 |     os.system(cmd)
28 |
29 |
30 | def target(video_list):
31 |     for video in video_list:
32 |         os.makedirs(os.path.join(FRAME_ROOT, video[:-5]))
33 |         extract(video)
34 |
35 |
36 | if __name__ == '__main__':
37 |     if not os.path.exists(VIDEO_ROOT):
38 |         raise ValueError('Please download videos and set VIDEO_ROOT variable.')
39 |     if not os.path.exists(FRAME_ROOT):
40 |         os.makedirs(FRAME_ROOT)
41 |
42 |     video_list = os.listdir(VIDEO_ROOT)
43 |     splits = list(split(video_list, NUM_THREADS))
44 |     threads = []
45 |     for i, split in enumerate(splits):
46 |         thread = threading.Thread(target=target, args=(split,))
47 |         thread.start()
48 |         threads.append(thread)
49 |
50 |     for thread in threads:
51 |         thread.join()
52 |
--------------------------------------------------------------------------------
/module/roi_graph.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 |
4 | import numpy as np
5 |
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 | import pdb
10 |
11 |
12 |
13 | def get_iou(roi, rois, area, areas):
14 |     y_min = torch.max(roi[:,0:1], rois[:,:,0])
15 |     x_min = torch.max(roi[:,1:2], rois[:,:,1])
16 |     y_max = torch.min(roi[:,2:3], rois[:,:,2])
17 |     x_max = torch.min(roi[:,3:4], rois[:,:,3])
18 |     axis0 = x_max - x_min + 1
19 |     axis1 = y_max - y_min + 1
20 |     axis0[axis0 < 0] = 0
21 |     axis1[axis1 < 0] = 0
22 |     intersection = axis0 * axis1
23 |     iou = intersection / (areas + area - intersection)
24 |     return iou
25 |
26 |
27 | def get_st_graph(rois, threshold=0):
28 |     B, T, N, _ = rois.size()
29 |
30 |     M = T*N
31 |     front_graph = torch.zeros((B,M,M))
32 |
33 |     if M == 0:
34 |         return front_graph, front_graph.transpose(1,2)
35 |     areas = (rois[:,:,:,3] - rois[:,:,:,1] + 1) * (rois[:,:,:,2] - rois[:,:,:,0] + 1)
36 |
37 |     for t in range(T-1):
38 |         for i in range(N):
39 |             ious = get_iou(rois[:,t,i], rois[:,t+1], areas[:,t,i:i+1], areas[:,t+1])
40 |             ious[ious < threshold] = 0
41 |             front_graph[:, t*N+i, (t+1)*N:(t+2)*N] = ious
42 |
43 |     back_graph = front_graph.transpose(1,2)
44 |
45 |     # Normalize
46 |     front_graph = front_graph / front_graph.sum(dim=-1, keepdim=True)
47 |     back_graph = back_graph / back_graph.sum(dim=-1, keepdim=True)
48 |     # NaN to zero
49 |     front_graph[front_graph != front_graph] = 0
50 |     back_graph[back_graph != back_graph] = 0
51 |
52 |     return front_graph, back_graph
53 |
54 |
55 |
56 |
57 | if __name__ == '__main__':
58 |     rois = torch.rand((4,8,10,4))
59 |     front_graph, back_graph
= get_st_graph(rois) 60 | 61 | pdb.set_trace() 62 | 63 | 64 | -------------------------------------------------------------------------------- /module/gcn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | from torch.nn.parameter import Parameter 5 | import math 6 | import pdb 7 | 8 | class GraphConvolution(nn.Module): 9 | """ 10 | Simple GCN layer, similar to https://arxiv.org/abs/1609.02907 11 | """ 12 | 13 | def __init__(self, in_features, out_features, bias=True, batch=False): 14 | super(GraphConvolution, self).__init__() 15 | self.in_features = in_features 16 | self.out_features = out_features 17 | self.weight = Parameter(torch.Tensor(in_features, out_features)) 18 | if bias: 19 | self.bias = Parameter(torch.Tensor(out_features)) 20 | else: 21 | self.register_parameter('bias', None) 22 | self.reset_parameters() 23 | self.batch = batch 24 | 25 | def reset_parameters(self): 26 | stdv = 1. / math.sqrt(self.weight.size(1)) 27 | self.weight.data.uniform_(-stdv, stdv) 28 | if self.bias is not None: 29 | self.bias.data.uniform_(-stdv, stdv) 30 | 31 | def forward(self, input, adj): 32 | if self.batch: 33 | support = torch.matmul(input, self.weight) 34 | output = torch.matmul(adj, support) 35 | else: 36 | support = torch.mm(input, self.weight) 37 | output = torch.mm(adj, support) 38 | #output = SparseMM(adj)(support) 39 | if self.bias is not None: 40 | return output + self.bias 41 | else: 42 | return output 43 | 44 | def __repr__(self): 45 | return self.__class__.__name__ + ' (' \ 46 | + str(self.in_features) + ' -> ' \ 47 | + str(self.out_features) + ')' 48 | 49 | 50 | class GCN(nn.Module): 51 | def __init__(self, nfeat, nhid, nclass, dropout): 52 | super(GCN, self).__init__() 53 | 54 | self.gc1 = GraphConvolution(nfeat, nhid) 55 | self.gc2 = GraphConvolution(nhid, nclass) 56 | self.dropout = dropout 57 | 58 | def forward(self, x, adj): 59 | x = F.relu(self.gc1(x, adj)) 60 | x = F.dropout(x, self.dropout, training=self.training) 61 | x = self.gc2(x, adj) 62 | # x = F.relu(self.gc2(x, adj)) 63 | # x = F.dropout(x, self.dropout, training=self.training) 64 | return x 65 | -------------------------------------------------------------------------------- /datasets/loader.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | import h5py 4 | from PIL import Image 5 | 6 | 7 | class ImageLoaderPIL(object): 8 | 9 | def __call__(self, path): 10 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 11 | with path.open('rb') as f: 12 | with Image.open(f) as img: 13 | return img.convert('RGB') 14 | 15 | 16 | class ImageLoaderAccImage(object): 17 | 18 | def __call__(self, path): 19 | import accimage 20 | return accimage.Image(str(path)) 21 | 22 | 23 | class VideoLoader(object): 24 | 25 | def __init__(self, image_name_formatter, image_loader=None): 26 | self.image_name_formatter = image_name_formatter 27 | if image_loader is None: 28 | self.image_loader = ImageLoaderPIL() 29 | else: 30 | self.image_loader = image_loader 31 | 32 | def __call__(self, video_path, frame_indices): 33 | video = [] 34 | for i in frame_indices: 35 | image_path = video_path / self.image_name_formatter(i) 36 | if image_path.exists(): 37 | video.append(self.image_loader(image_path)) 38 | 39 | return video 40 | 41 | 42 | class VideoLoaderHDF5(object): 43 | 44 | def __call__(self, video_path, frame_indices): 45 | with 
h5py.File(video_path, 'r') as f: 46 | video_data = f['video'] 47 | 48 | video = [] 49 | for i in frame_indices: 50 | if i < len(video_data): 51 | video.append(Image.open(io.BytesIO(video_data[i]))) 52 | else: 53 | return video 54 | 55 | return video 56 | 57 | 58 | class VideoLoaderFlowHDF5(object): 59 | 60 | def __init__(self): 61 | self.flows = ['u', 'v'] 62 | 63 | def __call__(self, video_path, frame_indices): 64 | with h5py.File(video_path, 'r') as f: 65 | 66 | flow_data = [] 67 | for flow in self.flows: 68 | flow_data.append(f['video_{}'.format(flow)]) 69 | 70 | video = [] 71 | for i in frame_indices: 72 | if i < len(flow_data[0]): 73 | frame = [ 74 | Image.open(io.BytesIO(video_data[i])) 75 | for video_data in flow_data 76 | ] 77 | frame.append(frame[-1]) # add dummy data into third channel 78 | video.append(Image.merge('RGB', frame)) 79 | 80 | return video 81 | -------------------------------------------------------------------------------- /datasets/videodataset_multiclips.py: -------------------------------------------------------------------------------- 1 | import json 2 | import copy 3 | import functools 4 | import pdb 5 | import torch 6 | from torch.utils.data.dataloader import default_collate 7 | 8 | from .videodataset import VideoDataset 9 | 10 | 11 | def collate_fn(batch): 12 | batch_clips, batch_targets = zip(*batch) 13 | 14 | batch_clips = [clip for multi_clips in batch_clips for clip in multi_clips] 15 | batch_targets = [ 16 | target for multi_targets in batch_targets for target in multi_targets 17 | ] 18 | 19 | target_element = batch_targets[0] 20 | if isinstance(target_element, int) or isinstance(target_element, str): 21 | return default_collate(batch_clips), default_collate(batch_targets) 22 | else: 23 | return default_collate(batch_clips), batch_targets 24 | 25 | 26 | class VideoDatasetMultiClips(VideoDataset): 27 | 28 | def __loading(self, path, video_frame_indices): 29 | clips = [] 30 | segments = [] 31 | for clip_frame_indices in video_frame_indices: 32 | clip = self.loader(path, clip_frame_indices) 33 | if self.spatial_transform is not None: 34 | self.spatial_transform.randomize_parameters() 35 | clip = [self.spatial_transform(img) for img in clip] 36 | clips.append(torch.stack(clip, 0).permute(1, 0, 2, 3)) 37 | segments.append( 38 | [min(clip_frame_indices), 39 | max(clip_frame_indices) + 1]) 40 | 41 | return clips, segments 42 | 43 | def __getitem__(self, index): 44 | path = self.data[index]['video'] 45 | 46 | video_frame_indices = self.data[index]['frame_indices'] 47 | if self.temporal_transform is not None: 48 | video_frame_indices = self.temporal_transform(video_frame_indices) 49 | clips, segments = self.__loading(path, video_frame_indices) 50 | 51 | if isinstance(self.target_type, list): 52 | target = [self.data[index][t] for t in self.target_type] 53 | else: 54 | target = self.data[index][self.target_type] 55 | 56 | if 'segment' in self.target_type: 57 | if isinstance(self.target_type, list): 58 | segment_index = self.target_type.index('segment') 59 | targets = [] 60 | for s in segments: 61 | targets.append(copy.deepcopy(target)) 62 | targets[-1][segment_index] = s 63 | else: 64 | targets = segments 65 | else: 66 | targets = [target for _ in range(len(segments))] 67 | 68 | return clips, targets 69 | -------------------------------------------------------------------------------- /models/resnext.py: -------------------------------------------------------------------------------- 1 | import math 2 | from functools import partial 3 | 4 | import torch 5 | 
import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from .resnet import conv1x1x1, Bottleneck, ResNet 9 | from utils import partialclass 10 | 11 | 12 | def get_inplanes(): 13 | return [128, 256, 512, 1024] 14 | 15 | 16 | class ResNeXtBottleneck(Bottleneck): 17 | expansion = 2 18 | 19 | def __init__(self, inplanes, planes, cardinality, stride=1, 20 | downsample=None): 21 | super().__init__(inplanes, planes, stride, downsample) 22 | 23 | mid_planes = cardinality * planes // 32 24 | self.conv1 = conv1x1x1(inplanes, mid_planes) 25 | self.bn1 = nn.BatchNorm3d(mid_planes) 26 | self.conv2 = nn.Conv3d(mid_planes, 27 | mid_planes, 28 | kernel_size=3, 29 | stride=stride, 30 | padding=1, 31 | groups=cardinality, 32 | bias=False) 33 | self.bn2 = nn.BatchNorm3d(mid_planes) 34 | self.conv3 = conv1x1x1(mid_planes, planes * self.expansion) 35 | 36 | 37 | class ResNeXt(ResNet): 38 | 39 | def __init__(self, 40 | block, 41 | layers, 42 | block_inplanes, 43 | n_input_channels=3, 44 | conv1_t_size=7, 45 | conv1_t_stride=1, 46 | no_max_pool=False, 47 | shortcut_type='B', 48 | cardinality=32, 49 | n_classes=400): 50 | block = partialclass(block, cardinality=cardinality) 51 | super().__init__(block, layers, block_inplanes, n_input_channels, 52 | conv1_t_size, conv1_t_stride, no_max_pool, 53 | shortcut_type, n_classes) 54 | 55 | self.fc = nn.Linear(cardinality * 32 * block.expansion, n_classes) 56 | 57 | 58 | def generate_model(model_depth, **kwargs): 59 | assert model_depth in [50, 101, 152, 200] 60 | 61 | if model_depth == 50: 62 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 6, 3], get_inplanes(), 63 | **kwargs) 64 | elif model_depth == 101: 65 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 23, 3], get_inplanes(), 66 | **kwargs) 67 | elif model_depth == 152: 68 | model = ResNeXt(ResNeXtBottleneck, [3, 8, 36, 3], get_inplanes(), 69 | **kwargs) 70 | elif model_depth == 200: 71 | model = ResNeXt(ResNeXtBottleneck, [3, 24, 36, 3], get_inplanes(), 72 | **kwargs) 73 | 74 | return model 75 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import random 3 | from functools import partialmethod 4 | 5 | import torch 6 | import numpy as np 7 | from sklearn.metrics import precision_recall_fscore_support 8 | 9 | 10 | class AverageMeter(object): 11 | """Computes and stores the average and current value""" 12 | 13 | def __init__(self): 14 | self.reset() 15 | 16 | def reset(self): 17 | self.val = 0 18 | self.avg = 0 19 | self.sum = 0 20 | self.count = 0 21 | 22 | def update(self, val, n=1): 23 | self.val = val 24 | self.sum += val * n 25 | self.count += n 26 | self.avg = self.sum / self.count 27 | 28 | 29 | class Logger(object): 30 | 31 | def __init__(self, path, header): 32 | self.log_file = path.open('w') 33 | self.logger = csv.writer(self.log_file, delimiter='\t') 34 | 35 | self.logger.writerow(header) 36 | self.header = header 37 | 38 | def __del(self): 39 | self.log_file.close() 40 | 41 | def log(self, values): 42 | write_values = [] 43 | for col in self.header: 44 | assert col in values 45 | write_values.append(values[col]) 46 | 47 | self.logger.writerow(write_values) 48 | self.log_file.flush() 49 | 50 | 51 | def calculate_accuracy(outputs, targets): 52 | with torch.no_grad(): 53 | batch_size = targets.size(0) 54 | 55 | _, pred = outputs.topk(1, 1, largest=True, sorted=True) 56 | pred = pred.t() 57 | correct = pred.eq(targets.view(1, -1)) 58 | n_correct_elems = 
correct.float().sum().item() 59 | 60 | return n_correct_elems / batch_size 61 | 62 | 63 | def calculate_precision_and_recall(outputs, targets, pos_label=1): 64 | with torch.no_grad(): 65 | _, pred = outputs.topk(1, 1, largest=True, sorted=True) 66 | precision, recall, _, _ = precision_recall_fscore_support( 67 | targets.view(-1, 1).cpu().numpy(), 68 | pred.cpu().numpy()) 69 | 70 | return precision[pos_label], recall[pos_label] 71 | 72 | 73 | def worker_init_fn(worker_id): 74 | torch_seed = torch.initial_seed() 75 | 76 | random.seed(torch_seed + worker_id) 77 | 78 | if torch_seed >= 2**32: 79 | torch_seed = torch_seed % 2**32 80 | np.random.seed(torch_seed + worker_id) 81 | 82 | 83 | def get_lr(optimizer): 84 | lrs = [] 85 | for param_group in optimizer.param_groups: 86 | lr = float(param_group['lr']) 87 | lrs.append(lr) 88 | 89 | return max(lrs) 90 | 91 | 92 | def partialclass(cls, *args, **kwargs): 93 | 94 | class PartialClass(cls): 95 | __init__ = partialmethod(cls.__init__, *args, **kwargs) 96 | 97 | return PartialClass -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | from collections import defaultdict 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | from utils import AverageMeter 9 | 10 | 11 | def get_video_results(outputs, class_names, output_topk): 12 | sorted_scores, locs = torch.topk(outputs, 13 | k=min(output_topk, len(class_names))) 14 | 15 | video_results = [] 16 | for i in range(sorted_scores.size(0)): 17 | video_results.append({ 18 | 'label': class_names[locs[i].item()], 19 | 'score': sorted_scores[i].item() 20 | }) 21 | 22 | return video_results 23 | 24 | 25 | def inference(data_loader, model, result_path, class_names, no_average, 26 | output_topk): 27 | print('inference') 28 | 29 | model.eval() 30 | 31 | batch_time = AverageMeter() 32 | data_time = AverageMeter() 33 | results = {'results': defaultdict(list)} 34 | 35 | end_time = time.time() 36 | 37 | with torch.no_grad(): 38 | for i, (inputs, targets) in enumerate(data_loader): 39 | data_time.update(time.time() - end_time) 40 | 41 | video_ids, segments = zip(*targets) 42 | outputs = model(inputs) 43 | outputs = F.softmax(outputs, dim=1).cpu() 44 | 45 | for j in range(outputs.size(0)): 46 | results['results'][video_ids[j]].append({ 47 | 'segment': segments[j], 48 | 'output': outputs[j] 49 | }) 50 | 51 | batch_time.update(time.time() - end_time) 52 | end_time = time.time() 53 | 54 | print('[{}/{}]\t' 55 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 56 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format( 57 | i + 1, 58 | len(data_loader), 59 | batch_time=batch_time, 60 | data_time=data_time)) 61 | 62 | inference_results = {'results': {}} 63 | if not no_average: 64 | for video_id, video_results in results['results'].items(): 65 | video_outputs = [ 66 | segment_result['output'] for segment_result in video_results 67 | ] 68 | video_outputs = torch.stack(video_outputs) 69 | average_scores = torch.mean(video_outputs, dim=0) 70 | inference_results['results'][video_id] = get_video_results( 71 | average_scores, class_names, output_topk) 72 | else: 73 | for video_id, video_results in results['results'].items(): 74 | inference_results['results'][video_id] = [] 75 | for segment_result in video_results: 76 | segment = segment_result['segment'] 77 | result = get_video_results(segment_result['output'], 78 | class_names, output_topk) 79 | 
inference_results['results'][video_id].append({ 80 | 'segment': segment, 81 | 'result': result 82 | }) 83 | 84 | with result_path.open('w') as f: 85 | json.dump(inference_results, f) 86 | -------------------------------------------------------------------------------- /util_scripts/hmdb51_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | 7 | from .utils import get_n_frames 8 | 9 | 10 | def convert_csv_to_dict(csv_dir_path, split_index): 11 | database = {} 12 | for file_path in csv_dir_path.iterdir(): 13 | filename = file_path.name 14 | if 'split{}'.format(split_index) not in filename: 15 | continue 16 | 17 | data = pd.read_csv(csv_dir_path / filename, delimiter=' ', header=None) 18 | keys = [] 19 | subsets = [] 20 | for i in range(data.shape[0]): 21 | row = data.iloc[i, :] 22 | if row[1] == 0: 23 | continue 24 | elif row[1] == 1: 25 | subset = 'training' 26 | elif row[1] == 2: 27 | subset = 'validation' 28 | 29 | keys.append(row[0].split('.')[0]) 30 | subsets.append(subset) 31 | 32 | for i in range(len(keys)): 33 | key = keys[i] 34 | database[key] = {} 35 | database[key]['subset'] = subsets[i] 36 | label = '_'.join(filename.split('_')[:-2]) 37 | database[key]['annotations'] = {'label': label} 38 | 39 | return database 40 | 41 | 42 | def get_labels(csv_dir_path): 43 | labels = [] 44 | for file_path in csv_dir_path.iterdir(): 45 | labels.append('_'.join(file_path.name.split('_')[:-2])) 46 | return sorted(list(set(labels))) 47 | 48 | 49 | def convert_hmdb51_csv_to_json(csv_dir_path, split_index, video_dir_path, 50 | dst_json_path): 51 | labels = get_labels(csv_dir_path) 52 | database = convert_csv_to_dict(csv_dir_path, split_index) 53 | 54 | dst_data = {} 55 | dst_data['labels'] = labels 56 | dst_data['database'] = {} 57 | dst_data['database'].update(database) 58 | 59 | for k, v in dst_data['database'].items(): 60 | if v['annotations'] is not None: 61 | label = v['annotations']['label'] 62 | else: 63 | label = 'test' 64 | 65 | video_path = video_dir_path / label / k 66 | n_frames = get_n_frames(video_path) 67 | v['annotations']['segment'] = (1, n_frames + 1) 68 | 69 | with dst_json_path.open('w') as dst_file: 70 | json.dump(dst_data, dst_file) 71 | 72 | 73 | if __name__ == '__main__': 74 | parser = argparse.ArgumentParser() 75 | parser.add_argument('dir_path', 76 | default=None, 77 | type=Path, 78 | help='Directory path of HMDB51 annotation files.') 79 | parser.add_argument('video_path', 80 | default=None, 81 | type=Path, 82 | help=('Path of video directory (jpg).' 
83 | 'Using to get n_frames of each video.')) 84 | parser.add_argument('dst_dir_path', 85 | default=None, 86 | type=Path, 87 | help='Directory path of dst json file.') 88 | 89 | args = parser.parse_args() 90 | 91 | for split_index in range(1, 4): 92 | dst_json_path = args.dst_dir_path / 'hmdb51_{}.json'.format(split_index) 93 | convert_hmdb51_csv_to_json(args.dir_path, split_index, args.video_path, 94 | dst_json_path) 95 | -------------------------------------------------------------------------------- /util_scripts/eval_accuracy.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | from pathlib import Path 4 | 5 | 6 | def get_class_labels(data): 7 | class_labels_map = {} 8 | index = 0 9 | for class_label in data['labels']: 10 | class_labels_map[class_label] = index 11 | index += 1 12 | return class_labels_map 13 | 14 | 15 | def load_ground_truth(ground_truth_path, subset): 16 | with ground_truth_path.open('r') as f: 17 | data = json.load(f) 18 | 19 | class_labels_map = get_class_labels(data) 20 | 21 | ground_truth = [] 22 | for video_id, v in data['database'].items(): 23 | if subset != v['subset']: 24 | continue 25 | this_label = v['annotations']['label'] 26 | ground_truth.append((video_id, class_labels_map[this_label])) 27 | 28 | return ground_truth, class_labels_map 29 | 30 | 31 | def load_result(result_path, top_k, class_labels_map): 32 | with result_path.open('r') as f: 33 | data = json.load(f) 34 | 35 | result = {} 36 | for video_id, v in data['results'].items(): 37 | labels_and_scores = [] 38 | for this_result in v: 39 | label = class_labels_map[this_result['label']] 40 | score = this_result['score'] 41 | labels_and_scores.append((label, score)) 42 | labels_and_scores.sort(key=lambda x: x[1], reverse=True) 43 | result[video_id] = list(zip(*labels_and_scores[:top_k]))[0] 44 | return result 45 | 46 | 47 | def remove_nonexistent_ground_truth(ground_truth, result): 48 | exist_ground_truth = [line for line in ground_truth if line[0] in result] 49 | 50 | return exist_ground_truth 51 | 52 | 53 | def evaluate(ground_truth_path, result_path, subset, top_k, ignore): 54 | print('load ground truth') 55 | ground_truth, class_labels_map = load_ground_truth(ground_truth_path, 56 | subset) 57 | print('number of ground truth: {}'.format(len(ground_truth))) 58 | 59 | print('load result') 60 | result = load_result(result_path, top_k, class_labels_map) 61 | print('number of result: {}'.format(len(result))) 62 | 63 | n_ground_truth = len(ground_truth) 64 | ground_truth = remove_nonexistent_ground_truth(ground_truth, result) 65 | if ignore: 66 | n_ground_truth = len(ground_truth) 67 | 68 | print('calculate top-{} accuracy'.format(top_k)) 69 | correct = [1 if line[1] in result[line[0]] else 0 for line in ground_truth] 70 | accuracy = sum(correct) / n_ground_truth 71 | 72 | print('top-{} accuracy: {}'.format(top_k, accuracy)) 73 | return accuracy 74 | 75 | 76 | if __name__ == '__main__': 77 | parser = argparse.ArgumentParser() 78 | parser.add_argument('ground_truth_path', type=Path) 79 | parser.add_argument('result_path', type=Path) 80 | parser.add_argument('-k', type=int, default=1) 81 | parser.add_argument('--subset', type=str, default='validation') 82 | parser.add_argument('--save', action='store_true') 83 | parser.add_argument( 84 | '--ignore', 85 | action='store_true', 86 | help='ignore nonexistent videos in result') 87 | 88 | args = parser.parse_args() 89 | 90 | accuracy = evaluate(args.ground_truth_path, args.result_path, 
args.subset, 91 | args.k, args.ignore) 92 | 93 | if args.save: 94 | with (args.result_path.parent / 'top{}.txt'.format( 95 | args.k)).open('w') as f: 96 | f.write(str(accuracy)) 97 | -------------------------------------------------------------------------------- /util_scripts/ucf101_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | 7 | from .utils import get_n_frames 8 | 9 | 10 | def convert_csv_to_dict(csv_path, subset): 11 | data = pd.read_csv(csv_path, delimiter=' ', header=None) 12 | keys = [] 13 | key_labels = [] 14 | for i in range(data.shape[0]): 15 | row = data.iloc[i, :] 16 | slash_rows = data.iloc[i, 0].split('/') 17 | class_name = slash_rows[0] 18 | basename = slash_rows[1].split('.')[0] 19 | 20 | keys.append(basename) 21 | key_labels.append(class_name) 22 | 23 | database = {} 24 | for i in range(len(keys)): 25 | key = keys[i] 26 | database[key] = {} 27 | database[key]['subset'] = subset 28 | label = key_labels[i] 29 | database[key]['annotations'] = {'label': label} 30 | 31 | return database 32 | 33 | 34 | def load_labels(label_csv_path): 35 | data = pd.read_csv(label_csv_path, delimiter=' ', header=None) 36 | labels = [] 37 | for i in range(data.shape[0]): 38 | labels.append(data.iloc[i, 1]) 39 | return labels 40 | 41 | 42 | def convert_ucf101_csv_to_json(label_csv_path, train_csv_path, val_csv_path, 43 | video_dir_path, dst_json_path): 44 | labels = load_labels(label_csv_path) 45 | train_database = convert_csv_to_dict(train_csv_path, 'training') 46 | val_database = convert_csv_to_dict(val_csv_path, 'validation') 47 | 48 | dst_data = {} 49 | dst_data['labels'] = labels 50 | dst_data['database'] = {} 51 | dst_data['database'].update(train_database) 52 | dst_data['database'].update(val_database) 53 | 54 | for k, v in dst_data['database'].items(): 55 | if v['annotations'] is not None: 56 | label = v['annotations']['label'] 57 | else: 58 | label = 'test' 59 | 60 | video_path = video_dir_path / label / k 61 | n_frames = get_n_frames(video_path) 62 | v['annotations']['segment'] = (1, n_frames + 1) 63 | 64 | with dst_json_path.open('w') as dst_file: 65 | json.dump(dst_data, dst_file) 66 | 67 | 68 | if __name__ == '__main__': 69 | parser = argparse.ArgumentParser() 70 | parser.add_argument('dir_path', 71 | default=None, 72 | type=Path, 73 | help=('Directory path including classInd.txt, ' 74 | 'trainlist0-.txt, testlist0-.txt')) 75 | parser.add_argument('video_path', 76 | default=None, 77 | type=Path, 78 | help=('Path of video directory (jpg).' 
79 | 'Using to get n_frames of each video.')) 80 | parser.add_argument('dst_path', 81 | default=None, 82 | type=Path, 83 | help='Directory path of dst json file.') 84 | 85 | args = parser.parse_args() 86 | 87 | for split_index in range(1, 4): 88 | label_csv_path = args.dir_path / 'classInd.txt' 89 | train_csv_path = args.dir_path / 'trainlist0{}.txt'.format(split_index) 90 | val_csv_path = args.dir_path / 'testlist0{}.txt'.format(split_index) 91 | dst_json_path = args.dst_path / 'ucf101_0{}.json'.format(split_index) 92 | 93 | convert_ucf101_csv_to_json(label_csv_path, train_csv_path, val_csv_path, 94 | args.video_path, dst_json_path) 95 | -------------------------------------------------------------------------------- /models/pre_act_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .resnet import conv3x3x3, conv1x1x1, get_inplanes, ResNet 6 | 7 | 8 | class PreActivationBasicBlock(nn.Module): 9 | expansion = 1 10 | 11 | def __init__(self, inplanes, planes, stride=1, downsample=None): 12 | super().__init__() 13 | 14 | self.bn1 = nn.BatchNorm3d(inplanes) 15 | self.conv1 = conv3x3x3(inplanes, planes, stride) 16 | self.bn2 = nn.BatchNorm3d(planes) 17 | self.conv2 = conv3x3x3(planes, planes) 18 | self.relu = nn.ReLU(inplace=True) 19 | self.downsample = downsample 20 | self.stride = stride 21 | 22 | def forward(self, x): 23 | residual = x 24 | 25 | out = self.bn1(x) 26 | out = self.relu(out) 27 | out = self.conv1(out) 28 | 29 | out = self.bn2(out) 30 | out = self.relu(out) 31 | out = self.conv2(out) 32 | 33 | if self.downsample is not None: 34 | residual = self.downsample(x) 35 | 36 | out += residual 37 | 38 | return out 39 | 40 | 41 | class PreActivationBottleneck(nn.Module): 42 | expansion = 4 43 | 44 | def __init__(self, inplanes, planes, stride=1, downsample=None): 45 | super().__init__() 46 | 47 | self.bn1 = nn.BatchNorm3d(inplanes) 48 | self.conv1 = conv1x1x1(inplanes, planes) 49 | self.bn2 = nn.BatchNorm3d(planes) 50 | self.conv2 = conv3x3x3(planes, planes, stride) 51 | self.bn3 = nn.BatchNorm3d(planes) 52 | self.conv3 = conv1x1x1(planes, planes * self.expansion) 53 | self.relu = nn.ReLU(inplace=True) 54 | self.downsample = downsample 55 | self.stride = stride 56 | 57 | def forward(self, x): 58 | residual = x 59 | 60 | out = self.bn1(x) 61 | out = self.relu(out) 62 | out = self.conv1(out) 63 | 64 | out = self.bn2(out) 65 | out = self.relu(out) 66 | out = self.conv2(out) 67 | 68 | out = self.bn3(out) 69 | out = self.relu(out) 70 | out = self.conv3(out) 71 | 72 | if self.downsample is not None: 73 | residual = self.downsample(x) 74 | 75 | out += residual 76 | 77 | return out 78 | 79 | 80 | def generate_model(model_depth, **kwargs): 81 | assert model_depth in [10, 18, 34, 50, 101, 152, 200] 82 | 83 | if model_depth == 10: 84 | model = ResNet(PreActivationBasicBlock, [1, 1, 1, 1], get_inplanes(), 85 | **kwargs) 86 | elif model_depth == 18: 87 | model = ResNet(PreActivationBasicBlock, [2, 2, 2, 2], get_inplanes(), 88 | **kwargs) 89 | elif model_depth == 34: 90 | model = ResNet(PreActivationBasicBlock, [3, 4, 6, 3], get_inplanes(), 91 | **kwargs) 92 | elif model_depth == 50: 93 | model = ResNet(PreActivationBottleneck, [3, 4, 6, 3], get_inplanes(), 94 | **kwargs) 95 | elif model_depth == 101: 96 | model = ResNet(PreActivationBottleneck, [3, 4, 23, 3], get_inplanes(), 97 | **kwargs) 98 | elif model_depth == 152: 99 | model = ResNet(PreActivationBottleneck, [3, 8, 
36, 3], get_inplanes(), 100 | **kwargs) 101 | elif model_depth == 200: 102 | model = ResNet(PreActivationBottleneck, [3, 24, 36, 3], get_inplanes(), 103 | **kwargs) 104 | 105 | return model 106 | -------------------------------------------------------------------------------- /util_scripts/mit_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | 7 | from .utils import get_n_frames 8 | 9 | 10 | def convert_csv_to_dict(csv_path, subset): 11 | data = pd.read_csv(csv_path, header=None) 12 | keys = [] 13 | key_labels = [] 14 | if subset == 'testing': 15 | for i in range(data.shape[0]): 16 | basename = data.iloc[i, 0].split('/') 17 | assert len(basename) == 1 18 | basename = Path(basename[0]).stem 19 | 20 | keys.append(basename) 21 | else: 22 | for i in range(data.shape[0]): 23 | basename = data.iloc[i, 0].split('/') 24 | assert len(basename) == 2 25 | basename = Path(basename[1]).stem 26 | 27 | keys.append(basename) 28 | key_labels.append(data.iloc[i, 1]) 29 | 30 | database = {} 31 | for i in range(len(keys)): 32 | key = keys[i] 33 | database[key] = {} 34 | database[key]['subset'] = subset 35 | if subset != 'testing': 36 | label = key_labels[i] 37 | database[key]['annotations'] = {'label': label} 38 | else: 39 | database[key]['annotations'] = {} 40 | 41 | return database 42 | 43 | 44 | def load_labels(train_csv_path): 45 | data = pd.read_csv(train_csv_path, header=None) 46 | return data.iloc[:, 0].tolist() 47 | 48 | 49 | def convert_mit_csv_to_json(class_file_path, train_csv_path, val_csv_path, 50 | test_csv_path, video_dir_path, dst_json_path): 51 | labels = load_labels(class_file_path) 52 | train_database = convert_csv_to_dict(train_csv_path, 'training') 53 | val_database = convert_csv_to_dict(val_csv_path, 'validation') 54 | if test_csv_path.exists(): 55 | test_database = convert_csv_to_dict(test_csv_path, 'testing') 56 | 57 | dst_data = {} 58 | dst_data['labels'] = labels 59 | dst_data['database'] = {} 60 | dst_data['database'].update(train_database) 61 | dst_data['database'].update(val_database) 62 | if test_csv_path.exists(): 63 | dst_data['database'].update(test_database) 64 | 65 | for k, v in dst_data['database'].items(): 66 | if 'label' in v['annotations']: 67 | label = v['annotations']['label'] 68 | else: 69 | label = 'test' 70 | 71 | video_path = video_dir_path / label / k 72 | n_frames = get_n_frames(video_path) 73 | v['annotations']['segment'] = (1, n_frames + 1) 74 | 75 | with dst_json_path.open('w') as dst_file: 76 | json.dump(dst_data, dst_file) 77 | 78 | 79 | if __name__ == '__main__': 80 | parser = argparse.ArgumentParser() 81 | parser.add_argument( 82 | 'dir_path', 83 | default=None, 84 | type=Path, 85 | help=('Directory path including moments_categories.txt, ' 86 | 'trainingSet.csv, validationSet.csv, ' 87 | '(testingSet.csv (optional))')) 88 | parser.add_argument('video_path', 89 | default=None, 90 | type=Path, 91 | help=('Path of video directory (jpg).' 
92 | 'Using to get n_frames of each video.')) 93 | parser.add_argument('dst_path', 94 | default=None, 95 | type=Path, 96 | help='Path of dst json file.') 97 | 98 | args = parser.parse_args() 99 | 100 | class_file_path = args.dir_path / 'moments_categories.txt' 101 | train_csv_path = args.dir_path / 'trainingSet.csv' 102 | val_csv_path = args.dir_path / 'validationSet.csv' 103 | test_csv_path = args.dir_path / 'testingSet.csv' 104 | 105 | convert_mit_csv_to_json(class_file_path, train_csv_path, val_csv_path, 106 | test_csv_path, args.video_path, args.dst_path) 107 | -------------------------------------------------------------------------------- /util_scripts/generate_video_jpgs.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import argparse 3 | from pathlib import Path 4 | 5 | from joblib import Parallel, delayed 6 | 7 | 8 | def video_process(video_file_path, dst_root_path, ext, fps=-1, size=240): 9 | if ext != video_file_path.suffix: 10 | return 11 | 12 | ffprobe_cmd = ('ffprobe -v error -select_streams v:0 ' 13 | '-of default=noprint_wrappers=1:nokey=1 -show_entries ' 14 | 'stream=width,height,avg_frame_rate,duration').split() 15 | ffprobe_cmd.append(str(video_file_path)) 16 | 17 | p = subprocess.run(ffprobe_cmd, capture_output=True) 18 | res = p.stdout.decode('utf-8').splitlines() 19 | if len(res) < 4: 20 | return 21 | 22 | frame_rate = [float(r) for r in res[2].split('/')] 23 | frame_rate = frame_rate[0] / frame_rate[1] 24 | duration = float(res[3]) 25 | n_frames = int(frame_rate * duration) 26 | 27 | name = video_file_path.stem 28 | dst_dir_path = dst_root_path / name 29 | dst_dir_path.mkdir(exist_ok=True) 30 | n_exist_frames = len([ 31 | x for x in dst_dir_path.iterdir() 32 | if x.suffix == '.jpg' and x.name[0] != '.' 33 | ]) 34 | 35 | if n_exist_frames >= n_frames: 36 | return 37 | 38 | width = int(res[0]) 39 | height = int(res[1]) 40 | 41 | if width > height: 42 | vf_param = 'scale=-1:{}'.format(size) 43 | else: 44 | vf_param = 'scale={}:-1'.format(size) 45 | 46 | if fps > 0: 47 | vf_param += ',minterpolate={}'.format(fps) 48 | 49 | ffmpeg_cmd = ['ffmpeg', '-i', str(video_file_path), '-vf', vf_param] 50 | ffmpeg_cmd += ['-threads', '1', '{}/image_%05d.jpg'.format(dst_dir_path)] 51 | print(ffmpeg_cmd) 52 | subprocess.run(ffmpeg_cmd) 53 | print('\n') 54 | 55 | 56 | def class_process(class_dir_path, dst_root_path, ext, fps=-1, size=240): 57 | if not class_dir_path.is_dir(): 58 | return 59 | 60 | dst_class_path = dst_root_path / class_dir_path.name 61 | dst_class_path.mkdir(exist_ok=True) 62 | 63 | for video_file_path in sorted(class_dir_path.iterdir()): 64 | video_process(video_file_path, dst_class_path, ext, fps, size) 65 | 66 | 67 | if __name__ == '__main__': 68 | parser = argparse.ArgumentParser() 69 | parser.add_argument( 70 | 'dir_path', default=None, type=Path, help='Directory path of videos') 71 | parser.add_argument( 72 | 'dst_path', 73 | default=None, 74 | type=Path, 75 | help='Directory path of jpg videos') 76 | parser.add_argument( 77 | 'dataset', 78 | default='', 79 | type=str, 80 | help='Dataset name (kinetics | mit | ucf101 | hmdb51 | activitynet)') 81 | parser.add_argument( 82 | '--n_jobs', default=-1, type=int, help='Number of parallel jobs') 83 | parser.add_argument( 84 | '--fps', 85 | default=-1, 86 | type=int, 87 | help=('Frame rates of output videos. 
' 88 | '-1 means original frame rates.')) 89 | parser.add_argument( 90 | '--size', default=240, type=int, help='Frame size of output videos.') 91 | args = parser.parse_args() 92 | 93 | if args.dataset in ['kinetics', 'mit', 'activitynet']: 94 | ext = '.mp4' 95 | else: 96 | ext = '.avi' 97 | 98 | if args.dataset == 'activitynet': 99 | video_file_paths = [x for x in sorted(args.dir_path.iterdir())] 100 | status_list = Parallel( 101 | n_jobs=args.n_jobs, 102 | backend='threading')(delayed(video_process)( 103 | video_file_path, args.dst_path, ext, args.fps, args.size) 104 | for video_file_path in video_file_paths) 105 | else: 106 | class_dir_paths = [x for x in sorted(args.dir_path.iterdir())] 107 | test_set_video_path = args.dir_path / 'test' 108 | if test_set_video_path.exists(): 109 | class_dir_paths.append(test_set_video_path) 110 | 111 | status_list = Parallel( 112 | n_jobs=args.n_jobs, 113 | backend='threading')(delayed(class_process)( 114 | class_dir_path, args.dst_path, ext, args.fps, args.size) 115 | for class_dir_path in class_dir_paths) 116 | -------------------------------------------------------------------------------- /util_scripts/sthv2_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | 7 | from utils import get_n_frames 8 | import pdb 9 | import tqdm 10 | import json 11 | 12 | 13 | def convert_json_to_dict(csv_path, subset): 14 | lines = json.load(open(csv_path,'r')) 15 | database = {} 16 | 17 | for line in lines: 18 | video_id = line['id'] 19 | database[video_id] = {} 20 | database[video_id]['subset'] = subset 21 | if subset != 'testing': 22 | label = line['template'].replace('[','').replace(']','') 23 | database[video_id]['annotations'] = {'label': label} 24 | else: 25 | database[video_id]['annotations'] = {} 26 | 27 | return database 28 | 29 | 30 | 31 | def convert_csv_to_dict(csv_path, subset): 32 | lines = open(csv_path, 'r').readlines() 33 | keys = [] 34 | key_labels = [] 35 | database = {} 36 | 37 | for line in lines: 38 | video_id, nframe, label = line.strip('\n').split(' ') 39 | 40 | database[video_id] = {} 41 | database[video_id]['subset'] = subset 42 | if subset != 'testing': 43 | database[video_id]['annotations'] = {'label': label} 44 | else: 45 | database[video_id]['annotations'] = {} 46 | 47 | return database 48 | 49 | 50 | def load_labels(train_csv_path): 51 | data = open(train_csv_path, 'r').readlines() 52 | data = [e.strip('\n') for e in data] 53 | return data 54 | # data = pd.read_csv(train_csv_path, header=None) 55 | # return data.iloc[:, 0].tolist() 56 | 57 | 58 | def convert_sthv2_csv_to_json(class_file_path, train_csv_path, val_csv_path, 59 | test_csv_path, video_dir_path, dst_json_path): 60 | labels = load_labels(class_file_path) 61 | train_database = convert_json_to_dict(train_csv_path, 'training') 62 | val_database = convert_json_to_dict(val_csv_path, 'validation') 63 | if test_csv_path.exists(): 64 | test_database = convert_json_to_dict(test_csv_path, 'testing') 65 | 66 | dst_data = {} 67 | dst_data['labels'] = labels 68 | dst_data['database'] = {} 69 | dst_data['database'].update(train_database) 70 | dst_data['database'].update(val_database) 71 | if test_csv_path.exists(): 72 | dst_data['database'].update(test_database) 73 | 74 | count = 0 75 | for k, v in tqdm.tqdm(dst_data['database'].items()): 76 | if 'label' in v['annotations']: 77 | label = v['annotations']['label'] 78 | else: 79 | label = 'test' 80 | 81 | 
video_path = video_dir_path / k 82 | n_frames = get_n_frames(video_path) 83 | v['annotations']['segment'] = (1, n_frames + 1) 84 | v['video_path'] = str(video_path) 85 | # count += 1 86 | # if count == 1000: 87 | # break 88 | 89 | with dst_json_path.open('w') as dst_file: 90 | json.dump(dst_data, dst_file) 91 | 92 | 93 | if __name__ == '__main__': 94 | parser = argparse.ArgumentParser() 95 | parser.add_argument( 96 | 'dir_path', 97 | default='data/something/v2', 98 | type=Path, 99 | help=('Directory path including moments_categories.txt, ' 100 | 'trainingSet.csv, validationSet.csv, ' 101 | '(testingSet.csv (optional))')) 102 | parser.add_argument('video_path', 103 | default='data/something/v2/img', 104 | type=Path, 105 | help=('Path of video directory (jpg).' 106 | 'Using to get n_frames of each video.')) 107 | parser.add_argument('dst_path', 108 | default='./', 109 | type=Path, 110 | help='Path of dst json file.') 111 | 112 | args = parser.parse_args() 113 | 114 | class_file_path = args.dir_path / 'category.txt' 115 | train_csv_path = args.dir_path / 'something-something-v2-train.json' 116 | val_csv_path = args.dir_path / 'something-something-v2-validation.json' 117 | test_csv_path = args.dir_path / 'something-something-v2-test.json' 118 | # train_csv_path = args.dir_path / 'train_videofolder.txt' 119 | # val_csv_path = args.dir_path / 'val_videofolder.txt' 120 | # test_csv_path = args.dir_path / 'test_videofolder.txt' 121 | 122 | convert_sthv2_csv_to_json(class_file_path, train_csv_path, val_csv_path, 123 | test_csv_path, args.video_path, args.dst_path) 124 | -------------------------------------------------------------------------------- /util_scripts/sthv1_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | 7 | from utils import get_n_frames 8 | import pdb 9 | import tqdm 10 | import json 11 | 12 | 13 | def convert_json_to_dict(csv_path, subset): 14 | lines = json.load(open(csv_path,'r')) 15 | database = {} 16 | 17 | for line in lines: 18 | video_id = line['id'] 19 | database[video_id] = {} 20 | database[video_id]['subset'] = subset 21 | if subset != 'testing': 22 | label = line['template'].replace('[','').replace(']','') 23 | database[video_id]['annotations'] = {'label': label} 24 | else: 25 | database[video_id]['annotations'] = {} 26 | 27 | return database 28 | 29 | 30 | 31 | def convert_csv_to_dict(csv_path, subset): 32 | lines = open(csv_path, 'r').readlines() 33 | keys = [] 34 | key_labels = [] 35 | database = {} 36 | 37 | for line in lines: 38 | if subset != 'testing': 39 | video_id, label = line.strip('\n').split(';') 40 | else: 41 | video_id = line.strip('\n') 42 | 43 | database[video_id] = {} 44 | database[video_id]['subset'] = subset 45 | if subset != 'testing': 46 | database[video_id]['annotations'] = {'label': label} 47 | else: 48 | database[video_id]['annotations'] = {} 49 | 50 | return database 51 | 52 | 53 | def load_labels(train_csv_path): 54 | data = open(train_csv_path, 'r').readlines() 55 | data = [e.strip('\n') for e in data] 56 | return data 57 | # data = pd.read_csv(train_csv_path, header=None) 58 | # return data.iloc[:, 0].tolist() 59 | 60 | 61 | def convert_sthv1_csv_to_json(class_file_path, train_csv_path, val_csv_path, 62 | test_csv_path, video_dir_path, dst_json_path): 63 | labels = load_labels(class_file_path) 64 | train_database = convert_csv_to_dict(train_csv_path, 'training') 65 | val_database = 
convert_csv_to_dict(val_csv_path, 'validation') 66 | if test_csv_path.exists(): 67 | test_database = convert_csv_to_dict(test_csv_path, 'testing') 68 | 69 | dst_data = {} 70 | dst_data['labels'] = labels 71 | dst_data['database'] = {} 72 | dst_data['database'].update(train_database) 73 | dst_data['database'].update(val_database) 74 | if test_csv_path.exists(): 75 | dst_data['database'].update(test_database) 76 | 77 | count = 0 78 | for k, v in tqdm.tqdm(dst_data['database'].items()): 79 | if 'label' in v['annotations']: 80 | label = v['annotations']['label'] 81 | else: 82 | label = 'test' 83 | 84 | video_path = video_dir_path / k 85 | n_frames = get_n_frames(video_path) 86 | v['annotations']['segment'] = (1, n_frames + 1) 87 | v['video_path'] = str(video_path) 88 | # count += 1 89 | # if count == 1000: 90 | # break 91 | 92 | with dst_json_path.open('w') as dst_file: 93 | json.dump(dst_data, dst_file) 94 | 95 | 96 | if __name__ == '__main__': 97 | parser = argparse.ArgumentParser() 98 | parser.add_argument( 99 | 'dir_path', 100 | default='data/something/v1', 101 | type=Path, 102 | help=('Directory path including moments_categories.txt, ' 103 | 'trainingSet.csv, validationSet.csv, ' 104 | '(testingSet.csv (optional))')) 105 | parser.add_argument('video_path', 106 | default='data/something/v1/img', 107 | type=Path, 108 | help=('Path of video directory (jpg).' 109 | 'Using to get n_frames of each video.')) 110 | parser.add_argument('dst_path', 111 | default='./', 112 | type=Path, 113 | help='Path of dst json file.') 114 | 115 | args = parser.parse_args() 116 | 117 | class_file_path = args.dir_path / 'category.txt' 118 | train_csv_path = args.dir_path / 'something-something-v1-train.csv' 119 | val_csv_path = args.dir_path / 'something-something-v1-validation.csv' 120 | test_csv_path = args.dir_path / 'something-something-v1-test.csv' 121 | # train_csv_path = args.dir_path / 'train_videofolder.txt' 122 | # val_csv_path = args.dir_path / 'val_videofolder.txt' 123 | # test_csv_path = args.dir_path / 'test_videofolder.txt' 124 | 125 | convert_sthv1_csv_to_json(class_file_path, train_csv_path, val_csv_path, 126 | test_csv_path, args.video_path, args.dst_path) 127 | -------------------------------------------------------------------------------- /util_scripts/kinetics_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | 7 | from .utils import get_n_frames, get_n_frames_hdf5 8 | 9 | 10 | def convert_csv_to_dict(csv_path, subset): 11 | data = pd.read_csv(csv_path) 12 | keys = [] 13 | key_labels = [] 14 | for i in range(data.shape[0]): 15 | row = data.iloc[i, :] 16 | basename = '%s_%s_%s' % (row['youtube_id'], '%06d' % row['time_start'], 17 | '%06d' % row['time_end']) 18 | keys.append(basename) 19 | if subset != 'testing': 20 | key_labels.append(row['label']) 21 | 22 | database = {} 23 | for i in range(len(keys)): 24 | key = keys[i] 25 | database[key] = {} 26 | database[key]['subset'] = subset 27 | if subset != 'testing': 28 | label = key_labels[i] 29 | database[key]['annotations'] = {'label': label} 30 | else: 31 | database[key]['annotations'] = {} 32 | 33 | return database 34 | 35 | 36 | def load_labels(train_csv_path): 37 | data = pd.read_csv(train_csv_path) 38 | return data['label'].unique().tolist() 39 | 40 | 41 | def convert_kinetics_csv_to_json(train_csv_path, val_csv_path, test_csv_path, 42 | video_dir_path, video_type, dst_json_path): 43 | labels = 
load_labels(train_csv_path) 44 | train_database = convert_csv_to_dict(train_csv_path, 'training') 45 | val_database = convert_csv_to_dict(val_csv_path, 'validation') 46 | if test_csv_path.exists(): 47 | test_database = convert_csv_to_dict(test_csv_path, 'testing') 48 | 49 | dst_data = {} 50 | dst_data['labels'] = labels 51 | dst_data['database'] = {} 52 | dst_data['database'].update(train_database) 53 | dst_data['database'].update(val_database) 54 | if test_csv_path.exists(): 55 | dst_data['database'].update(test_database) 56 | 57 | for k, v in dst_data['database'].items(): 58 | if 'label' in v['annotations']: 59 | label = v['annotations']['label'] 60 | else: 61 | label = 'test' 62 | 63 | if video_type == 'jpg': 64 | video_path = video_dir_path / label / k 65 | if video_path.exists(): 66 | n_frames = get_n_frames(video_path) 67 | v['annotations']['segment'] = (1, n_frames + 1) 68 | else: 69 | video_path = video_dir_path / label / f'{k}.hdf5' 70 | if video_path.exists(): 71 | n_frames = get_n_frames_hdf5(video_path) 72 | v['annotations']['segment'] = (0, n_frames) 73 | 74 | with dst_json_path.open('w') as dst_file: 75 | json.dump(dst_data, dst_file) 76 | 77 | 78 | if __name__ == '__main__': 79 | parser = argparse.ArgumentParser() 80 | parser.add_argument('dir_path', 81 | default=None, 82 | type=Path, 83 | help=('Directory path including ' 84 | 'kinetics_train.csv, kinetics_val.csv, ' 85 | '(kinetics_test.csv (optional))')) 86 | parser.add_argument( 87 | 'n_classes', 88 | default=700, 89 | type=int, 90 | help='400, 600, or 700 (Kinetics-400, Kinetics-600, or Kinetics-700)') 91 | parser.add_argument('video_path', 92 | default=None, 93 | type=Path, 94 | help=('Path of video directory (jpg or hdf5).' 95 | 'Using to get n_frames of each video.')) 96 | parser.add_argument('video_type', 97 | default='jpg', 98 | type=str, 99 | help=('jpg or hdf5')) 100 | parser.add_argument('dst_path', 101 | default=None, 102 | type=Path, 103 | help='Path of dst json file.') 104 | 105 | args = parser.parse_args() 106 | 107 | assert args.video_type in ['jpg', 'hdf5'] 108 | 109 | train_csv_path = (args.dir_path / 110 | 'kinetics-{}_train.csv'.format(args.n_classes)) 111 | val_csv_path = (args.dir_path / 112 | 'kinetics-{}_val.csv'.format(args.n_classes)) 113 | test_csv_path = (args.dir_path / 114 | 'kinetics-{}_test.csv'.format(args.n_classes)) 115 | 116 | convert_kinetics_csv_to_json(train_csv_path, val_csv_path, test_csv_path, 117 | args.video_path, args.video_type, 118 | args.dst_path) 119 | -------------------------------------------------------------------------------- /validation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import sys 4 | import pdb 5 | import torch 6 | import torch.distributed as dist 7 | 8 | from utils import AverageMeter, calculate_accuracy 9 | 10 | 11 | def val_epoch(epoch, 12 | data_loader, 13 | model, 14 | criterion, 15 | device, 16 | logger, 17 | tb_writer=None, 18 | distributed=False, 19 | rpn=None, 20 | det_interval=2, 21 | nrois=10): 22 | print('validation at epoch {}'.format(epoch)) 23 | 24 | model.eval() 25 | if rpn is not None: 26 | rpn.eval() 27 | 28 | batch_time = AverageMeter() 29 | data_time = AverageMeter() 30 | losses = AverageMeter() 31 | accuracies = AverageMeter() 32 | 33 | end_time = time.time() 34 | 35 | with torch.no_grad(): 36 | for i, (inputs, targets) in enumerate(data_loader): 37 | data_time.update(time.time() - end_time) 38 | targets = targets.to(device, non_blocking=True) 39 | if rpn is 
not None: 40 | ''' 41 | There was an unexpected CUDNN_ERROR when len(rpn_inputs) is 42 | decrased. 43 | ''' 44 | T = inputs.shape[2] 45 | N, C, T, H, W = inputs.size() 46 | if i == 0: 47 | max_N = N 48 | # sample frames for RPN 49 | sample = torch.arange(0,T,det_interval) 50 | rpn_inputs = inputs[:,:,sample].transpose(1,2).contiguous() 51 | rpn_inputs = rpn_inputs.view(-1,C,H,W) 52 | if len(inputs) < max_N: 53 | print("Modified from {} to {}".format(len(inputs), max_N)) 54 | rpn_inputs = torch.cat((rpn_inputs, rpn_inputs[:(max_N-len(inputs))*(T//det_interval)])) 55 | with torch.no_grad(): 56 | proposals = rpn(rpn_inputs) 57 | proposals = proposals.view(-1,T//det_interval,nrois,4) 58 | if len(inputs) < max_N: 59 | proposals = proposals[:len(inputs)] 60 | outputs = model(inputs, proposals.detach()) 61 | # update to the largest batch_size 62 | max_N = max(N, max_N) 63 | else: 64 | outputs = model(inputs) 65 | 66 | loss = criterion(outputs, targets) 67 | acc = calculate_accuracy(outputs, targets) 68 | 69 | losses.update(loss.item(), inputs.size(0)) 70 | accuracies.update(acc, inputs.size(0)) 71 | 72 | batch_time.update(time.time() - end_time) 73 | end_time = time.time() 74 | 75 | print('Epoch: [{0}][{1}/{2}]\t' 76 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 77 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 78 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 79 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 80 | epoch, 81 | i + 1, 82 | len(data_loader), 83 | batch_time=batch_time, 84 | data_time=data_time, 85 | loss=losses, 86 | acc=accuracies)) 87 | 88 | if distributed: 89 | loss_sum = torch.tensor([losses.sum], 90 | dtype=torch.float32, 91 | device=device) 92 | loss_count = torch.tensor([losses.count], 93 | dtype=torch.float32, 94 | device=device) 95 | acc_sum = torch.tensor([accuracies.sum], 96 | dtype=torch.float32, 97 | device=device) 98 | acc_count = torch.tensor([accuracies.count], 99 | dtype=torch.float32, 100 | device=device) 101 | 102 | dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) 103 | dist.all_reduce(loss_count, op=dist.ReduceOp.SUM) 104 | dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) 105 | dist.all_reduce(acc_count, op=dist.ReduceOp.SUM) 106 | 107 | losses.avg = loss_sum.item() / loss_count.item() 108 | accuracies.avg = acc_sum.item() / acc_count.item() 109 | 110 | if logger is not None: 111 | logger.log({'epoch': epoch, 'loss': losses.avg, 'acc': accuracies.avg}) 112 | 113 | if tb_writer is not None: 114 | tb_writer.add_scalar('val/loss', losses.avg, epoch) 115 | tb_writer.add_scalar('val/acc', accuracies.avg, epoch) 116 | 117 | return losses.avg 118 | -------------------------------------------------------------------------------- /strg.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import pdb 3 | import os 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torchvision.ops import RoIAlign 9 | 10 | from rgcn_models import RGCN 11 | 12 | from model import (generate_model, load_pretrained_model, make_data_parallel, 13 | get_fine_tuning_parameters) 14 | 15 | from models import resnet, resnet2p1d, pre_act_resnet, wide_resnet, resnext, densenet 16 | 17 | class STRG(nn.Module): 18 | def __init__(self, base_model, in_channel=2048, out_channel=512, 19 | nclass=174, dropout=0.3, nrois=10, 20 | freeze_bn=True, freeze_bn_affine=True, 21 | roi_size=7 22 | ): 23 | super(STRG,self).__init__() 24 | self.base_model = base_model 25 | self.in_channel = in_channel 
26 | self.out_channel = out_channel 27 | self.nclass = nclass 28 | self.nrois = nrois 29 | 30 | self.freeze_bn = freeze_bn 31 | self.freeze_bn_affine = freeze_bn_affine 32 | 33 | self.base_model.fc = nn.Identity() 34 | self.base_model.avgpool = nn.Identity() 35 | if False: 36 | self.base_model.maxpool.stride = (1,2,2) 37 | self.base_model.layer3[0].conv2.stride=(1,2,2) 38 | self.base_model.layer3[0].downsample[0].stride=(1,2,2) 39 | self.base_model.layer4[0].conv2.stride=(1,1,1) 40 | self.base_model.layer4[0].downsample[0].stride=(1,1,1) 41 | 42 | self.reducer = nn.Conv3d(self.in_channel, self.out_channel,1) 43 | self.classifier = nn.Linear(2*self.out_channel, nclass) 44 | self.avg_pool = nn.Sequential( 45 | nn.AdaptiveAvgPool3d(1), 46 | nn.Dropout(p=dropout) 47 | ) 48 | self.max_pool = nn.AdaptiveAvgPool2d(1) 49 | 50 | self.strg_gcn = RGCN() 51 | self.roi_align = RoIAlign((roi_size,roi_size), 1/8, -1, aligned=True) 52 | 53 | def extract_feature(self, x): 54 | return self.base_model.extract_feature(x) 55 | 56 | 57 | # x = self.base_model.conv1(x) 58 | # x = self.base_model.bn1(x) 59 | # x = self.base_model.relu(x) 60 | # if not self.base_model.no_max_pool: 61 | # x = self.base_model.maxpool(x) 62 | 63 | # x = self.base_model.layer1(x) 64 | # x = self.base_model.layer2(x) 65 | # x = self.base_model.layer3(x) 66 | # x = self.base_model.layer4(x) 67 | # return x 68 | 69 | 70 | def forward(self, inputs, rois=None): 71 | features = self.extract_feature(inputs) 72 | features = self.reducer(features) # N C T H W 73 | pooled_features = self.avg_pool(features).squeeze(-1).squeeze(-1).squeeze(-1) 74 | N, C, T, H, W = features.shape 75 | 76 | rois_list = rois.view(-1, self.nrois, 4) 77 | rois_list = [r for r in rois_list] 78 | 79 | features = features.transpose(1,2).contiguous().view(N*T,C,H,W) 80 | rois_features = self.roi_align(features, rois_list) 81 | rois_features = self.max_pool(rois_features) 82 | rois_features = rois_features.view(N,T,self.nrois,C) 83 | gcn_features = self.strg_gcn(rois_features, rois) 84 | 85 | features = torch.cat((pooled_features, gcn_features), dim=-1) 86 | outputs = self.classifier(features) 87 | 88 | return outputs 89 | 90 | 91 | def train(self, mode=True): 92 | """ 93 | Override the default train() to freeze the BN parameters 94 | """ 95 | 96 | super(STRG, self).train(mode) 97 | if self.freeze_bn: 98 | print("Freezing Mean/Var of BatchNorm2D.") 99 | if self.freeze_bn_affine: 100 | print("Freezing Weight/Bias of BatchNorm2D.") 101 | if self.freeze_bn: 102 | for m in self.base_model.modules(): 103 | if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d) or isinstance(m, nn.BatchNorm1d): 104 | m.eval() 105 | if self.freeze_bn_affine: 106 | m.weight.requires_grad = False 107 | m.bias.requires_grad = False 108 | 109 | 110 | if __name__ == '__main__': 111 | 112 | model = resnet.generate_model(model_depth=50, 113 | n_classes=174, 114 | n_input_channels=3, 115 | shortcut_type='B', 116 | conv1_t_size=7, 117 | conv1_t_stride=1, 118 | no_max_pool=False, 119 | widen_factor=1.0) 120 | 121 | rois = torch.rand((4,8,10,4)) 122 | inputs = torch.rand((4,3,16,224,224)) 123 | strg = STRG(model) 124 | out = strg(inputs, rois) 125 | 126 | pdb.set_trace() 127 | print(out.shape) 128 | -------------------------------------------------------------------------------- /temporal_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | 5 | class Compose(object): 6 | 7 | def __init__(self, transforms): 8 | 
self.transforms = transforms 9 | 10 | def __call__(self, frame_indices): 11 | for i, t in enumerate(self.transforms): 12 | if isinstance(frame_indices[0], list): 13 | next_transforms = Compose(self.transforms[i:]) 14 | dst_frame_indices = [ 15 | next_transforms(clip_frame_indices) 16 | for clip_frame_indices in frame_indices 17 | ] 18 | 19 | return dst_frame_indices 20 | else: 21 | frame_indices = t(frame_indices) 22 | return frame_indices 23 | 24 | 25 | class LoopPadding(object): 26 | 27 | def __init__(self, size): 28 | self.size = size 29 | 30 | def __call__(self, frame_indices): 31 | out = frame_indices 32 | 33 | for index in out: 34 | if len(out) >= self.size: 35 | break 36 | out.append(index) 37 | 38 | return out 39 | 40 | 41 | class TemporalBeginCrop(object): 42 | 43 | def __init__(self, size): 44 | self.size = size 45 | 46 | def __call__(self, frame_indices): 47 | out = frame_indices[:self.size] 48 | 49 | for index in out: 50 | if len(out) >= self.size: 51 | break 52 | out.append(index) 53 | 54 | return out 55 | 56 | 57 | class TemporalCenterCrop(object): 58 | 59 | def __init__(self, size): 60 | self.size = size 61 | 62 | def __call__(self, frame_indices): 63 | 64 | center_index = len(frame_indices) // 2 65 | begin_index = max(0, center_index - (self.size // 2)) 66 | end_index = min(begin_index + self.size, len(frame_indices)) 67 | 68 | out = frame_indices[begin_index:end_index] 69 | 70 | for index in out: 71 | if len(out) >= self.size: 72 | break 73 | out.append(index) 74 | 75 | return out 76 | 77 | 78 | class TemporalRandomCrop(object): 79 | 80 | def __init__(self, size): 81 | self.size = size 82 | self.loop = LoopPadding(size) 83 | 84 | def __call__(self, frame_indices): 85 | 86 | rand_end = max(0, len(frame_indices) - self.size - 1) 87 | begin_index = random.randint(0, rand_end) 88 | end_index = min(begin_index + self.size, len(frame_indices)) 89 | 90 | out = frame_indices[begin_index:end_index] 91 | 92 | if len(out) < self.size: 93 | out = self.loop(out) 94 | 95 | return out 96 | 97 | 98 | class TemporalEvenCrop(object): 99 | 100 | def __init__(self, size, n_samples=1): 101 | self.size = size 102 | self.n_samples = n_samples 103 | self.loop = LoopPadding(size) 104 | 105 | def __call__(self, frame_indices): 106 | n_frames = len(frame_indices) 107 | stride = max( 108 | 1, math.ceil((n_frames - 1 - self.size) / (self.n_samples - 1))) 109 | 110 | out = [] 111 | for begin_index in frame_indices[::stride]: 112 | if len(out) >= self.n_samples: 113 | break 114 | end_index = min(frame_indices[-1] + 1, begin_index + self.size) 115 | sample = list(range(begin_index, end_index)) 116 | 117 | if len(sample) < self.size: 118 | out.append(self.loop(sample)) 119 | break 120 | else: 121 | out.append(sample) 122 | 123 | return out 124 | 125 | 126 | class SlidingWindow(object): 127 | 128 | def __init__(self, size, stride=0): 129 | self.size = size 130 | if stride == 0: 131 | self.stride = self.size 132 | else: 133 | self.stride = stride 134 | self.loop = LoopPadding(size) 135 | 136 | def __call__(self, frame_indices): 137 | out = [] 138 | for begin_index in frame_indices[::self.stride]: 139 | end_index = min(frame_indices[-1] + 1, begin_index + self.size) 140 | sample = list(range(begin_index, end_index)) 141 | 142 | if len(sample) < self.size: 143 | out.append(self.loop(sample)) 144 | break 145 | else: 146 | out.append(sample) 147 | 148 | return out 149 | 150 | 151 | class TemporalSubsampling(object): 152 | 153 | def __init__(self, stride): 154 | self.stride = stride 155 | 156 | def 
__call__(self, frame_indices): 157 | return frame_indices[::self.stride] 158 | 159 | 160 | class Shuffle(object): 161 | 162 | def __init__(self, block_size): 163 | self.block_size = block_size 164 | 165 | def __call__(self, frame_indices): 166 | frame_indices = [ 167 | frame_indices[i:(i + self.block_size)] 168 | for i in range(0, len(frame_indices), self.block_size) 169 | ] 170 | random.shuffle(frame_indices) 171 | frame_indices = [t for block in frame_indices for t in block] 172 | return frame_indices -------------------------------------------------------------------------------- /datasets/videodataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | import torch 5 | import torch.utils.data as data 6 | 7 | from .loader import VideoLoader 8 | import pdb 9 | 10 | def get_class_labels(data): 11 | class_labels_map = {} 12 | index = 0 13 | for class_label in data['labels']: 14 | class_labels_map[class_label] = index 15 | index += 1 16 | return class_labels_map 17 | 18 | 19 | def get_database(data, subset, root_path, video_path_formatter): 20 | video_ids = [] 21 | video_paths = [] 22 | annotations = [] 23 | for key, value in data['database'].items(): 24 | this_subset = value['subset'] 25 | if this_subset == subset: 26 | video_ids.append(key) 27 | annotations.append(value['annotations']) 28 | if 'video_path' in value: 29 | video_paths.append(Path(value['video_path'])) 30 | else: 31 | label = value['annotations']['label'] 32 | video_paths.append(video_path_formatter(root_path, label, key)) 33 | 34 | return video_ids, video_paths, annotations 35 | 36 | 37 | class VideoDataset(data.Dataset): 38 | 39 | def __init__(self, 40 | root_path, 41 | annotation_path, 42 | subset, 43 | spatial_transform=None, 44 | temporal_transform=None, 45 | target_transform=None, 46 | video_loader=None, 47 | video_path_formatter=(lambda root_path, label, video_id: 48 | root_path / label / video_id), 49 | image_name_formatter=lambda x: 'image_{:05d}.jpg'.format(x), 50 | target_type='label'): 51 | self.data, self.class_names = self.__make_dataset( 52 | root_path, annotation_path, subset, video_path_formatter) 53 | 54 | self.spatial_transform = spatial_transform 55 | self.temporal_transform = temporal_transform 56 | self.target_transform = target_transform 57 | if video_loader is None: 58 | self.loader = VideoLoader(image_name_formatter) 59 | else: 60 | self.loader = video_loader 61 | 62 | self.target_type = target_type 63 | 64 | def __make_dataset(self, root_path, annotation_path, subset, 65 | video_path_formatter): 66 | with annotation_path.open('r') as f: 67 | data = json.load(f) 68 | video_ids, video_paths, annotations = get_database( 69 | data, subset, root_path, video_path_formatter) 70 | class_to_idx = get_class_labels(data) 71 | idx_to_class = {} 72 | for name, label in class_to_idx.items(): 73 | idx_to_class[label] = name 74 | 75 | n_videos = len(video_ids) 76 | dataset = [] 77 | for i in range(n_videos): 78 | if i % (n_videos // 5) == 0: 79 | print('dataset loading [{}/{}]'.format(i, len(video_ids))) 80 | 81 | if 'label' in annotations[i]: 82 | label = annotations[i]['label'] 83 | label_id = class_to_idx[label] 84 | else: 85 | label = 'test' 86 | label_id = -1 87 | 88 | video_path = video_paths[i] 89 | if not video_path.exists(): 90 | print(video_path) 91 | continue 92 | 93 | segment = annotations[i]['segment'] 94 | if segment[1] == 1: 95 | continue 96 | 97 | frame_indices = list(range(segment[0], segment[1])) 98 | sample = { 
99 | 'video': video_path, 100 | 'segment': segment, 101 | 'frame_indices': frame_indices, 102 | 'video_id': video_ids[i], 103 | 'label': label_id 104 | } 105 | dataset.append(sample) 106 | 107 | return dataset, idx_to_class 108 | 109 | def __loading(self, path, frame_indices): 110 | clip = self.loader(path, frame_indices) 111 | if self.spatial_transform is not None: 112 | self.spatial_transform.randomize_parameters() 113 | clip = [self.spatial_transform(img) for img in clip] 114 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 115 | 116 | return clip 117 | 118 | def __getitem__(self, index): 119 | path = self.data[index]['video'] 120 | if isinstance(self.target_type, list): 121 | target = [self.data[index][t] for t in self.target_type] 122 | else: 123 | target = self.data[index][self.target_type] 124 | 125 | frame_indices = self.data[index]['frame_indices'] 126 | if self.temporal_transform is not None: 127 | frame_indices = self.temporal_transform(frame_indices) 128 | 129 | clip = self.__loading(path, frame_indices) 130 | 131 | if self.target_transform is not None: 132 | target = self.target_transform(target) 133 | 134 | return clip, target 135 | 136 | def __len__(self): 137 | return len(self.data) 138 | -------------------------------------------------------------------------------- /rpn.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | from collections import OrderedDict 3 | import copy 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torchvision.models.detection import fasterrcnn_resnet50_fpn 8 | from torchvision.models.detection.image_list import ImageList 9 | 10 | from torch.jit.annotations import Tuple, List, Dict, Optional 11 | from torch import Tensor 12 | import warnings 13 | from transform import STRGTransform 14 | 15 | class RPN(nn.Module): 16 | def __init__(self, pretrained=True, nrois=10): 17 | super(RPN,self).__init__() 18 | model = fasterrcnn_resnet50_fpn(pretrained=True).eval() 19 | self.transform = STRGTransform(model.transform.min_size, 20 | model.transform.max_size, 21 | 0,0) #copy.deepcopy(model.transform) 22 | self.backbone = copy.deepcopy(model.backbone) 23 | self.rpn = copy.deepcopy(model.rpn) 24 | # self.eaget_outputs = copy.deepcopy(model.eaget_outputs) 25 | self.roi_heads = copy.deepcopy(model.roi_heads) 26 | self.rpn._pre_nms_top_n = {'training':3*nrois, 'testing':3*nrois} 27 | self.rpn._post_nms_top_n = {'training':nrois, 'testing':nrois} 28 | self.rpn.fg_bg_sampler.positive_fraction = 1.0 29 | del model 30 | 31 | def forward(self, images, targets=None): 32 | # type: (List[Tensor], Optional[List[Dict[str, Tensor]]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]] 33 | """ 34 | Arguments: 35 | images (list[Tensor]): images to be processed 36 | targets (list[Dict[Tensor]]): ground-truth boxes present in the image (optional) 37 | Returns: 38 | result (list[BoxList] or dict[Tensor]): the output from the model. 39 | During training, it returns a dict[Tensor] which contains the losses. 40 | During testing, it returns list[BoxList] contains additional fields 41 | like `scores`, `labels` and `mask` (for Mask R-CNN models). 
42 | """ 43 | bs = len(images) 44 | if self.training and targets is None: 45 | raise ValueError("In training mode, targets should be passed") 46 | if self.training: 47 | assert targets is not None 48 | for target in targets: 49 | boxes = target["boxes"] 50 | if isinstance(boxes, torch.Tensor): 51 | if len(boxes.shape) != 2 or boxes.shape[-1] != 4: 52 | raise ValueError("Expected target boxes to be a tensor" 53 | "of shape [N, 4], got {:}.".format( 54 | boxes.shape)) 55 | else: 56 | raise ValueError("Expected target boxes to be of type " 57 | "Tensor, got {:}.".format(type(boxes))) 58 | original_image_sizes = torch.jit.annotate(List[Tuple[int, int]], []) 59 | for img in images: 60 | val = img.shape[-2:] 61 | assert len(val) == 2 62 | original_image_sizes.append((val[0], val[1])) 63 | 64 | images, targets = self.transform(images, targets) 65 | # Check for degenerate boxes 66 | # TODO: Move this to a function 67 | if targets is not None: 68 | for target_idx, target in enumerate(targets): 69 | boxes = target["boxes"] 70 | degenerate_boxes = boxes[:, 2:] <= boxes[:, :2] 71 | if degenerate_boxes.any(): 72 | # print the first degenrate box 73 | bb_idx = degenerate_boxes.any(dim=1).nonzero().view(-1)[0] 74 | degen_bb: List[float] = boxes[bb_idx].tolist() 75 | raise ValueError("All bounding boxes should have positive height and width." 76 | " Found invaid box {} for target at index {}." 77 | .format(degen_bb, target_idx)) 78 | 79 | features = self.backbone(images.tensors) 80 | if isinstance(features, torch.Tensor): 81 | features = OrderedDict([('0', features)]) 82 | proposals, proposal_losses = self.rpn(images, features, targets) 83 | proposals = self.transform.rpn_postprocess(proposals, images.image_sizes, original_image_sizes) 84 | if False: 85 | for i in range(len(proposals)): 86 | delta = self.rpn._post_nms_top_n['testing'] - len(proposals[i]) 87 | if delta != 0: 88 | print("RPN finds only {} among {}".format(len(proposals[i]), 89 | len(proposals[i])+delta)) 90 | dummy = -torch.ones((delta, 4)).to(proposals[i].device()) 91 | proposals[i] = torch.cat((proposals[i], dummy)) 92 | return torch.cat(proposals).view(bs, -1, 4) 93 | 94 | 95 | if __name__ == '__main__': 96 | rpn = RPN().eval() 97 | # rpn = nn.DataParallel(rpn, device_ids=None).cuda() 98 | inputs = torch.rand((5,3,224,224)) 99 | out = rpn(inputs) 100 | pdb.set_trace() 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /util_scripts/generate_video_hdf5.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import argparse 3 | from pathlib import Path 4 | 5 | from joblib import Parallel, delayed 6 | import h5py 7 | import numpy as np 8 | 9 | 10 | def video_process(video_file_path, dst_root_path, ext, fps=-1, size=240): 11 | if ext != video_file_path.suffix: 12 | return 13 | 14 | ffprobe_cmd = ('ffprobe -v error -select_streams v:0 ' 15 | '-of default=noprint_wrappers=1:nokey=1 -show_entries ' 16 | 'stream=width,height,avg_frame_rate,duration').split() 17 | ffprobe_cmd.append(str(video_file_path)) 18 | 19 | p = subprocess.run(ffprobe_cmd, capture_output=True) 20 | res = p.stdout.decode('utf-8').splitlines() 21 | if len(res) < 4: 22 | return 23 | 24 | name = video_file_path.stem 25 | dst_dir_path = dst_root_path / name 26 | dst_dir_path.mkdir(exist_ok=True) 27 | 28 | width = int(res[0]) 29 | height = int(res[1]) 30 | 31 | if width > height: 32 | vf_param = f'scale=-1:{size}' 33 | else: 34 | vf_param = f'scale={size}:-1' 35 | 36 | if 
fps > 0: 37 | vf_param += f',minterpolate={fps}' 38 | 39 | ffmpeg_cmd = ['ffmpeg', '-i', str(video_file_path), '-vf', vf_param] 40 | ffmpeg_cmd += ['-threads', '1', f'{dst_dir_path}/image_%05d.jpg'] 41 | print(ffmpeg_cmd) 42 | subprocess.run(ffmpeg_cmd) 43 | 44 | hdf5_path = dst_dir_path.parent / f'{dst_dir_path.name}.hdf5' 45 | try: 46 | with h5py.File(hdf5_path, 'w') as f: 47 | dtype = h5py.special_dtype(vlen='uint8') 48 | video = f.create_dataset('video', 49 | (len(list(dst_dir_path.glob('*.jpg'))),), 50 | dtype=dtype) 51 | except OSError as exc: 52 | if 'errno = 36' in exc.args[0]: 53 | hdf5_path = dst_dir_path.parent / f'{dst_dir_path.name[:250]}.hdf5' 54 | with h5py.File(hdf5_path, 'w') as f: 55 | dtype = h5py.special_dtype(vlen='uint8') 56 | video = f.create_dataset( 57 | 'video', (len(list(dst_dir_path.glob('*.jpg'))),), 58 | dtype=dtype) 59 | else: 60 | raise 61 | 62 | for i, file_path in enumerate(sorted(dst_dir_path.glob('*.jpg'))): 63 | with file_path.open('rb') as f: 64 | data = f.read() 65 | with h5py.File(hdf5_path, 'r+') as f: 66 | video = f['video'] 67 | video[i] = np.frombuffer(data, dtype='uint8') 68 | 69 | for file_path in dst_dir_path.glob('*.jpg'): 70 | file_path.unlink() 71 | dst_dir_path.rmdir() 72 | 73 | 74 | def class_process(class_dir_path, dst_root_path, ext, fps=-1, size=240): 75 | if not class_dir_path.is_dir(): 76 | return 77 | 78 | dst_class_path = dst_root_path / class_dir_path.name 79 | dst_class_path.mkdir(exist_ok=True) 80 | 81 | for video_file_path in sorted(class_dir_path.iterdir()): 82 | video_process(video_file_path, dst_class_path, ext, fps, size) 83 | 84 | 85 | if __name__ == '__main__': 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument('dir_path', 88 | default=None, 89 | type=Path, 90 | help='Directory path of videos') 91 | parser.add_argument('dst_path', 92 | default=None, 93 | type=Path, 94 | help='Directory path of jpg videos') 95 | parser.add_argument( 96 | 'dataset', 97 | default='', 98 | type=str, 99 | help='Dataset name (kinetics | mit | ucf101 | hmdb51 | activitynet)') 100 | parser.add_argument('--n_jobs', 101 | default=-1, 102 | type=int, 103 | help='Number of parallel jobs') 104 | parser.add_argument('--fps', 105 | default=-1, 106 | type=int, 107 | help=('Frame rates of output videos. 
' 108 | '-1 means original frame rates.')) 109 | parser.add_argument('--size', 110 | default=240, 111 | type=int, 112 | help='Frame size of output videos.') 113 | args = parser.parse_args() 114 | 115 | if args.dataset in ['kinetics', 'mit', 'activitynet']: 116 | ext = '.mp4' 117 | else: 118 | ext = '.avi' 119 | 120 | if args.dataset == 'activitynet': 121 | video_file_paths = [x for x in sorted(args.dir_path.iterdir())] 122 | status_list = Parallel(n_jobs=args.n_jobs, backend='threading')( 123 | delayed(video_process)(video_file_path, args.dst_path, ext, 124 | args.fps, args.size) 125 | for video_file_path in video_file_paths) 126 | else: 127 | class_dir_paths = [x for x in sorted(args.dir_path.iterdir())] 128 | test_set_video_path = args.dir_path / 'test' 129 | if test_set_video_path.exists(): 130 | class_dir_paths.append(test_set_video_path) 131 | 132 | status_list = Parallel(n_jobs=args.n_jobs, backend='threading')( 133 | delayed(class_process)(class_dir_path, args.dst_path, ext, args.fps, 134 | args.size) 135 | for class_dir_path in class_dir_paths) 136 | -------------------------------------------------------------------------------- /training.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import os 4 | import sys 5 | import pdb 6 | 7 | import torch 8 | import torch.distributed as dist 9 | import torch.nn as nn 10 | 11 | from utils import AverageMeter, calculate_accuracy 12 | 13 | 14 | def freeze_bn(model): 15 | print("Freezing Mean/Var of BatchNorm2D.") 16 | print("Freezing Weight/Bias of BatchNorm2D.") 17 | for m in model.modules(): 18 | if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d) or isinstance(m, nn.BatchNorm1d): 19 | m.eval() 20 | m.weight.requires_grad = False 21 | m.bias.requires_grad = False 22 | 23 | 24 | def train_epoch(epoch, 25 | data_loader, 26 | model, 27 | criterion, 28 | optimizer, 29 | device, 30 | current_lr, 31 | epoch_logger, 32 | batch_logger, 33 | tb_writer=None, 34 | distributed=False, 35 | rpn=None, 36 | det_interval=2, 37 | nrois=10): 38 | print('train at epoch {}'.format(epoch)) 39 | 40 | model.train() 41 | if rpn is not None: 42 | rpn.eval() 43 | else: 44 | freeze_bn(model) 45 | 46 | batch_time = AverageMeter() 47 | data_time = AverageMeter() 48 | losses = AverageMeter() 49 | accuracies = AverageMeter() 50 | 51 | end_time = time.time() 52 | for i, (inputs, targets) in enumerate(data_loader): 53 | data_time.update(time.time() - end_time) 54 | targets = targets.to(device, non_blocking=True) 55 | if rpn is not None: 56 | ''' 57 | There was an unexpected CUDNN_ERROR when len(rpn_inputs) is 58 | decrased. 
59 | ''' 60 | N, C, T, H, W = inputs.size() 61 | if i == 0: 62 | max_N = N 63 | # sample frames for RPN 64 | sample = torch.arange(0,T,det_interval) 65 | rpn_inputs = inputs[:,:,sample].transpose(1,2).contiguous() 66 | rpn_inputs = rpn_inputs.view(-1,C,H,W) 67 | if len(inputs) < max_N: 68 | print("Modified from {} to {}".format(len(inputs), max_N)) 69 | while len(rpn_inputs) < max_N * (T // det_interval): 70 | rpn_inputs = torch.cat((rpn_inputs, rpn_inputs[:(max_N-len(inputs))*(T//det_interval)])) 71 | with torch.no_grad(): 72 | proposals = rpn(rpn_inputs) 73 | proposals = proposals.view(-1,T//det_interval,nrois,4) 74 | if len(inputs) < max_N: 75 | proposals = proposals[:len(inputs)] 76 | outputs = model(inputs, proposals.detach()) 77 | # update to the largest batch_size 78 | max_N = max(N, max_N) 79 | else: 80 | outputs = model(inputs) 81 | loss = criterion(outputs, targets) 82 | acc = calculate_accuracy(outputs, targets) 83 | 84 | losses.update(loss.item(), inputs.size(0)) 85 | accuracies.update(acc, inputs.size(0)) 86 | 87 | optimizer.zero_grad() 88 | loss.backward() 89 | optimizer.step() 90 | 91 | batch_time.update(time.time() - end_time) 92 | end_time = time.time() 93 | 94 | if batch_logger is not None: 95 | batch_logger.log({ 96 | 'epoch': epoch, 97 | 'batch': i + 1, 98 | 'iter': (epoch - 1) * len(data_loader) + (i + 1), 99 | 'loss': losses.val, 100 | 'acc': accuracies.val, 101 | 'lr': current_lr 102 | }) 103 | if i % 20 == 0: 104 | print('Epoch: [{0}][{1}/{2}]\t' 105 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 106 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 107 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 108 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format(epoch, 109 | i + 1, 110 | len(data_loader), 111 | batch_time=batch_time, 112 | data_time=data_time, 113 | loss=losses, 114 | acc=accuracies)) 115 | 116 | if distributed: 117 | loss_sum = torch.tensor([losses.sum], 118 | dtype=torch.float32, 119 | device=device) 120 | loss_count = torch.tensor([losses.count], 121 | dtype=torch.float32, 122 | device=device) 123 | acc_sum = torch.tensor([accuracies.sum], 124 | dtype=torch.float32, 125 | device=device) 126 | acc_count = torch.tensor([accuracies.count], 127 | dtype=torch.float32, 128 | device=device) 129 | 130 | dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) 131 | dist.all_reduce(loss_count, op=dist.ReduceOp.SUM) 132 | dist.all_reduce(acc_sum, op=dist.ReduceOp.SUM) 133 | dist.all_reduce(acc_count, op=dist.ReduceOp.SUM) 134 | 135 | losses.avg = loss_sum.item() / loss_count.item() 136 | accuracies.avg = acc_sum.item() / acc_count.item() 137 | 138 | if epoch_logger is not None: 139 | epoch_logger.log({ 140 | 'epoch': epoch, 141 | 'loss': losses.avg, 142 | 'acc': accuracies.avg, 143 | 'lr': current_lr 144 | }) 145 | 146 | if tb_writer is not None: 147 | tb_writer.add_scalar('train/loss', losses.avg, epoch) 148 | tb_writer.add_scalar('train/acc', accuracies.avg, epoch) 149 | tb_writer.add_scalar('train/lr', current_lr, epoch) 150 | -------------------------------------------------------------------------------- /datasets/activitynet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import json 3 | 4 | import torch 5 | import torch.utils.data as data 6 | 7 | from .loader import VideoLoader 8 | from .videodataset import VideoDataset 9 | 10 | 11 | def get_n_frames(video_path): 12 | return len([ 13 | x for x in video_path.iterdir() 14 | if 'image' in x.name and x.name[0] != '.' 
15 | ]) 16 | 17 | 18 | def get_class_labels(data): 19 | class_names = [] 20 | for node1 in data['taxonomy']: 21 | is_leaf = True 22 | for node2 in data['taxonomy']: 23 | if node2['parentId'] == node1['nodeId']: 24 | is_leaf = False 25 | break 26 | if is_leaf: 27 | class_names.append(node1['nodeName']) 28 | 29 | class_labels_map = {} 30 | 31 | for i, class_name in enumerate(class_names): 32 | class_labels_map[class_name] = i 33 | 34 | return class_labels_map 35 | 36 | 37 | def get_video_ids_annotations_and_fps(data, subset): 38 | video_ids = [] 39 | annotations = [] 40 | fps_values = [] 41 | 42 | for key, value in data['database'].items(): 43 | this_subset = value['subset'] 44 | if this_subset == subset: 45 | video_ids.append(key) 46 | annotations.append(value['annotations']) 47 | fps_values.append(value['fps']) 48 | 49 | return video_ids, annotations, fps_values 50 | 51 | 52 | class ActivityNet(VideoDataset): 53 | 54 | def __init__( 55 | self, 56 | root_path, 57 | annotation_path, 58 | subset, 59 | spatial_transform=None, 60 | temporal_transform=None, 61 | target_transform=None, 62 | video_loader=None, 63 | video_path_formatter=( 64 | lambda root_path, label, video_id: root_path / 'v_{}'.format(video_id)), 65 | image_name_formatter=lambda x: 'image_{:05d}.jpg'.format(x), 66 | is_untrimmed_setting=False): 67 | if is_untrimmed_setting: 68 | self.data, self.class_names = self.__make_untrimmed_dataset( 69 | root_path, annotation_path, subset, video_path_formatter) 70 | else: 71 | self.data, self.class_names = self.__make_dataset( 72 | root_path, annotation_path, subset, video_path_formatter) 73 | 74 | self.spatial_transform = spatial_transform 75 | self.temporal_transform = temporal_transform 76 | self.target_transform = target_transform 77 | 78 | if video_loader is None: 79 | self.loader = VideoLoader(image_name_formatter) 80 | else: 81 | self.loader = video_loader 82 | 83 | def __make_dataset(self, root_path, annotation_path, subset, 84 | video_path_formatter): 85 | with annotation_path.open('r') as f: 86 | data = json.load(f) 87 | video_ids, annotations, fps_values = get_video_ids_annotations_and_fps( 88 | data, subset) 89 | class_to_idx = get_class_labels(data) 90 | idx_to_class = {} 91 | for name, label in class_to_idx.items(): 92 | idx_to_class[label] = name 93 | 94 | dataset = [] 95 | for i in range(len(video_ids)): 96 | if i % 1000 == 0: 97 | print('dataset loading [{}/{}]'.format(i, len(video_ids))) 98 | 99 | video_path = video_path_formatter(root_path, label, video_ids[i]) 100 | if not video_path.exists(): 101 | continue 102 | 103 | fps = fps_values[i] 104 | 105 | for annotation in annotations[i]: 106 | t_begin = math.floor(annotation['segment'][0] * fps) + 1 107 | t_end = math.floor(annotation['segment'][1] * fps) + 1 108 | n_video_frames = get_n_frames(video_path) 109 | t_end = min(t_end, n_video_frames) 110 | frame_indices = list(range(t_begin, t_end)) 111 | 112 | sample = { 113 | 'video': video_path, 114 | 'segment': (frame_indices[0], frame_indices[-1] + 1), 115 | 'frame_indices': frame_indices, 116 | 'fps': fps, 117 | 'video_id': video_ids[i] 118 | } 119 | if annotations is not None: 120 | sample['label'] = class_to_idx[annotation['label']] 121 | else: 122 | sample['label'] = -1 123 | 124 | if len(sample['frame_indices']) < 8: 125 | continue 126 | dataset.append(sample) 127 | 128 | return dataset, idx_to_class 129 | 130 | def __make_untrimmed_dataset(self, root_path, annotation_path, subset, 131 | video_path_formatter): 132 | with annotation_path.open('r') as f: 133 | data = 
json.load(f) 134 | video_ids, annotations, fps_values = get_video_ids_annotations_and_fps( 135 | data, subset) 136 | class_to_idx = get_class_labels(data) 137 | idx_to_class = {} 138 | for name, label in class_to_idx.items(): 139 | idx_to_class[label] = name 140 | 141 | dataset = [] 142 | for i in range(len(video_ids)): 143 | if i % 1000 == 0: 144 | print('dataset loading [{}/{}]'.format(i, len(video_ids))) 145 | 146 | video_path = video_path_formatter(root_path, label, video_ids[i]) 147 | if not video_path.exists(): 148 | continue 149 | 150 | fps = fps_values[i] 151 | 152 | t_begin = 1 153 | t_end = get_n_frames(video_path) + 1 154 | frame_indices = list(range(t_begin, t_end)) 155 | 156 | sample = { 157 | 'video': video_path, 158 | 'segment': (frame_indices[0], frame_indices[-1] + 1), 159 | 'frame_indices': frame_indices, 160 | 'fps': fps, 161 | 'video_id': video_ids[i] 162 | } 163 | dataset.append(sample) 164 | 165 | return dataset, idx_to_class 166 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from models import resnet, resnet2p1d, pre_act_resnet, wide_resnet, resnext, densenet, resnet_strg 5 | 6 | 7 | def get_module_name(name): 8 | name = name.split('.') 9 | if name[0] == 'module': 10 | i = 1 11 | else: 12 | i = 0 13 | if name[i] == 'features': 14 | i += 1 15 | 16 | return name[i] 17 | 18 | 19 | def get_fine_tuning_parameters(model, ft_begin_module): 20 | if not ft_begin_module: 21 | return model.parameters() 22 | 23 | parameters = [] 24 | add_flag = False 25 | for k, v in model.named_parameters(): 26 | if ft_begin_module == get_module_name(k): 27 | add_flag = True 28 | 29 | if add_flag: 30 | parameters.append({'params': v}) 31 | 32 | return parameters 33 | 34 | 35 | def generate_model(opt): 36 | assert opt.model in [ 37 | 'resnet', 'resnet2p1d', 'preresnet', 'wideresnet', 'resnext', 'densenet', 38 | 'resnet_strg' 39 | ] 40 | if opt.model == 'resnet': 41 | model = resnet.generate_model(model_depth=opt.model_depth, 42 | n_classes=opt.n_classes, 43 | n_input_channels=opt.n_input_channels, 44 | shortcut_type=opt.resnet_shortcut, 45 | conv1_t_size=opt.conv1_t_size, 46 | conv1_t_stride=opt.conv1_t_stride, 47 | no_max_pool=opt.no_max_pool, 48 | widen_factor=opt.resnet_widen_factor) 49 | elif opt.model == 'resnet_strg': 50 | model = resnet_strg.generate_model(model_depth=opt.model_depth, 51 | n_classes=opt.n_classes, 52 | n_input_channels=opt.n_input_channels, 53 | shortcut_type=opt.resnet_shortcut, 54 | conv1_t_size=opt.conv1_t_size, 55 | conv1_t_stride=opt.conv1_t_stride, 56 | no_max_pool=opt.no_max_pool, 57 | widen_factor=opt.resnet_widen_factor) 58 | elif opt.model == 'resnet2p1d': 59 | model = resnet2p1d.generate_model(model_depth=opt.model_depth, 60 | n_classes=opt.n_classes, 61 | n_input_channels=opt.n_input_channels, 62 | shortcut_type=opt.resnet_shortcut, 63 | conv1_t_size=opt.conv1_t_size, 64 | conv1_t_stride=opt.conv1_t_stride, 65 | no_max_pool=opt.no_max_pool, 66 | widen_factor=opt.resnet_widen_factor) 67 | elif opt.model == 'wideresnet': 68 | model = wide_resnet.generate_model( 69 | model_depth=opt.model_depth, 70 | k=opt.wide_resnet_k, 71 | n_classes=opt.n_classes, 72 | n_input_channels=opt.n_input_channels, 73 | shortcut_type=opt.resnet_shortcut, 74 | conv1_t_size=opt.conv1_t_size, 75 | conv1_t_stride=opt.conv1_t_stride, 76 | no_max_pool=opt.no_max_pool) 77 | elif opt.model == 'resnext': 78 | model = 
resnext.generate_model(model_depth=opt.model_depth, 79 | cardinality=opt.resnext_cardinality, 80 | n_classes=opt.n_classes, 81 | n_input_channels=opt.n_input_channels, 82 | shortcut_type=opt.resnet_shortcut, 83 | conv1_t_size=opt.conv1_t_size, 84 | conv1_t_stride=opt.conv1_t_stride, 85 | no_max_pool=opt.no_max_pool) 86 | elif opt.model == 'preresnet': 87 | model = pre_act_resnet.generate_model( 88 | model_depth=opt.model_depth, 89 | n_classes=opt.n_classes, 90 | n_input_channels=opt.n_input_channels, 91 | shortcut_type=opt.resnet_shortcut, 92 | conv1_t_size=opt.conv1_t_size, 93 | conv1_t_stride=opt.conv1_t_stride, 94 | no_max_pool=opt.no_max_pool) 95 | elif opt.model == 'densenet': 96 | model = densenet.generate_model(model_depth=opt.model_depth, 97 | n_classes=opt.n_classes, 98 | n_input_channels=opt.n_input_channels, 99 | conv1_t_size=opt.conv1_t_size, 100 | conv1_t_stride=opt.conv1_t_stride, 101 | no_max_pool=opt.no_max_pool) 102 | 103 | return model 104 | 105 | 106 | def load_pretrained_model(model, pretrain_path, model_name, n_finetune_classes, 107 | is_strg=False): 108 | if pretrain_path: 109 | print('loading pretrained model {}'.format(pretrain_path)) 110 | pretrain = torch.load(pretrain_path, map_location='cpu') 111 | 112 | model.load_state_dict(pretrain['state_dict']) 113 | if is_strg: 114 | return model 115 | 116 | tmp_model = model 117 | if model_name == 'densenet': 118 | tmp_model.classifier = nn.Linear(tmp_model.classifier.in_features, 119 | n_finetune_classes) 120 | else: 121 | tmp_model.fc = nn.Linear(tmp_model.fc.in_features, 122 | n_finetune_classes) 123 | 124 | return model 125 | 126 | 127 | def make_data_parallel(model, is_distributed, device): 128 | if is_distributed: 129 | if device.type == 'cuda' and device.index is not None: 130 | torch.cuda.set_device(device) 131 | model.to(device) 132 | 133 | model = nn.parallel.DistributedDataParallel(model, 134 | device_ids=[device]) 135 | else: 136 | model.to(device) 137 | model = nn.parallel.DistributedDataParallel(model) 138 | elif device.type == 'cuda': 139 | model = nn.DataParallel(model, device_ids=None).cuda() 140 | 141 | return model 142 | -------------------------------------------------------------------------------- /spatial_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from torchvision.transforms import transforms 4 | from torchvision.transforms import functional as F 5 | from PIL import Image 6 | 7 | 8 | class Compose(transforms.Compose): 9 | 10 | def randomize_parameters(self): 11 | for t in self.transforms: 12 | t.randomize_parameters() 13 | 14 | 15 | class ToTensor(transforms.ToTensor): 16 | 17 | def randomize_parameters(self): 18 | pass 19 | 20 | 21 | class Normalize(transforms.Normalize): 22 | 23 | def randomize_parameters(self): 24 | pass 25 | 26 | 27 | class ScaleValue(object): 28 | 29 | def __init__(self, s): 30 | self.s = s 31 | 32 | def __call__(self, tensor): 33 | tensor *= self.s 34 | return tensor 35 | 36 | def randomize_parameters(self): 37 | pass 38 | 39 | 40 | class Resize(transforms.Resize): 41 | 42 | def randomize_parameters(self): 43 | pass 44 | 45 | 46 | class Scale(transforms.Scale): 47 | 48 | def randomize_parameters(self): 49 | pass 50 | 51 | 52 | class CenterCrop(transforms.CenterCrop): 53 | 54 | def randomize_parameters(self): 55 | pass 56 | 57 | 58 | class CornerCrop(object): 59 | 60 | def __init__(self, 61 | size, 62 | crop_position=None, 63 | crop_positions=['c', 'tl', 'tr', 'bl', 'br']): 64 | self.size = size 65 | 
self.crop_position = crop_position 66 | self.crop_positions = crop_positions 67 | 68 | if crop_position is None: 69 | self.randomize = True 70 | else: 71 | self.randomize = False 72 | self.randomize_parameters() 73 | 74 | def __call__(self, img): 75 | image_width = img.size[0] 76 | image_height = img.size[1] 77 | 78 | h, w = (self.size, self.size) 79 | if self.crop_position == 'c': 80 | i = int(round((image_height - h) / 2.)) 81 | j = int(round((image_width - w) / 2.)) 82 | elif self.crop_position == 'tl': 83 | i = 0 84 | j = 0 85 | elif self.crop_position == 'tr': 86 | i = 0 87 | j = image_width - self.size 88 | elif self.crop_position == 'bl': 89 | i = image_height - self.size 90 | j = 0 91 | elif self.crop_position == 'br': 92 | i = image_height - self.size 93 | j = image_width - self.size 94 | 95 | img = F.crop(img, i, j, h, w) 96 | 97 | return img 98 | 99 | def randomize_parameters(self): 100 | if self.randomize: 101 | self.crop_position = self.crop_positions[random.randint( 102 | 0, 103 | len(self.crop_positions) - 1)] 104 | 105 | def __repr__(self): 106 | return self.__class__.__name__ + '(size={0}, crop_position={1}, randomize={2})'.format( 107 | self.size, self.crop_position, self.randomize) 108 | 109 | 110 | class RandomHorizontalFlip(transforms.RandomHorizontalFlip): 111 | 112 | def __init__(self, p=0.5): 113 | super().__init__(p) 114 | self.randomize_parameters() 115 | 116 | def __call__(self, img): 117 | """ 118 | Args: 119 | img (PIL.Image): Image to be flipped. 120 | Returns: 121 | PIL.Image: Randomly flipped image. 122 | """ 123 | if self.random_p < self.p: 124 | return F.hflip(img) 125 | return img 126 | 127 | def randomize_parameters(self): 128 | self.random_p = random.random() 129 | 130 | 131 | class MultiScaleCornerCrop(object): 132 | 133 | def __init__(self, 134 | size, 135 | scales, 136 | crop_positions=['c', 'tl', 'tr', 'bl', 'br'], 137 | interpolation=Image.BILINEAR): 138 | self.size = size 139 | self.scales = scales 140 | self.interpolation = interpolation 141 | self.crop_positions = crop_positions 142 | 143 | self.randomize_parameters() 144 | 145 | def __call__(self, img): 146 | short_side = min(img.size[0], img.size[1]) 147 | crop_size = int(short_side * self.scale) 148 | self.corner_crop.size = crop_size 149 | 150 | img = self.corner_crop(img) 151 | return img.resize((self.size, self.size), self.interpolation) 152 | 153 | def randomize_parameters(self): 154 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 155 | crop_position = self.crop_positions[random.randint( 156 | 0, 157 | len(self.crop_positions) - 1)] 158 | 159 | self.corner_crop = CornerCrop(None, crop_position) 160 | 161 | def __repr__(self): 162 | return self.__class__.__name__ + '(size={0}, scales={1}, interpolation={2})'.format( 163 | self.size, self.scales, self.interpolation) 164 | 165 | 166 | class RandomResizedCrop(transforms.RandomResizedCrop): 167 | 168 | def __init__(self, 169 | size, 170 | scale=(0.08, 1.0), 171 | ratio=(3. / 4., 4. 
/ 3.), 172 | interpolation=Image.BILINEAR): 173 | super().__init__(size, scale, ratio, interpolation) 174 | self.randomize_parameters() 175 | 176 | def __call__(self, img): 177 | if self.randomize: 178 | self.random_crop = self.get_params(img, self.scale, self.ratio) 179 | self.randomize = False 180 | 181 | i, j, h, w = self.random_crop 182 | return F.resized_crop(img, i, j, h, w, self.size, self.interpolation) 183 | 184 | def randomize_parameters(self): 185 | self.randomize = True 186 | 187 | 188 | class ColorJitter(transforms.ColorJitter): 189 | 190 | def __init__(self, brightness=0, contrast=0, saturation=0, hue=0): 191 | super().__init__(brightness, contrast, saturation, hue) 192 | self.randomize_parameters() 193 | 194 | def __call__(self, img): 195 | if self.randomize: 196 | self.transform = self.get_params(self.brightness, self.contrast, 197 | self.saturation, self.hue) 198 | self.randomize = False 199 | 200 | return self.transform(img) 201 | 202 | def randomize_parameters(self): 203 | self.randomize = True 204 | 205 | 206 | class PickFirstChannels(object): 207 | 208 | def __init__(self, n): 209 | self.n = n 210 | 211 | def __call__(self, tensor): 212 | return tensor[:self.n, :, :] 213 | 214 | def randomize_parameters(self): 215 | pass -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Videos as Space-Time Region Graph 2 | 3 | ## Summary 4 | 5 | * This repository is for testing the idea of the following paper: 6 | 7 | [ 8 | Wang, Xiaolong, and Abhinav Gupta. "Videos as space-time region graphs." Proceedings of the European conference on computer vision (ECCV). 2018. 9 | ](http://openaccess.thecvf.com/content_ECCV_2018/papers/Xiaolong_Wang_Videos_as_Space-Time_ECCV_2018_paper.pdf) 10 | 11 | * This means that it may contain several mismatches with the original implementation introduced in the paper. 12 | 13 | * Also, the performance is much lower than in the publication (24 vs 43), and I have never tested the Kinetics pre-trained ResNet-50-I3D. 14 | 15 | ## Notes 16 | 17 | * This repository is based on https://github.com/kenshohara/3D-ResNets-PyTorch. 18 | 19 | * The architecture of ResNet-50-I3D in the paper is different from that in the above repository. I did not use a Kinetics pre-trained model but an ImageNet pre-trained model. 20 | 21 | * Currently, the RPN is run on every iteration, which requires approximately 3 times more training time. 22 | 23 | * A Kinetics pre-trained model can be found [here](https://github.com/joaanna/something_else). 24 | 25 | 26 | ## Requirements 27 | 28 | * [PyTorch](http://pytorch.org/) (ver. 1.2+ required) 29 | * [Torchvision](http://pytorch.org/) (ver. 0.4+ required) 30 | 31 | ```bash 32 | conda install pytorch torchvision cudatoolkit=10.1 -c pytorch 33 | ``` 34 | 35 | ```bash 36 | pip install -r requirements.txt 37 | ``` 38 | 39 | 40 | * FFmpeg, FFprobe 41 | 42 | * Python 3 43 | 44 | ## Preparation 45 | 46 | ### Kinetics 47 | 48 | * Download videos using [the official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). 49 | * Locate test set in ```video_directory/test```.
50 | * Convert from avi to jpg files using ```util_scripts/generate_video_jpgs.py``` 51 | 52 | ```bash 53 | python -m util_scripts.generate_video_jpgs mp4_video_dir_path jpg_video_dir_path kinetics 54 | ``` 55 | 56 | * Generate an annotation file in JSON format, similar to ActivityNet, using ```util_scripts/kinetics_json.py``` 57 | * The CSV files (kinetics_{train, val, test}.csv) are included in the crawler. 58 | 59 | ```bash 60 | python -m util_scripts.kinetics_json csv_dir_path 700 jpg_video_dir_path jpg dst_json_path 61 | ``` 62 | 63 | 64 | ### Something-Something v1/v2 65 | 66 | * Download videos from the official [website](https://20bn.com/datasets/something-something/v2#download). 67 | * For Something-Something v2, please run `util_scripts/vid2img_sthv2.py` 68 | 69 | ```bash 70 | python util_scripts/sthv1_json.py 'data/something/v1' 'data/something/v1/img' 'data/sthv1.json' 71 | ``` 72 | 73 | ```bash 74 | python util_scripts/sthv2_json.py 'data/something/v2' 'data/something/v2/img' 'data/sthv2.json' 75 | ``` 76 | 77 | 78 | 79 | ## Running the code 80 | 81 | ### Data Path 82 | 83 | Assume that the data directories are structured as follows: 84 | 85 | ```misc 86 | ~/ 87 | data/ 88 | something/ 89 | v1/ 90 | img/ 91 | .../ (directories of video names) 92 | ... (jpg files) 93 | v2/ 94 | img/ 95 | .../ (directories of video names) 96 | ... (jpg files) 97 | kinetics_videos/ 98 | jpg/ 99 | .../ (directories of class names) 100 | .../ (directories of video names) 101 | ... (jpg files) 102 | results/ 103 | save_100.pth 104 | kinetics.json 105 | ``` 106 | 107 | Confirm all options. 108 | 109 | ```bash 110 | python main.py -h 111 | ``` 112 | 113 | ### Kinetics Pre-training 114 | 115 | Train ResNet-50 on the Kinetics-700 dataset (700 classes) with 4 CPU threads (for data loading). 116 | Batch size is 128. 117 | Save models every 5 epochs. 118 | All GPUs are used for training. 119 | If you want to use only some of the GPUs, use ```CUDA_VISIBLE_DEVICES=...```. 120 | 121 | ```bash 122 | python main.py --root_path ~/data --video_path kinetics_videos/jpg --annotation_path kinetics.json \ 123 | --result_path results --dataset kinetics --model resnet \ 124 | --model_depth 50 --n_classes 700 --batch_size 128 --n_threads 4 --checkpoint 5 125 | ``` 126 | 127 | 128 | Calculate top-5 class probabilities of each video using a trained model (~/data/results/save_200.pth). 129 | Note that ```inference_batch_size``` should be small because the actual batch size is ```inference_batch_size * (n_video_frames / inference_stride)```. 130 | 131 | ```bash 132 | python main.py --root_path ~/data --video_path kinetics_videos/jpg --annotation_path kinetics.json \ 133 | --result_path results --dataset kinetics --resume_path results/save_200.pth \ 134 | --model_depth 50 --n_classes 700 --n_threads 4 --no_train --no_val --inference --output_topk 5 --inference_batch_size 1 135 | ``` 136 | 137 | Evaluate the top-1 video accuracy of a recognition result (data/results/val.json). 138 | 139 | ```bash 140 | python -m util_scripts.eval_accuracy data/sthv2.json data/results/val.json --subset val -k 1 --ignore 141 | ``` 142 | 143 | ### Something-Something-v1 144 | 145 | First, we need to train the backbone network (ResNet-50-I3D) for 100 epochs with a learning rate of 0.00125 (decayed to 0.000125 at epoch 90). 146 | The original batch size is 8, but in this implementation we use 32 to reduce the training time.
147 | 148 | ```bash 149 | python main.py --root_path data --video_path data/something/v1/img --annotation_path sthv1.json \ 150 | --result_path resnet_strg_imgnet_bs32 --dataset somethingv1 --n_classes 174 --n_pretrain_classes 700 \ 151 | --ft_begin_module fc --tensorboard --wandb --conv1_t_size 5 --learning_rate 0.00125 --sample_duration 32 \ 152 | --n_epochs 100 --multistep_milestones 90 --model resnet_strg --model_depth 50 --batch_size 32 \ 153 | --n_threads 8 --checkpoint 1 154 | ``` 155 | 156 | Then, we train with the GCN module for 30 epochs with a learning rate of 0.000125. 157 | 158 | ```bash 159 | python main.py --root_path data --video_path data/something/v1/img --annotation_path sthv1.json \ 160 | --result_path resnet_strg_imgnet_32_gcn --dataset somethingv1 --n_classes 174 --n_pretrain_classes 174 \ 161 | --ft_begin_module fc --tensorboard --wandb --conv1_t_size 5 --learning_rate 0.000125 \ 162 | --sample_duration 32 --n_epochs 30 --model resnet_strg --model_depth 50 --batch_size 32 \ 163 | --nrois 10 --det_interval 2 --strg \ 164 | --n_threads 8 --checkpoint 1 --pretrain_path resnet_strg_imgnet_bs32/save_100.pth 165 | ``` 166 | 167 | ## Results on Something-Something-v1 168 | 169 | ### The published results 170 | 171 | | Model name | ResNet-50-I3D | ResNet-50-I3D + STRG | 172 | | ------------------ |---------------- | -------------- | 173 | | Top-1 Accuracy | 41.6% | 43.3% | 174 | 175 | 176 | ### This repo's results (without a Kinetics pre-trained model) 177 | 178 | | Model name | ResNet-50-I3D | ResNet-50-I3D + STRG | 179 | | ------------------ |---------------- | -------------- | 180 | | Top-1 Accuracy | 23.2% | 24.5% | 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /rgcn_models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | from torch.nn.parameter import Parameter 5 | import math 6 | import pdb 7 | import time 8 | 9 | from module.gcn import GCN, GraphConvolution 10 | from module.roi_graph import get_st_graph 11 | 12 | 13 | class RGCN(torch.nn.Module): 14 | def __init__(self, in_channel=512, out_channel=512, test_mode=False, 15 | dropout=0.5, 16 | separate_fb=True): 17 | super(RGCN, self).__init__() 18 | 19 | # 1 by 1 conv -> 512 wang: 2048 -> 512 20 | self.out_channel = out_channel 21 | in_channel = in_channel # 512 22 | dropout = dropout 23 | self.separate_fb = separate_fb 24 | 25 | 26 | # wang2018video differentiates forward graph and backward graph, 27 | # but in this implementation we ignore this.
28 | 29 | self.sim_embed1 = nn.Linear(in_channel, in_channel, bias=False) 30 | self.sim_embed2 = nn.Linear(in_channel, in_channel, bias=False) 31 | 32 | self.st_gc1 = GraphConvolution(in_channel, in_channel, bias=False, batch=True) 33 | self.st_gc2 = GraphConvolution(in_channel, in_channel, bias=False, batch=True) 34 | self.st_gc3 = GraphConvolution(in_channel, self.out_channel, bias=False, batch=True) 35 | if self.separate_fb: 36 | self.st_gc1_back = GraphConvolution(in_channel, in_channel, bias=False, batch=True) 37 | self.st_gc2_back = GraphConvolution(in_channel, in_channel, bias=False, batch=True) 38 | self.st_gc3_back = GraphConvolution(in_channel, self.out_channel, bias=False, batch=True) 39 | 40 | self.sim_gc1 = GraphConvolution(in_channel, in_channel, bias=False, batch=True) 41 | self.sim_gc2 = GraphConvolution(in_channel, in_channel, bias=False, batch=True) 42 | self.sim_gc3 = GraphConvolution(in_channel, self.out_channel, bias=False, batch=True) 43 | 44 | self.dropout = nn.Dropout(dropout) 45 | self.init_weight() 46 | 47 | 48 | def st_GCN(self, input, front_graph, back_graph=None): 49 | input = input.squeeze(2) 50 | out = F.relu(self.st_gc1(input,front_graph)) 51 | if self.separate_fb: 52 | out += F.relu(self.st_gc1_back(input,back_graph)) 53 | # out = self.dropout(out) 54 | 55 | out2 = F.relu(self.st_gc2(out,front_graph)) 56 | if self.separate_fb: 57 | out2 += F.relu(self.st_gc2_back(out,back_graph)) 58 | out = out2 59 | # out = self.dropout(out2) 60 | 61 | out2 = F.relu(self.st_gc3(out,front_graph)) 62 | if self.separate_fb: 63 | out2 += F.relu(self.st_gc3_back(out,back_graph)) 64 | return out2 65 | 66 | 67 | def sim_GCN(self, input, adj): 68 | out = F.relu(self.sim_gc1(input,adj)) 69 | # out = self.dropout(out) 70 | out = F.relu(self.sim_gc2(out,adj)) 71 | # out = self.dropout(out) 72 | out = F.relu(self.sim_gc3(out,adj)) 73 | return out 74 | 75 | 76 | 77 | def init_weight(self): 78 | # nn.init.constant_(self.sim_gc1.bias.data, 0) 79 | # nn.init.constant_(self.sim_gc2.bias.data, 0) 80 | # nn.init.constant_(self.sim_gc3.bias.data, 0) 81 | # 82 | # nn.init.constant_(self.st_gc1.bias.data, 0) 83 | # nn.init.constant_(self.st_gc2.bias.data, 0) 84 | # nn.init.constant_(self.st_gc3.bias.data, 0) 85 | 86 | nn.init.normal_(self.sim_gc1.weight.data, 0, 0.001) 87 | nn.init.normal_(self.sim_gc2.weight.data, 0, 0.001) 88 | nn.init.normal_(self.st_gc1.weight.data, 0, 0.001) 89 | nn.init.normal_(self.st_gc2.weight.data, 0, 0.001) 90 | 91 | nn.init.normal_(self.st_gc3.weight.data, 0, 0.001) 92 | nn.init.normal_(self.sim_gc3.weight.data, 0, 0.001) 93 | # nn.init.constant_(self.sim_gc3.weight.data, 0) 94 | # nn.init.constant_(self.st_gc3.weight.data, 0) 95 | 96 | if self.separate_fb: 97 | nn.init.normal_(self.st_gc1_back.weight.data, 0, 0.001) 98 | nn.init.normal_(self.st_gc2_back.weight.data, 0, 0.001) 99 | nn.init.constant_(self.st_gc3_back.weight.data, 0) 100 | 101 | 102 | 103 | 104 | def generate_st_graphs(self, rois, connection, return_dict, st=0): 105 | for i, (r, c) in enumerate(zip(rois, connection)): 106 | return_dict[i+st] = get_st_graph(r,c) 107 | 108 | 109 | 110 | def forward(self, rois_features, rois): 111 | front_graph, back_graph = get_st_graph(rois) 112 | 113 | front_graph = front_graph.to(rois.device).detach() 114 | back_graph = back_graph.to(rois.device).detach() 115 | 116 | B, T, N, C = rois_features.size() 117 | N_rois = T*N 118 | rois_features = rois_features.view(B, N_rois, -1) 119 | sim_graph = self.sim_graph(rois_features).detach() 120 | sim_gcn = 
self.sim_GCN(rois_features, sim_graph) 121 | st_gcn = self.st_GCN(rois_features, front_graph, back_graph) 122 | gcn_out = sim_gcn + st_gcn 123 | gcn_out = gcn_out.mean(1) 124 | gcn_out = self.dropout(gcn_out) 125 | return gcn_out 126 | 127 | 128 | 129 | def sim_graph(self, features): 130 | sim1 = self.sim_embed1(features) 131 | sim2 = self.sim_embed2(features) 132 | sim_features = torch.matmul(sim1, sim2.transpose(1,2)) # d x d mat. 133 | sim_graph = F.softmax(sim_features, dim=-1) 134 | return sim_graph 135 | 136 | 137 | def get_optim_policies(self): 138 | 139 | normal_weight = [] 140 | normal_bias = [] 141 | 142 | for m in self.modules(): 143 | if isinstance(m, torch.nn.Linear): 144 | ps = list(m.parameters()) 145 | normal_weight.append(ps[0]) 146 | if len(ps) == 2: 147 | normal_bias.append(ps[1]) 148 | elif isinstance(m, GraphConvolution): 149 | ps = list(m.parameters()) 150 | normal_weight.append(ps[0]) 151 | if len(ps) == 2: 152 | normal_bias.append(ps[1]) 153 | elif 'Conv' in str(type(m)): 154 | ps = list(m.parameters()) 155 | normal_weight.append(ps[0]) 156 | if len(ps) == 2: 157 | normal_bias.append(ps[1]) 158 | elif len(m._modules) == 0: 159 | if len(list(m.parameters())) > 0: 160 | raise ValueError("New atomic module type: {}. Need to give it a learning policy".format(type(m))) 161 | 162 | return [ 163 | {'params': normal_weight, 'lr_mult': 1, 'decay_mult': 1, 164 | 'name': "normal_weight"}, 165 | {'params': normal_bias, 'lr_mult': 2, 'decay_mult': 0, 166 | 'name': "normal_bias"}, 167 | ] 168 | 169 | 170 | if __name__ == '__main__': 171 | rois = torch.rand((4,8,10,4)) 172 | rois_features = torch.rand((4,8,10,512)) 173 | rgcn = RGCN() 174 | out = rgcn(rois_features, rois) 175 | 176 | pdb.set_trace() 177 | 178 | 179 | 180 | -------------------------------------------------------------------------------- /models/densenet.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from collections import OrderedDict 7 | 8 | 9 | class _DenseLayer(nn.Sequential): 10 | 11 | def __init__(self, num_input_features, growth_rate, bn_size, drop_rate): 12 | super().__init__() 13 | self.add_module('norm1', nn.BatchNorm3d(num_input_features)) 14 | self.add_module('relu1', nn.ReLU(inplace=True)) 15 | self.add_module( 16 | 'conv1', 17 | nn.Conv3d(num_input_features, 18 | bn_size * growth_rate, 19 | kernel_size=1, 20 | stride=1, 21 | bias=False)) 22 | self.add_module('norm2', nn.BatchNorm3d(bn_size * growth_rate)) 23 | self.add_module('relu2', nn.ReLU(inplace=True)) 24 | self.add_module( 25 | 'conv2', 26 | nn.Conv3d(bn_size * growth_rate, 27 | growth_rate, 28 | kernel_size=3, 29 | stride=1, 30 | padding=1, 31 | bias=False)) 32 | self.drop_rate = drop_rate 33 | 34 | def forward(self, x): 35 | new_features = super().forward(x) 36 | if self.drop_rate > 0: 37 | new_features = F.dropout(new_features, 38 | p=self.drop_rate, 39 | training=self.training) 40 | return torch.cat([x, new_features], 1) 41 | 42 | 43 | class _DenseBlock(nn.Sequential): 44 | 45 | def __init__(self, num_layers, num_input_features, bn_size, growth_rate, 46 | drop_rate): 47 | super().__init__() 48 | for i in range(num_layers): 49 | layer = _DenseLayer(num_input_features + i * growth_rate, 50 | growth_rate, bn_size, drop_rate) 51 | self.add_module('denselayer{}'.format(i + 1), layer) 52 | 53 | 54 | class _Transition(nn.Sequential): 55 | 56 | def __init__(self, num_input_features, num_output_features): 57 | 
super().__init__() 58 | self.add_module('norm', nn.BatchNorm3d(num_input_features)) 59 | self.add_module('relu', nn.ReLU(inplace=True)) 60 | self.add_module( 61 | 'conv', 62 | nn.Conv3d(num_input_features, 63 | num_output_features, 64 | kernel_size=1, 65 | stride=1, 66 | bias=False)) 67 | self.add_module('pool', nn.AvgPool3d(kernel_size=2, stride=2)) 68 | 69 | 70 | class DenseNet(nn.Module): 71 | """Densenet-BC model class 72 | Args: 73 | growth_rate (int) - how many filters to add each layer (k in paper) 74 | block_config (list of 4 ints) - how many layers in each pooling block 75 | num_init_features (int) - the number of filters to learn in the first convolution layer 76 | bn_size (int) - multiplicative factor for number of bottle neck layers 77 | (i.e. bn_size * k features in the bottleneck layer) 78 | drop_rate (float) - dropout rate after each dense layer 79 | num_classes (int) - number of classification classes 80 | """ 81 | 82 | def __init__(self, 83 | n_input_channels=3, 84 | conv1_t_size=7, 85 | conv1_t_stride=1, 86 | no_max_pool=False, 87 | growth_rate=32, 88 | block_config=(6, 12, 24, 16), 89 | num_init_features=64, 90 | bn_size=4, 91 | drop_rate=0, 92 | num_classes=1000): 93 | 94 | super().__init__() 95 | 96 | # First convolution 97 | self.features = [('conv1', 98 | nn.Conv3d(n_input_channels, 99 | num_init_features, 100 | kernel_size=(conv1_t_size, 7, 7), 101 | stride=(conv1_t_stride, 2, 2), 102 | padding=(conv1_t_size // 2, 3, 3), 103 | bias=False)), 104 | ('norm1', nn.BatchNorm3d(num_init_features)), 105 | ('relu1', nn.ReLU(inplace=True))] 106 | if not no_max_pool: 107 | self.features.append( 108 | ('pool1', nn.MaxPool3d(kernel_size=3, stride=2, padding=1))) 109 | self.features = nn.Sequential(OrderedDict(self.features)) 110 | 111 | # Each denseblock 112 | num_features = num_init_features 113 | for i, num_layers in enumerate(block_config): 114 | block = _DenseBlock(num_layers=num_layers, 115 | num_input_features=num_features, 116 | bn_size=bn_size, 117 | growth_rate=growth_rate, 118 | drop_rate=drop_rate) 119 | self.features.add_module('denseblock{}'.format(i + 1), block) 120 | num_features = num_features + num_layers * growth_rate 121 | if i != len(block_config) - 1: 122 | trans = _Transition(num_input_features=num_features, 123 | num_output_features=num_features // 2) 124 | self.features.add_module('transition{}'.format(i + 1), trans) 125 | num_features = num_features // 2 126 | 127 | # Final batch norm 128 | self.features.add_module('norm5', nn.BatchNorm3d(num_features)) 129 | 130 | for m in self.modules(): 131 | if isinstance(m, nn.Conv3d): 132 | m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out') 133 | elif isinstance(m, nn.BatchNorm3d) or isinstance(m, nn.BatchNorm2d): 134 | m.weight.data.fill_(1) 135 | m.bias.data.zero_() 136 | 137 | # Linear layer 138 | self.classifier = nn.Linear(num_features, num_classes) 139 | 140 | for m in self.modules(): 141 | if isinstance(m, nn.Conv3d): 142 | nn.init.kaiming_normal_(m.weight, 143 | mode='fan_out', 144 | nonlinearity='relu') 145 | elif isinstance(m, nn.BatchNorm3d): 146 | nn.init.constant_(m.weight, 1) 147 | nn.init.constant_(m.bias, 0) 148 | elif isinstance(m, nn.Linear): 149 | nn.init.constant_(m.bias, 0) 150 | 151 | def forward(self, x): 152 | features = self.features(x) 153 | out = F.relu(features, inplace=True) 154 | out = F.adaptive_avg_pool3d(out, 155 | output_size=(1, 1, 156 | 1)).view(features.size(0), -1) 157 | out = self.classifier(out) 158 | return out 159 | 160 | 161 | def generate_model(model_depth, 
**kwargs): 162 | assert model_depth in [121, 169, 201, 264] 163 | 164 | if model_depth == 121: 165 | model = DenseNet(num_init_features=64, 166 | growth_rate=32, 167 | block_config=(6, 12, 24, 16), 168 | **kwargs) 169 | elif model_depth == 169: 170 | model = DenseNet(num_init_features=64, 171 | growth_rate=32, 172 | block_config=(6, 12, 32, 32), 173 | **kwargs) 174 | elif model_depth == 201: 175 | model = DenseNet(num_init_features=64, 176 | growth_rate=32, 177 | block_config=(6, 12, 48, 32), 178 | **kwargs) 179 | elif model_depth == 264: 180 | model = DenseNet(num_init_features=64, 181 | growth_rate=32, 182 | block_config=(6, 12, 64, 48), 183 | **kwargs) 184 | 185 | return model -------------------------------------------------------------------------------- /models/resnet.py: -------------------------------------------------------------------------------- 1 | import math 2 | from functools import partial 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | def get_inplanes(): 9 | return [64, 128, 256, 512] 10 | 11 | 12 | def conv3x3x3(in_planes, out_planes, stride=1): 13 | return nn.Conv3d(in_planes, 14 | out_planes, 15 | kernel_size=3, 16 | stride=stride, 17 | padding=1, 18 | bias=False) 19 | 20 | 21 | def conv1x1x1(in_planes, out_planes, stride=1): 22 | return nn.Conv3d(in_planes, 23 | out_planes, 24 | kernel_size=1, 25 | stride=stride, 26 | bias=False) 27 | 28 | 29 | class BasicBlock(nn.Module): 30 | expansion = 1 31 | 32 | def __init__(self, in_planes, planes, stride=1, downsample=None): 33 | super().__init__() 34 | 35 | self.conv1 = conv3x3x3(in_planes, planes, stride) 36 | self.bn1 = nn.BatchNorm3d(planes) 37 | self.relu = nn.ReLU(inplace=True) 38 | self.conv2 = conv3x3x3(planes, planes) 39 | self.bn2 = nn.BatchNorm3d(planes) 40 | self.downsample = downsample 41 | self.stride = stride 42 | 43 | def forward(self, x): 44 | residual = x 45 | 46 | out = self.conv1(x) 47 | out = self.bn1(out) 48 | out = self.relu(out) 49 | 50 | out = self.conv2(out) 51 | out = self.bn2(out) 52 | 53 | if self.downsample is not None: 54 | residual = self.downsample(x) 55 | 56 | out += residual 57 | out = self.relu(out) 58 | 59 | return out 60 | 61 | 62 | class Bottleneck(nn.Module): 63 | expansion = 4 64 | 65 | def __init__(self, in_planes, planes, stride=1, downsample=None): 66 | super().__init__() 67 | 68 | self.conv1 = conv1x1x1(in_planes, planes) 69 | self.bn1 = nn.BatchNorm3d(planes) 70 | self.conv2 = conv3x3x3(planes, planes, stride) 71 | self.bn2 = nn.BatchNorm3d(planes) 72 | self.conv3 = conv1x1x1(planes, planes * self.expansion) 73 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 74 | self.relu = nn.ReLU(inplace=True) 75 | self.downsample = downsample 76 | self.stride = stride 77 | 78 | def forward(self, x): 79 | residual = x 80 | 81 | out = self.conv1(x) 82 | out = self.bn1(out) 83 | out = self.relu(out) 84 | 85 | out = self.conv2(out) 86 | out = self.bn2(out) 87 | out = self.relu(out) 88 | 89 | out = self.conv3(out) 90 | out = self.bn3(out) 91 | 92 | if self.downsample is not None: 93 | residual = self.downsample(x) 94 | 95 | out += residual 96 | out = self.relu(out) 97 | 98 | return out 99 | 100 | 101 | class ResNet(nn.Module): 102 | 103 | def __init__(self, 104 | block, 105 | layers, 106 | block_inplanes, 107 | n_input_channels=3, 108 | conv1_t_size=7, 109 | conv1_t_stride=1, 110 | no_max_pool=False, 111 | shortcut_type='B', 112 | widen_factor=1.0, 113 | n_classes=400): 114 | super().__init__() 115 | 116 | block_inplanes = [int(x * widen_factor) for x 
in block_inplanes] 117 | 118 | self.in_planes = block_inplanes[0] 119 | self.no_max_pool = no_max_pool 120 | 121 | self.conv1 = nn.Conv3d(n_input_channels, 122 | self.in_planes, 123 | kernel_size=(conv1_t_size, 7, 7), 124 | stride=(conv1_t_stride, 2, 2), 125 | padding=(conv1_t_size // 2, 3, 3), 126 | bias=False) 127 | self.bn1 = nn.BatchNorm3d(self.in_planes) 128 | self.relu = nn.ReLU(inplace=True) 129 | self.maxpool = nn.MaxPool3d(kernel_size=3, stride=2, padding=1) 130 | self.layer1 = self._make_layer(block, block_inplanes[0], layers[0], 131 | shortcut_type) 132 | self.layer2 = self._make_layer(block, 133 | block_inplanes[1], 134 | layers[1], 135 | shortcut_type, 136 | stride=2) 137 | self.layer3 = self._make_layer(block, 138 | block_inplanes[2], 139 | layers[2], 140 | shortcut_type, 141 | stride=2) 142 | self.layer4 = self._make_layer(block, 143 | block_inplanes[3], 144 | layers[3], 145 | shortcut_type, 146 | stride=2) 147 | 148 | self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1)) 149 | self.fc = nn.Linear(block_inplanes[3] * block.expansion, n_classes) 150 | 151 | for m in self.modules(): 152 | if isinstance(m, nn.Conv3d): 153 | nn.init.kaiming_normal_(m.weight, 154 | mode='fan_out', 155 | nonlinearity='relu') 156 | elif isinstance(m, nn.BatchNorm3d): 157 | nn.init.constant_(m.weight, 1) 158 | nn.init.constant_(m.bias, 0) 159 | 160 | def _downsample_basic_block(self, x, planes, stride): 161 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 162 | zero_pads = torch.zeros(out.size(0), planes - out.size(1), out.size(2), 163 | out.size(3), out.size(4)) 164 | if isinstance(out.data, torch.cuda.FloatTensor): 165 | zero_pads = zero_pads.cuda() 166 | 167 | out = torch.cat([out.data, zero_pads], dim=1) 168 | 169 | return out 170 | 171 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 172 | downsample = None 173 | if stride != 1 or self.in_planes != planes * block.expansion: 174 | if shortcut_type == 'A': 175 | downsample = partial(self._downsample_basic_block, 176 | planes=planes * block.expansion, 177 | stride=stride) 178 | else: 179 | downsample = nn.Sequential( 180 | conv1x1x1(self.in_planes, planes * block.expansion, stride), 181 | nn.BatchNorm3d(planes * block.expansion)) 182 | 183 | layers = [] 184 | layers.append( 185 | block(in_planes=self.in_planes, 186 | planes=planes, 187 | stride=stride, 188 | downsample=downsample)) 189 | self.in_planes = planes * block.expansion 190 | for i in range(1, blocks): 191 | layers.append(block(self.in_planes, planes)) 192 | 193 | return nn.Sequential(*layers) 194 | 195 | 196 | 197 | def extract_feature(self, x): 198 | x = self.conv1(x) 199 | x = self.bn1(x) 200 | x = self.relu(x) 201 | if not self.no_max_pool: 202 | x = self.maxpool(x) 203 | 204 | x = self.layer1(x) 205 | x = self.layer2(x) 206 | x = self.layer3(x) 207 | x = self.layer4(x) 208 | return x 209 | 210 | 211 | 212 | def forward(self, x): 213 | x = self.conv1(x) 214 | x = self.bn1(x) 215 | x = self.relu(x) 216 | if not self.no_max_pool: 217 | x = self.maxpool(x) 218 | 219 | x = self.layer1(x) 220 | x = self.layer2(x) 221 | x = self.layer3(x) 222 | x = self.layer4(x) 223 | 224 | x = self.avgpool(x) 225 | 226 | x = x.view(x.size(0), -1) 227 | x = self.fc(x) 228 | 229 | return x 230 | 231 | 232 | def generate_model(model_depth, **kwargs): 233 | assert model_depth in [10, 18, 34, 50, 101, 152, 200] 234 | 235 | if model_depth == 10: 236 | model = ResNet(BasicBlock, [1, 1, 1, 1], get_inplanes(), **kwargs) 237 | elif model_depth == 18: 238 | model = ResNet(BasicBlock, [2, 2, 2, 
2], get_inplanes(), **kwargs) 239 | elif model_depth == 34: 240 | model = ResNet(BasicBlock, [3, 4, 6, 3], get_inplanes(), **kwargs) 241 | elif model_depth == 50: 242 | model = ResNet(Bottleneck, [3, 4, 6, 3], get_inplanes(), **kwargs) 243 | elif model_depth == 101: 244 | model = ResNet(Bottleneck, [3, 4, 23, 3], get_inplanes(), **kwargs) 245 | elif model_depth == 152: 246 | model = ResNet(Bottleneck, [3, 8, 36, 3], get_inplanes(), **kwargs) 247 | elif model_depth == 200: 248 | model = ResNet(Bottleneck, [3, 24, 36, 3], get_inplanes(), **kwargs) 249 | 250 | return model 251 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | from torchvision import get_image_backend 2 | 3 | from datasets.videodataset import VideoDataset 4 | from datasets.videodataset_multiclips import (VideoDatasetMultiClips, 5 | collate_fn) 6 | from datasets.activitynet import ActivityNet 7 | from datasets.loader import VideoLoader, VideoLoaderHDF5, VideoLoaderFlowHDF5 8 | import pdb 9 | 10 | def image_name_formatter(x): 11 | return 'image_{:05d}.jpg'.format(x) 12 | 13 | def sthv2_image_name_formatter(x): 14 | return '{:06d}.jpg'.format(x) 15 | 16 | def sthv1_image_name_formatter(x): 17 | return '{:05d}.jpg'.format(x) 18 | 19 | def get_training_data(video_path, 20 | annotation_path, 21 | dataset_name, 22 | input_type, 23 | file_type, 24 | spatial_transform=None, 25 | temporal_transform=None, 26 | target_transform=None): 27 | assert dataset_name in [ 28 | 'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit', 'somethingv2', 29 | 'somethingv1' 30 | ] 31 | assert input_type in ['rgb', 'flow'] 32 | assert file_type in ['jpg', 'hdf5'] 33 | if 'somethingv1' in dataset_name: 34 | formatter = sthv1_image_name_formatter 35 | elif 'somethingv2' in dataset_name: 36 | formatter = sthv2_image_name_formatter 37 | else: 38 | formatter = image_name_formatter 39 | if file_type == 'jpg': 40 | assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.' 
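# Loader selection (editor's note): with file_type == 'jpg' each clip is read frame-by-frame
# from individual images named by the dataset-specific formatter chosen above
# (e.g. 'image_00001.jpg' for Kinetics-style folders, '00001.jpg' for Something-Something v1);
# with 'hdf5' every video is expected as a single '<video_id>.hdf5' file and frames (or flow)
# are read from it via VideoLoaderHDF5 / VideoLoaderFlowHDF5.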
41 | 42 | if get_image_backend() == 'accimage': 43 | from datasets.loader import ImageLoaderAccImage 44 | loader = VideoLoader(formatter, ImageLoaderAccImage()) 45 | else: 46 | loader = VideoLoader(formatter) 47 | 48 | video_path_formatter = ( 49 | lambda root_path, label, video_id: root_path / label / video_id) 50 | else: 51 | if input_type == 'rgb': 52 | loader = VideoLoaderHDF5() 53 | else: 54 | loader = VideoLoaderFlowHDF5() 55 | video_path_formatter = (lambda root_path, label, video_id: root_path / 56 | label / '{}.hdf5'.format(video_id)) 57 | 58 | if dataset_name == 'activitynet': 59 | training_data = ActivityNet(video_path, 60 | annotation_path, 61 | 'training', 62 | spatial_transform=spatial_transform, 63 | temporal_transform=temporal_transform, 64 | target_transform=target_transform, 65 | video_loader=loader, 66 | video_path_formatter=video_path_formatter) 67 | else: 68 | training_data = VideoDataset(video_path, 69 | annotation_path, 70 | 'training', 71 | spatial_transform=spatial_transform, 72 | temporal_transform=temporal_transform, 73 | target_transform=target_transform, 74 | video_loader=loader, 75 | video_path_formatter=video_path_formatter) 76 | 77 | return training_data 78 | 79 | 80 | def get_validation_data(video_path, 81 | annotation_path, 82 | dataset_name, 83 | input_type, 84 | file_type, 85 | spatial_transform=None, 86 | temporal_transform=None, 87 | target_transform=None): 88 | assert dataset_name in [ 89 | 'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit', 'somethingv2', 90 | 'somethingv1' 91 | ] 92 | assert input_type in ['rgb', 'flow'] 93 | assert file_type in ['jpg', 'hdf5'] 94 | 95 | if 'somethingv1' in dataset_name: 96 | formatter = sthv1_image_name_formatter 97 | elif 'somethingv2' in dataset_name: 98 | formatter = sthv2_image_name_formatter 99 | else: 100 | formatter = image_name_formatter 101 | 102 | if file_type == 'jpg': 103 | assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.' 
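# Editor's note: unlike training, the validation path below wraps non-ActivityNet datasets in
# VideoDatasetMultiClips and also returns its collate_fn, so each video yields several clips
# (presumably controlled by --n_val_samples elsewhere) whose scores are aggregated downstream.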
104 | if get_image_backend() == 'accimage': 105 | from datasets.loader import ImageLoaderAccImage 106 | loader = VideoLoader(formatter, ImageLoaderAccImage()) 107 | else: 108 | loader = VideoLoader(formatter) 109 | 110 | video_path_formatter = ( 111 | lambda root_path, label, video_id: root_path / label / video_id) 112 | else: 113 | if input_type == 'rgb': 114 | loader = VideoLoaderHDF5() 115 | else: 116 | loader = VideoLoaderFlowHDF5() 117 | video_path_formatter = (lambda root_path, label, video_id: root_path / 118 | label / '{}.hdf5'.format(video_id)) 119 | 120 | if dataset_name == 'activitynet': 121 | validation_data = ActivityNet(video_path, 122 | annotation_path, 123 | 'validation', 124 | spatial_transform=spatial_transform, 125 | temporal_transform=temporal_transform, 126 | target_transform=target_transform, 127 | video_loader=loader, 128 | video_path_formatter=video_path_formatter) 129 | else: 130 | validation_data = VideoDatasetMultiClips( 131 | video_path, 132 | annotation_path, 133 | 'validation', 134 | spatial_transform=spatial_transform, 135 | temporal_transform=temporal_transform, 136 | target_transform=target_transform, 137 | video_loader=loader, 138 | video_path_formatter=video_path_formatter) 139 | 140 | return validation_data, collate_fn 141 | 142 | 143 | def get_inference_data(video_path, 144 | annotation_path, 145 | dataset_name, 146 | input_type, 147 | file_type, 148 | inference_subset, 149 | spatial_transform=None, 150 | temporal_transform=None, 151 | target_transform=None): 152 | assert dataset_name in [ 153 | 'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit', 'somethingv2' 154 | ] 155 | assert input_type in ['rgb', 'flow'] 156 | assert file_type in ['jpg', 'hdf5'] 157 | assert inference_subset in ['train', 'val', 'test'] 158 | 159 | if file_type == 'jpg': 160 | assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.' 
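# Editor's note: this inference path always uses image_name_formatter and 'somethingv1' is not
# listed in the dataset_name assertion above, so jpg-based inference appears to target only the
# Kinetics/UCF101/HMDB51-style frame naming, not the Something-Something layouts.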
161 | 162 | if get_image_backend() == 'accimage': 163 | from datasets.loader import ImageLoaderAccImage 164 | loader = VideoLoader(image_name_formatter, ImageLoaderAccImage()) 165 | else: 166 | loader = VideoLoader(image_name_formatter) 167 | 168 | video_path_formatter = ( 169 | lambda root_path, label, video_id: root_path / label / video_id) 170 | else: 171 | if input_type == 'rgb': 172 | loader = VideoLoaderHDF5() 173 | else: 174 | loader = VideoLoaderFlowHDF5() 175 | video_path_formatter = (lambda root_path, label, video_id: root_path / 176 | label / '{}.hdf5'.format(video_id)) 177 | 178 | if inference_subset == 'train': 179 | subset = 'training' 180 | elif inference_subset == 'val': 181 | subset = 'validation' 182 | elif inference_subset == 'test': 183 | subset = 'testing' 184 | if dataset_name == 'activitynet': 185 | inference_data = ActivityNet(video_path, 186 | annotation_path, 187 | subset, 188 | spatial_transform=spatial_transform, 189 | temporal_transform=temporal_transform, 190 | target_transform=target_transform, 191 | video_loader=loader, 192 | video_path_formatter=video_path_formatter, 193 | is_untrimmed_setting=True) 194 | else: 195 | inference_data = VideoDatasetMultiClips( 196 | video_path, 197 | annotation_path, 198 | subset, 199 | spatial_transform=spatial_transform, 200 | temporal_transform=temporal_transform, 201 | target_transform=target_transform, 202 | video_loader=loader, 203 | video_path_formatter=video_path_formatter, 204 | target_type=['video_id', 'segment']) 205 | 206 | return inference_data, collate_fn 207 | -------------------------------------------------------------------------------- /models/resnet2p1d.py: -------------------------------------------------------------------------------- 1 | import math 2 | from functools import partial 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | 9 | def get_inplanes(): 10 | return [64, 128, 256, 512] 11 | 12 | 13 | def conv1x3x3(in_planes, mid_planes, stride=1): 14 | return nn.Conv3d(in_planes, 15 | mid_planes, 16 | kernel_size=(1, 3, 3), 17 | stride=(1, stride, stride), 18 | padding=(0, 1, 1), 19 | bias=False) 20 | 21 | 22 | def conv3x1x1(mid_planes, planes, stride=1): 23 | return nn.Conv3d(mid_planes, 24 | planes, 25 | kernel_size=(3, 1, 1), 26 | stride=(stride, 1, 1), 27 | padding=(1, 0, 0), 28 | bias=False) 29 | 30 | 31 | def conv1x1x1(in_planes, out_planes, stride=1): 32 | return nn.Conv3d(in_planes, 33 | out_planes, 34 | kernel_size=1, 35 | stride=stride, 36 | bias=False) 37 | 38 | 39 | class BasicBlock(nn.Module): 40 | expansion = 1 41 | 42 | def __init__(self, in_planes, planes, stride=1, downsample=None): 43 | super().__init__() 44 | 45 | n_3d_parameters1 = in_planes * planes * 3 * 3 * 3 46 | n_2p1d_parameters1 = in_planes * 3 * 3 + 3 * planes 47 | mid_planes1 = n_3d_parameters1 // n_2p1d_parameters1 48 | self.conv1_s = conv1x3x3(in_planes, mid_planes1, stride) 49 | self.bn1_s = nn.BatchNorm3d(mid_planes1) 50 | self.conv1_t = conv3x1x1(mid_planes1, planes, stride) 51 | self.bn1_t = nn.BatchNorm3d(planes) 52 | 53 | n_3d_parameters2 = planes * planes * 3 * 3 * 3 54 | n_2p1d_parameters2 = planes * 3 * 3 + 3 * planes 55 | mid_planes2 = n_3d_parameters2 // n_2p1d_parameters2 56 | self.conv2_s = conv1x3x3(planes, mid_planes2) 57 | self.bn2_s = nn.BatchNorm3d(mid_planes2) 58 | self.conv2_t = conv3x1x1(mid_planes2, planes) 59 | self.bn2_t = nn.BatchNorm3d(planes) 60 | 61 | self.relu = nn.ReLU(inplace=True) 62 | self.downsample = downsample 63 | self.stride = stride 64 | 65 | 
def forward(self, x): 66 | residual = x 67 | 68 | out = self.conv1_s(x) 69 | out = self.bn1_s(out) 70 | out = self.relu(out) 71 | out = self.conv1_t(out) 72 | out = self.bn1_t(out) 73 | out = self.relu(out) 74 | 75 | out = self.conv2_s(out) 76 | out = self.bn2_s(out) 77 | out = self.relu(out) 78 | out = self.conv2_t(out) 79 | out = self.bn2_t(out) 80 | 81 | if self.downsample is not None: 82 | residual = self.downsample(x) 83 | 84 | out += residual 85 | out = self.relu(out) 86 | 87 | return out 88 | 89 | 90 | class Bottleneck(nn.Module): 91 | expansion = 4 92 | 93 | def __init__(self, in_planes, planes, stride=1, downsample=None): 94 | super().__init__() 95 | 96 | self.conv1 = conv1x1x1(in_planes, planes) 97 | self.bn1 = nn.BatchNorm3d(planes) 98 | 99 | n_3d_parameters = planes * planes * 3 * 3 * 3 100 | n_2p1d_parameters = planes * 3 * 3 + 3 * planes 101 | mid_planes = n_3d_parameters // n_2p1d_parameters 102 | self.conv2_s = conv1x3x3(planes, mid_planes, stride) 103 | self.bn2_s = nn.BatchNorm3d(mid_planes) 104 | self.conv2_t = conv3x1x1(mid_planes, planes, stride) 105 | self.bn2_t = nn.BatchNorm3d(planes) 106 | 107 | self.conv3 = conv1x1x1(planes, planes * self.expansion) 108 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 109 | self.relu = nn.ReLU(inplace=True) 110 | self.downsample = downsample 111 | self.stride = stride 112 | 113 | def forward(self, x): 114 | residual = x 115 | 116 | out = self.conv1(x) 117 | out = self.bn1(out) 118 | out = self.relu(out) 119 | 120 | out = self.conv2_s(out) 121 | out = self.bn2_s(out) 122 | out = self.relu(out) 123 | out = self.conv2_t(out) 124 | out = self.bn2_t(out) 125 | out = self.relu(out) 126 | 127 | out = self.conv3(out) 128 | out = self.bn3(out) 129 | 130 | if self.downsample is not None: 131 | residual = self.downsample(x) 132 | 133 | out += residual 134 | out = self.relu(out) 135 | 136 | return out 137 | 138 | 139 | class ResNet(nn.Module): 140 | 141 | def __init__(self, 142 | block, 143 | layers, 144 | block_inplanes, 145 | n_input_channels=3, 146 | conv1_t_size=7, 147 | conv1_t_stride=1, 148 | no_max_pool=False, 149 | shortcut_type='B', 150 | widen_factor=1.0, 151 | n_classes=400): 152 | super().__init__() 153 | 154 | block_inplanes = [int(x * widen_factor) for x in block_inplanes] 155 | 156 | self.in_planes = block_inplanes[0] 157 | self.no_max_pool = no_max_pool 158 | 159 | n_3d_parameters = 3 * self.in_planes * conv1_t_size * 7 * 7 160 | n_2p1d_parameters = 3 * 7 * 7 + conv1_t_size * self.in_planes 161 | mid_planes = n_3d_parameters // n_2p1d_parameters 162 | self.conv1_s = nn.Conv3d(n_input_channels, 163 | mid_planes, 164 | kernel_size=(1, 7, 7), 165 | stride=(1, 2, 2), 166 | padding=(0, 3, 3), 167 | bias=False) 168 | self.bn1_s = nn.BatchNorm3d(mid_planes) 169 | self.conv1_t = nn.Conv3d(mid_planes, 170 | self.in_planes, 171 | kernel_size=(conv1_t_size, 1, 1), 172 | stride=(conv1_t_stride, 1, 1), 173 | padding=(conv1_t_size // 2, 0, 0), 174 | bias=False) 175 | self.bn1_t = nn.BatchNorm3d(self.in_planes) 176 | self.relu = nn.ReLU(inplace=True) 177 | 178 | self.maxpool = nn.MaxPool3d(kernel_size=3, stride=2, padding=1) 179 | self.layer1 = self._make_layer(block, block_inplanes[0], layers[0], 180 | shortcut_type) 181 | self.layer2 = self._make_layer(block, 182 | block_inplanes[1], 183 | layers[1], 184 | shortcut_type, 185 | stride=2) 186 | self.layer3 = self._make_layer(block, 187 | block_inplanes[2], 188 | layers[2], 189 | shortcut_type, 190 | stride=2) 191 | self.layer4 = self._make_layer(block, 192 | block_inplanes[3], 193 | 
layers[3], 194 | shortcut_type, 195 | stride=2) 196 | 197 | self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1)) 198 | self.fc = nn.Linear(block_inplanes[3] * block.expansion, n_classes) 199 | 200 | for m in self.modules(): 201 | if isinstance(m, nn.Conv3d): 202 | nn.init.kaiming_normal_(m.weight, 203 | mode='fan_out', 204 | nonlinearity='relu') 205 | elif isinstance(m, nn.BatchNorm3d): 206 | nn.init.constant_(m.weight, 1) 207 | nn.init.constant_(m.bias, 0) 208 | 209 | def _downsample_basic_block(self, x, planes, stride): 210 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 211 | zero_pads = torch.zeros(out.size(0), planes - out.size(1), out.size(2), 212 | out.size(3), out.size(4)) 213 | if isinstance(out.data, torch.cuda.FloatTensor): 214 | zero_pads = zero_pads.cuda() 215 | 216 | out = torch.cat([out.data, zero_pads], dim=1) 217 | 218 | return out 219 | 220 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 221 | downsample = None 222 | if stride != 1 or self.in_planes != planes * block.expansion: 223 | if shortcut_type == 'A': 224 | downsample = partial(self._downsample_basic_block, 225 | planes=planes * block.expansion, 226 | stride=stride) 227 | else: 228 | downsample = nn.Sequential( 229 | conv1x1x1(self.in_planes, planes * block.expansion, stride), 230 | nn.BatchNorm3d(planes * block.expansion)) 231 | 232 | layers = [] 233 | layers.append( 234 | block(in_planes=self.in_planes, 235 | planes=planes, 236 | stride=stride, 237 | downsample=downsample)) 238 | self.in_planes = planes * block.expansion 239 | for i in range(1, blocks): 240 | layers.append(block(self.in_planes, planes)) 241 | 242 | return nn.Sequential(*layers) 243 | 244 | def forward(self, x): 245 | x = self.conv1_s(x) 246 | x = self.bn1_s(x) 247 | x = self.relu(x) 248 | x = self.conv1_t(x) 249 | x = self.bn1_t(x) 250 | x = self.relu(x) 251 | 252 | if not self.no_max_pool: 253 | x = self.maxpool(x) 254 | 255 | x = self.layer1(x) 256 | x = self.layer2(x) 257 | x = self.layer3(x) 258 | x = self.layer4(x) 259 | 260 | x = self.avgpool(x) 261 | 262 | x = x.view(x.size(0), -1) 263 | x = self.fc(x) 264 | 265 | return x 266 | 267 | 268 | def generate_model(model_depth, **kwargs): 269 | assert model_depth in [10, 18, 34, 50, 101, 152, 200] 270 | 271 | if model_depth == 10: 272 | model = ResNet(BasicBlock, [1, 1, 1, 1], get_inplanes(), **kwargs) 273 | elif model_depth == 18: 274 | model = ResNet(BasicBlock, [2, 2, 2, 2], get_inplanes(), **kwargs) 275 | elif model_depth == 34: 276 | model = ResNet(BasicBlock, [3, 4, 6, 3], get_inplanes(), **kwargs) 277 | elif model_depth == 50: 278 | model = ResNet(Bottleneck, [3, 4, 6, 3], get_inplanes(), **kwargs) 279 | elif model_depth == 101: 280 | model = ResNet(Bottleneck, [3, 4, 23, 3], get_inplanes(), **kwargs) 281 | elif model_depth == 152: 282 | model = ResNet(Bottleneck, [3, 8, 36, 3], get_inplanes(), **kwargs) 283 | elif model_depth == 200: 284 | model = ResNet(Bottleneck, [3, 24, 36, 3], get_inplanes(), **kwargs) 285 | 286 | return model -------------------------------------------------------------------------------- /models/resnet_strg.py: -------------------------------------------------------------------------------- 1 | import math 2 | from functools import partial 3 | import pdb 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torchvision.models as models 8 | 9 | def get_inplanes(): 10 | return [64, 128, 256, 512] 11 | 12 | 13 | def conv3x3x3(in_planes, out_planes, stride=1): 14 | return nn.Conv3d(in_planes, 15 | 
out_planes, 16 | kernel_size=3, 17 | stride=stride, 18 | padding=1, 19 | bias=False) 20 | 21 | 22 | def conv1x1x1(in_planes, out_planes, stride=1): 23 | return nn.Conv3d(in_planes, 24 | out_planes, 25 | kernel_size=1, 26 | stride=stride, 27 | bias=False) 28 | 29 | 30 | def conv1x3x3(in_planes, out_planes, stride=1): 31 | return nn.Conv3d(in_planes, 32 | out_planes, 33 | kernel_size=(1,3,3), 34 | stride=stride, 35 | padding=(0,1,1), 36 | bias=False) 37 | 38 | 39 | def conv3x1x1(in_planes, out_planes, stride=1): 40 | return nn.Conv3d(in_planes, 41 | out_planes, 42 | kernel_size=(3,1,1), 43 | stride=stride, 44 | padding=(1,0,0), 45 | bias=False) 46 | 47 | 48 | 49 | class BasicBlock(nn.Module): 50 | expansion = 1 51 | 52 | def __init__(self, in_planes, planes, stride=1, downsample=None): 53 | super().__init__() 54 | 55 | self.conv1 = conv3x3x3(in_planes, planes, stride) 56 | self.bn1 = nn.BatchNorm3d(planes) 57 | self.relu = nn.ReLU(inplace=True) 58 | self.conv2 = conv3x3x3(planes, planes) 59 | self.bn2 = nn.BatchNorm3d(planes) 60 | self.downsample = downsample 61 | self.stride = stride 62 | 63 | def forward(self, x): 64 | residual = x 65 | 66 | out = self.conv1(x) 67 | out = self.bn1(out) 68 | out = self.relu(out) 69 | 70 | out = self.conv2(out) 71 | out = self.bn2(out) 72 | 73 | if self.downsample is not None: 74 | residual = self.downsample(x) 75 | 76 | out += residual 77 | out = self.relu(out) 78 | 79 | return out 80 | 81 | 82 | class Bottleneck(nn.Module): 83 | expansion = 4 84 | 85 | def __init__(self, in_planes, planes, stride=1, downsample=None): 86 | super().__init__() 87 | 88 | self.conv1 = conv3x1x1(in_planes, planes) 89 | self.bn1 = nn.BatchNorm3d(planes) 90 | self.conv2 = conv1x3x3(planes, planes, stride) 91 | self.bn2 = nn.BatchNorm3d(planes) 92 | self.conv3 = conv1x1x1(planes, planes * self.expansion) 93 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 94 | self.relu = nn.ReLU(inplace=True) 95 | self.downsample = downsample 96 | self.stride = stride 97 | 98 | def forward(self, x): 99 | residual = x 100 | 101 | out = self.conv1(x) 102 | out = self.bn1(out) 103 | out = self.relu(out) 104 | 105 | out = self.conv2(out) 106 | out = self.bn2(out) 107 | out = self.relu(out) 108 | 109 | out = self.conv3(out) 110 | out = self.bn3(out) 111 | 112 | if self.downsample is not None: 113 | residual = self.downsample(x) 114 | 115 | out += residual 116 | out = self.relu(out) 117 | 118 | return out 119 | 120 | 121 | class ResNet(nn.Module): 122 | 123 | def __init__(self, 124 | block, 125 | layers, 126 | block_inplanes, 127 | n_input_channels=3, 128 | conv1_t_size=5, 129 | conv1_t_stride=1, 130 | no_max_pool=False, 131 | shortcut_type='B', 132 | widen_factor=1.0, 133 | n_classes=400): 134 | super().__init__() 135 | 136 | block_inplanes = [int(x * widen_factor) for x in block_inplanes] 137 | 138 | self.in_planes = block_inplanes[0] 139 | self.no_max_pool = no_max_pool 140 | 141 | self.conv1 = nn.Conv3d(n_input_channels, 142 | self.in_planes, 143 | kernel_size=(conv1_t_size, 7, 7), 144 | stride=(conv1_t_stride, 2, 2), 145 | padding=(conv1_t_size // 2, 3, 3), 146 | bias=False) 147 | self.bn1 = nn.BatchNorm3d(self.in_planes) 148 | self.relu = nn.ReLU(inplace=True) 149 | self.maxpool1 = nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)) 150 | self.maxpool2 = nn.MaxPool3d(kernel_size=(3,1,1), stride=(2,1,1), padding=(1,0,0)) 151 | self.layer1 = self._make_layer(block, block_inplanes[0], layers[0], 152 | shortcut_type) 153 | self.layer2 = self._make_layer(block, 154 | 
block_inplanes[1], 155 | layers[1], 156 | shortcut_type, 157 | stride=(1,2,2)) 158 | self.layer3 = self._make_layer(block, 159 | block_inplanes[2], 160 | layers[2], 161 | shortcut_type, 162 | stride=(1,2,2)) 163 | self.layer4 = self._make_layer(block, 164 | block_inplanes[3], 165 | layers[3], 166 | shortcut_type, 167 | stride=1) 168 | 169 | self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1)) 170 | self.fc = nn.Linear(block_inplanes[3] * block.expansion, n_classes) 171 | 172 | for m in self.modules(): 173 | if isinstance(m, nn.Conv3d): 174 | nn.init.kaiming_normal_(m.weight, 175 | mode='fan_out', 176 | nonlinearity='relu') 177 | elif isinstance(m, nn.BatchNorm3d): 178 | nn.init.constant_(m.weight, 1) 179 | nn.init.constant_(m.bias, 0) 180 | 181 | def _downsample_basic_block(self, x, planes, stride): 182 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 183 | zero_pads = torch.zeros(out.size(0), planes - out.size(1), out.size(2), 184 | out.size(3), out.size(4)) 185 | if isinstance(out.data, torch.cuda.FloatTensor): 186 | zero_pads = zero_pads.cuda() 187 | 188 | out = torch.cat([out.data, zero_pads], dim=1) 189 | 190 | return out 191 | 192 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 193 | downsample = None 194 | if stride != 1 or self.in_planes != planes * block.expansion: 195 | if shortcut_type == 'A': 196 | downsample = partial(self._downsample_basic_block, 197 | planes=planes * block.expansion, 198 | stride=stride) 199 | else: 200 | downsample = nn.Sequential( 201 | conv1x1x1(self.in_planes, planes * block.expansion, stride), 202 | nn.BatchNorm3d(planes * block.expansion)) 203 | 204 | layers = [] 205 | layers.append( 206 | block(in_planes=self.in_planes, 207 | planes=planes, 208 | stride=stride, 209 | downsample=downsample)) 210 | self.in_planes = planes * block.expansion 211 | for i in range(1, blocks): 212 | layers.append(block(self.in_planes, planes)) 213 | 214 | return nn.Sequential(*layers) 215 | 216 | def extract_feature(self, x): 217 | x = self.conv1(x) 218 | x = self.bn1(x) 219 | x = self.relu(x) 220 | if not self.no_max_pool: 221 | x = self.maxpool1(x) 222 | 223 | x = self.layer1(x) 224 | if not self.no_max_pool: 225 | x = self.maxpool2(x) 226 | x = self.layer2(x) 227 | x = self.layer3(x) 228 | x = self.layer4(x) 229 | return x 230 | 231 | 232 | def forward(self, x): 233 | x = self.conv1(x) 234 | x = self.bn1(x) 235 | x = self.relu(x) 236 | if not self.no_max_pool: 237 | x = self.maxpool1(x) 238 | 239 | x = self.layer1(x) 240 | if not self.no_max_pool: 241 | x = self.maxpool2(x) 242 | x = self.layer2(x) 243 | x = self.layer3(x) 244 | x = self.layer4(x) 245 | x = self.avgpool(x) 246 | 247 | x = x.view(x.size(0), -1) 248 | x = self.fc(x) 249 | 250 | return x 251 | 252 | def load_imagenet_pretrained(self, resnet2d): # only ResNet 50 implemented 253 | print("Load ImageNet pre-trained weight") 254 | state_dict_2d = resnet2d.state_dict() 255 | state_dict = self.state_dict() 256 | for k in state_dict.keys(): 257 | v_2d = state_dict_2d[k] 258 | if len(state_dict[k].shape) != len(v_2d.shape): 259 | state_dict[k] = v_2d.unsqueeze(2) 260 | else: 261 | state_dict[k] = v_2d 262 | 263 | 264 | 265 | def generate_model(model_depth, **kwargs): 266 | assert model_depth in [10, 18, 34, 50, 101, 152, 200] 267 | 268 | if model_depth == 10: 269 | model = ResNet(BasicBlock, [1, 1, 1, 1], get_inplanes(), **kwargs) 270 | model.load_imagenet_pretrained(models.resnet10(pretrained=True)) 271 | elif model_depth == 18: 272 | model = ResNet(BasicBlock, [2, 2, 2, 2], get_inplanes(), 
**kwargs) 273 | model.load_imagenet_pretrained(models.resnet18(pretrained=True)) 274 | elif model_depth == 34: 275 | model = ResNet(BasicBlock, [3, 4, 6, 3], get_inplanes(), **kwargs) 276 | model.load_imagenet_pretrained(models.resnet34(pretrained=True)) 277 | elif model_depth == 50: 278 | model = ResNet(Bottleneck, [3, 4, 6, 3], get_inplanes(), **kwargs) 279 | model.load_imagenet_pretrained(models.resnet50(pretrained=True)) 280 | elif model_depth == 101: 281 | model = ResNet(Bottleneck, [3, 4, 23, 3], get_inplanes(), **kwargs) 282 | model.load_imagenet_pretrained(models.resnet101(pretrained=True)) 283 | elif model_depth == 152: 284 | model = ResNet(Bottleneck, [3, 8, 36, 3], get_inplanes(), **kwargs) 285 | model.load_imagenet_pretrained(models.resnet152(pretrained=True)) 286 | elif model_depth == 200: 287 | model = ResNet(Bottleneck, [3, 24, 36, 3], get_inplanes(), **kwargs) 288 | model.load_imagenet_pretrained(models.resnet200(pretrained=True)) 289 | 290 | return model 291 | 292 | if __name__ == '__main__': 293 | 294 | model = generate_model(model_depth=50, 295 | n_classes=174, 296 | n_input_channels=3, 297 | shortcut_type='B', 298 | conv1_t_size=5, 299 | conv1_t_stride=1, 300 | no_max_pool=False, 301 | widen_factor=1.0) 302 | model = model#.cuda() 303 | 304 | pdb.set_trace() 305 | inputs = torch.rand((4,3,32,224,224))#.cuda() 306 | out = model(inputs) 307 | pdb.set_trace() 308 | -------------------------------------------------------------------------------- /transform.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import torch 4 | from torch import nn, Tensor 5 | from torch.nn import functional as F 6 | import torchvision 7 | from torch.jit.annotations import List, Tuple, Dict, Optional 8 | 9 | from torchvision.models.detection.image_list import ImageList 10 | from torchvision.models.detection.roi_heads import paste_masks_in_image 11 | import pdb 12 | 13 | @torch.jit.unused 14 | def _resize_image_and_masks_onnx(image, self_min_size, self_max_size, target): 15 | # type: (Tensor, float, float, Optional[Dict[str, Tensor]]) -> Tuple[Tensor, Optional[Dict[str, Tensor]]] 16 | from torch.onnx import operators 17 | im_shape = operators.shape_as_tensor(image)[-2:] 18 | min_size = torch.min(im_shape).to(dtype=torch.float32) 19 | max_size = torch.max(im_shape).to(dtype=torch.float32) 20 | scale_factor = torch.min(self_min_size / min_size, self_max_size / max_size) 21 | 22 | image = torch.nn.functional.interpolate( 23 | image[None], scale_factor=scale_factor, mode='bilinear', 24 | align_corners=False)[0] 25 | 26 | if target is None: 27 | return image, target 28 | 29 | if "masks" in target: 30 | mask = target["masks"] 31 | mask = F.interpolate(mask[:, None].float(), scale_factor=scale_factor)[:, 0].byte() 32 | target["masks"] = mask 33 | return image, target 34 | 35 | 36 | def _resize_image_and_masks(image, self_min_size, self_max_size, target): 37 | # type: (Tensor, float, float, Optional[Dict[str, Tensor]]) -> Tuple[Tensor, Optional[Dict[str, Tensor]]] 38 | im_shape = torch.tensor(image.shape[-2:]) 39 | min_size = float(torch.min(im_shape)) 40 | max_size = float(torch.max(im_shape)) 41 | scale_factor = self_min_size / min_size 42 | if max_size * scale_factor > self_max_size: 43 | scale_factor = self_max_size / max_size 44 | image = torch.nn.functional.interpolate( 45 | image[None], scale_factor=scale_factor, mode='bilinear', 46 | align_corners=False)[0] 47 | 48 | if target is None: 49 | return image, target 50 | 51 | if 
"masks" in target: 52 | mask = target["masks"] 53 | mask = F.interpolate(mask[:, None].float(), scale_factor=scale_factor)[:, 0].byte() 54 | target["masks"] = mask 55 | return image, target 56 | 57 | 58 | class STRGTransform(nn.Module): 59 | """ 60 | Performs input / target transformation before feeding the data to a GeneralizedRCNN 61 | model. 62 | 63 | The transformations it perform are: 64 | - input normalization (mean subtraction and std division) 65 | - input / target resizing to match min_size / max_size 66 | 67 | It returns a ImageList for the inputs, and a List[Dict[Tensor]] for the targets 68 | """ 69 | 70 | def __init__(self, min_size, max_size, image_mean, image_std): 71 | super(STRGTransform, self).__init__() 72 | if not isinstance(min_size, (list, tuple)): 73 | min_size = (min_size,) 74 | self.min_size = min_size 75 | self.max_size = max_size 76 | self.image_mean = image_mean 77 | self.image_std = image_std 78 | 79 | def forward(self, 80 | images, # type: List[Tensor] 81 | targets=None # type: Optional[List[Dict[str, Tensor]]] 82 | ): 83 | # type: (...) -> Tuple[ImageList, Optional[List[Dict[str, Tensor]]]] 84 | images = [img for img in images] 85 | if targets is not None: 86 | # make a copy of targets to avoid modifying it in-place 87 | # once torchscript supports dict comprehension 88 | # this can be simplified as as follows 89 | # targets = [{k: v for k,v in t.items()} for t in targets] 90 | targets_copy: List[Dict[str, Tensor]] = [] 91 | for t in targets: 92 | data: Dict[str, Tensor] = {} 93 | for k, v in t.items(): 94 | data[k] = v 95 | targets_copy.append(data) 96 | targets = targets_copy 97 | for i in range(len(images)): 98 | image = images[i] 99 | target_index = targets[i] if targets is not None else None 100 | 101 | if image.dim() != 3: 102 | raise ValueError("images is expected to be a list of 3d tensors " 103 | "of shape [C, H, W], got {}".format(image.shape)) 104 | image = self.normalize(image) 105 | image, target_index = self.resize(image, target_index) 106 | images[i] = image 107 | if targets is not None and target_index is not None: 108 | targets[i] = target_index 109 | 110 | image_sizes = [img.shape[-2:] for img in images] 111 | images = self.batch_images(images) 112 | image_sizes_list = torch.jit.annotate(List[Tuple[int, int]], []) 113 | for image_size in image_sizes: 114 | assert len(image_size) == 2 115 | image_sizes_list.append((image_size[0], image_size[1])) 116 | 117 | image_list = ImageList(images, image_sizes_list) 118 | return image_list, targets 119 | 120 | 121 | def normalize(self, image): 122 | return image 123 | # dtype, device = image.dtype, image.device 124 | # mean = torch.as_tensor(self.image_mean, dtype=dtype, device=device) 125 | # std = torch.as_tensor(self.image_std, dtype=dtype, device=device) 126 | # return (image - mean[:, None, None]) / std[:, None, None] 127 | 128 | def torch_choice(self, k): 129 | # type: (List[int]) -> int 130 | """ 131 | Implements `random.choice` via torch ops so it can be compiled with 132 | TorchScript. Remove if https://github.com/pytorch/pytorch/issues/25803 133 | is fixed. 
134 | """ 135 | index = int(torch.empty(1).uniform_(0., float(len(k))).item()) 136 | return k[index] 137 | 138 | def resize(self, image, target): 139 | # type: (Tensor, Optional[Dict[str, Tensor]]) -> Tuple[Tensor, Optional[Dict[str, Tensor]]] 140 | h, w = image.shape[-2:] 141 | if self.training: 142 | size = float(self.torch_choice(self.min_size)) 143 | else: 144 | # FIXME assume for now that testing uses the largest scale 145 | size = float(self.min_size[-1]) 146 | if torchvision._is_tracing(): 147 | image, target = _resize_image_and_masks_onnx(image, size, float(self.max_size), target) 148 | else: 149 | image, target = _resize_image_and_masks(image, size, float(self.max_size), target) 150 | 151 | if target is None: 152 | return image, target 153 | 154 | bbox = target["boxes"] 155 | bbox = resize_boxes(bbox, (h, w), image.shape[-2:]) 156 | target["boxes"] = bbox 157 | 158 | if "keypoints" in target: 159 | keypoints = target["keypoints"] 160 | keypoints = resize_keypoints(keypoints, (h, w), image.shape[-2:]) 161 | target["keypoints"] = keypoints 162 | return image, target 163 | 164 | # _onnx_batch_images() is an implementation of 165 | # batch_images() that is supported by ONNX tracing. 166 | @torch.jit.unused 167 | def _onnx_batch_images(self, images, size_divisible=32): 168 | # type: (List[Tensor], int) -> Tensor 169 | max_size = [] 170 | for i in range(images[0].dim()): 171 | max_size_i = torch.max(torch.stack([img.shape[i] for img in images]).to(torch.float32)).to(torch.int64) 172 | max_size.append(max_size_i) 173 | stride = size_divisible 174 | max_size[1] = (torch.ceil((max_size[1].to(torch.float32)) / stride) * stride).to(torch.int64) 175 | max_size[2] = (torch.ceil((max_size[2].to(torch.float32)) / stride) * stride).to(torch.int64) 176 | max_size = tuple(max_size) 177 | 178 | # work around for 179 | # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 180 | # which is not yet supported in onnx 181 | padded_imgs = [] 182 | for img in images: 183 | padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] 184 | padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) 185 | padded_imgs.append(padded_img) 186 | 187 | return torch.stack(padded_imgs) 188 | 189 | def max_by_axis(self, the_list): 190 | # type: (List[List[int]]) -> List[int] 191 | maxes = the_list[0] 192 | for sublist in the_list[1:]: 193 | for index, item in enumerate(sublist): 194 | maxes[index] = max(maxes[index], item) 195 | return maxes 196 | 197 | def batch_images(self, images, size_divisible=32): 198 | # type: (List[Tensor], int) -> Tensor 199 | if torchvision._is_tracing(): 200 | # batch_images() does not export well to ONNX 201 | # call _onnx_batch_images() instead 202 | return self._onnx_batch_images(images, size_divisible) 203 | 204 | max_size = self.max_by_axis([list(img.shape) for img in images]) 205 | stride = float(size_divisible) 206 | max_size = list(max_size) 207 | max_size[1] = int(math.ceil(float(max_size[1]) / stride) * stride) 208 | max_size[2] = int(math.ceil(float(max_size[2]) / stride) * stride) 209 | 210 | batch_shape = [len(images)] + max_size 211 | batched_imgs = images[0].new_full(batch_shape, 0) 212 | for img, pad_img in zip(images, batched_imgs): 213 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 214 | 215 | return batched_imgs 216 | 217 | 218 | def rpn_postprocess(self, 219 | result, # type: List[Dict[str, Tensor]] 220 | image_shapes, # type: List[Tuple[int, int]] 221 | original_image_sizes # type: 
List[Tuple[int, int]] 222 | ): 223 | # type: (...) -> List[Dict[str, Tensor]] 224 | if self.training: 225 | return result 226 | new_boxes = [] 227 | for i, (boxes, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)): 228 | boxes = resize_boxes(boxes, im_s, o_im_s) 229 | new_boxes.append(boxes) 230 | return new_boxes 231 | 232 | 233 | 234 | def postprocess(self, 235 | result, # type: List[Dict[str, Tensor]] 236 | image_shapes, # type: List[Tuple[int, int]] 237 | original_image_sizes # type: List[Tuple[int, int]] 238 | ): 239 | # type: (...) -> List[Dict[str, Tensor]] 240 | if self.training: 241 | return result 242 | for i, (pred, im_s, o_im_s) in enumerate(zip(result, image_shapes, original_image_sizes)): 243 | boxes = pred["boxes"] 244 | boxes = resize_boxes(boxes, im_s, o_im_s) 245 | result[i]["boxes"] = boxes 246 | if "masks" in pred: 247 | masks = pred["masks"] 248 | masks = paste_masks_in_image(masks, boxes, o_im_s) 249 | result[i]["masks"] = masks 250 | if "keypoints" in pred: 251 | keypoints = pred["keypoints"] 252 | keypoints = resize_keypoints(keypoints, im_s, o_im_s) 253 | result[i]["keypoints"] = keypoints 254 | return result 255 | 256 | def __repr__(self): 257 | format_string = self.__class__.__name__ + '(' 258 | _indent = '\n ' 259 | format_string += "{0}Normalize(mean={1}, std={2})".format(_indent, self.image_mean, self.image_std) 260 | format_string += "{0}Resize(min_size={1}, max_size={2}, mode='bilinear')".format(_indent, self.min_size, 261 | self.max_size) 262 | format_string += '\n)' 263 | return format_string 264 | 265 | 266 | def resize_keypoints(keypoints, original_size, new_size): 267 | # type: (Tensor, List[int], List[int]) -> Tensor 268 | ratios = [ 269 | torch.tensor(s, dtype=torch.float32, device=keypoints.device) / 270 | torch.tensor(s_orig, dtype=torch.float32, device=keypoints.device) 271 | for s, s_orig in zip(new_size, original_size) 272 | ] 273 | ratio_h, ratio_w = ratios 274 | resized_data = keypoints.clone() 275 | if torch._C._get_tracing_state(): 276 | resized_data_0 = resized_data[:, :, 0] * ratio_w 277 | resized_data_1 = resized_data[:, :, 1] * ratio_h 278 | resized_data = torch.stack((resized_data_0, resized_data_1, resized_data[:, :, 2]), dim=2) 279 | else: 280 | resized_data[..., 0] *= ratio_w 281 | resized_data[..., 1] *= ratio_h 282 | return resized_data 283 | 284 | 285 | def resize_boxes(boxes, original_size, new_size): 286 | # type: (Tensor, List[int], List[int]) -> Tensor 287 | ratios = [ 288 | torch.tensor(s, dtype=torch.float32, device=boxes.device) / 289 | torch.tensor(s_orig, dtype=torch.float32, device=boxes.device) 290 | for s, s_orig in zip(new_size, original_size) 291 | ] 292 | ratio_height, ratio_width = ratios 293 | xmin, ymin, xmax, ymax = boxes.unbind(1) 294 | 295 | xmin = xmin * ratio_width 296 | xmax = xmax * ratio_width 297 | ymin = ymin * ratio_height 298 | ymax = ymax * ratio_height 299 | return torch.stack((xmin, ymin, xmax, ymax), dim=1) 300 | -------------------------------------------------------------------------------- /opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | 5 | def parse_opts(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--root_path', 8 | default=None, 9 | type=Path, 10 | help='Root directory path') 11 | parser.add_argument('--video_path', 12 | default=None, 13 | type=Path, 14 | help='Directory path of videos') 15 | parser.add_argument('--annotation_path', 16 | 
default=None, 17 | type=Path, 18 | help='Annotation file path') 19 | parser.add_argument('--result_path', 20 | default=None, 21 | type=Path, 22 | help='Result directory path') 23 | parser.add_argument( 24 | '--dataset', 25 | default='kinetics', 26 | type=str, 27 | help='Used dataset (activitynet | kinetics | ucf101 | hmdb51)') 28 | parser.add_argument( 29 | '--n_classes', 30 | default=400, 31 | type=int, 32 | help= 33 | 'Number of classes (activitynet: 200, kinetics: 400 or 600, ucf101: 101, hmdb51: 51)' 34 | ) 35 | parser.add_argument('--n_pretrain_classes', 36 | default=0, 37 | type=int, 38 | help=('Number of classes of pretraining task.' 39 | 'When using --pretrain_path, this must be set.')) 40 | parser.add_argument('--pretrain_path', 41 | default=None, 42 | type=Path, 43 | help='Pretrained model path (.pth).') 44 | parser.add_argument( 45 | '--ft_begin_module', 46 | default='', 47 | type=str, 48 | help=('Module name of beginning of fine-tuning' 49 | '(conv1, layer1, fc, denseblock1, classifier, ...).' 50 | 'The default means all layers are fine-tuned.')) 51 | parser.add_argument('--sample_size', 52 | default=224, 53 | type=int, 54 | help='Height and width of inputs') 55 | parser.add_argument('--sample_duration', 56 | default=16, 57 | type=int, 58 | help='Temporal duration of inputs') 59 | parser.add_argument( 60 | '--sample_t_stride', 61 | default=1, 62 | type=int, 63 | help='If larger than 1, input frames are subsampled with the stride.') 64 | parser.add_argument( 65 | '--train_crop', 66 | default='random', 67 | type=str, 68 | help=('Spatial cropping method in training. ' 69 | 'random is uniform. ' 70 | 'corner is selection from 4 corners and 1 center. ' 71 | '(random | corner | center)')) 72 | parser.add_argument('--train_crop_min_scale', 73 | default=0.25, 74 | type=float, 75 | help='Min scale for random cropping in training') 76 | parser.add_argument('--train_crop_min_ratio', 77 | default=0.75, 78 | type=float, 79 | help='Min aspect ratio for random cropping in training') 80 | parser.add_argument('--no_hflip', 81 | action='store_true', 82 | help='If true holizontal flipping is not performed.') 83 | parser.add_argument('--colorjitter', 84 | action='store_true', 85 | help='If true colorjitter is performed.') 86 | parser.add_argument('--train_t_crop', 87 | default='random', 88 | type=str, 89 | help=('Temporal cropping method in training. ' 90 | 'random is uniform. ' 91 | '(random | center)')) 92 | parser.add_argument('--learning_rate', 93 | default=0.1, 94 | type=float, 95 | help=('Initial learning rate' 96 | '(divided by 10 while training by lr scheduler)')) 97 | parser.add_argument('--momentum', default=0.9, type=float, help='Momentum') 98 | parser.add_argument('--dampening', 99 | default=0.0, 100 | type=float, 101 | help='dampening of SGD') 102 | parser.add_argument('--weight_decay', 103 | default=1e-3, 104 | type=float, 105 | help='Weight Decay') 106 | parser.add_argument('--mean_dataset', 107 | default='kinetics', 108 | type=str, 109 | help=('dataset for mean values of mean subtraction' 110 | '(activitynet | kinetics | 0.5)')) 111 | parser.add_argument('--no_mean_norm', 112 | action='store_true', 113 | help='If true, inputs are not normalized by mean.') 114 | parser.add_argument( 115 | '--no_std_norm', 116 | action='store_true', 117 | help='If true, inputs are not normalized by standard deviation.') 118 | parser.add_argument( 119 | '--value_scale', 120 | default=1, 121 | type=int, 122 | help= 123 | 'If 1, range of inputs is [0-1]. 
If 255, range of inputs is [0-255].') 124 | parser.add_argument('--nesterov', 125 | action='store_true', 126 | help='Nesterov momentum') 127 | parser.add_argument('--optimizer', 128 | default='sgd', 129 | type=str, 130 | help='Currently only support SGD') 131 | parser.add_argument('--lr_scheduler', 132 | default='multistep', 133 | type=str, 134 | help='Type of LR scheduler (multistep | plateau)') 135 | parser.add_argument( 136 | '--multistep_milestones', 137 | default=[50, 100, 150], 138 | type=int, 139 | nargs='+', 140 | help='Milestones of LR scheduler. See documentation of MultistepLR.') 141 | parser.add_argument( 142 | '--overwrite_milestones', 143 | action='store_true', 144 | help='If true, overwriting multistep_milestones when resuming training.' 145 | ) 146 | parser.add_argument( 147 | '--plateau_patience', 148 | default=10, 149 | type=int, 150 | help='Patience of LR scheduler. See documentation of ReduceLROnPlateau.' 151 | ) 152 | parser.add_argument('--batch_size', 153 | default=128, 154 | type=int, 155 | help='Batch Size') 156 | parser.add_argument( 157 | '--inference_batch_size', 158 | default=0, 159 | type=int, 160 | help='Batch Size for inference. 0 means this is the same as batch_size.' 161 | ) 162 | parser.add_argument( 163 | '--batchnorm_sync', 164 | action='store_true', 165 | help='If true, SyncBatchNorm is used instead of BatchNorm.') 166 | parser.add_argument('--n_epochs', 167 | default=200, 168 | type=int, 169 | help='Number of total epochs to run') 170 | parser.add_argument('--n_val_samples', 171 | default=3, 172 | type=int, 173 | help='Number of validation samples for each activity') 174 | parser.add_argument('--resume_path', 175 | default=None, 176 | type=Path, 177 | help='Save data (.pth) of previous training') 178 | parser.add_argument('--no_train', 179 | action='store_true', 180 | help='If true, training is not performed.') 181 | parser.add_argument('--no_val', 182 | action='store_true', 183 | help='If true, validation is not performed.') 184 | parser.add_argument('--inference', 185 | action='store_true', 186 | help='If true, inference is performed.') 187 | parser.add_argument('--inference_subset', 188 | default='val', 189 | type=str, 190 | help='Used subset in inference (train | val | test)') 191 | parser.add_argument('--inference_stride', 192 | default=16, 193 | type=int, 194 | help='Stride of sliding window in inference.') 195 | parser.add_argument( 196 | '--inference_crop', 197 | default='center', 198 | type=str, 199 | help=('Cropping method in inference. 
(center | nocrop)' 200 | 'When nocrop, fully convolutional inference is performed,' 201 | 'and mini-batch consists of clips of one video.')) 202 | parser.add_argument( 203 | '--inference_no_average', 204 | action='store_true', 205 | help='If true, outputs for segments in a video are not averaged.') 206 | parser.add_argument('--no_cuda', 207 | action='store_true', 208 | help='If true, cuda is not used.') 209 | parser.add_argument('--n_threads', 210 | default=1, 211 | type=int, 212 | help='Number of threads for multi-thread loading') 213 | parser.add_argument('--checkpoint', 214 | default=10, 215 | type=int, 216 | help='Trained model is saved at every this epochs.') 217 | parser.add_argument( 218 | '--model', 219 | default='resnet', 220 | type=str, 221 | help= 222 | '(resnet | resnet2p1d | preresnet | wideresnet | resnext | densenet | ') 223 | parser.add_argument('--model_depth', 224 | default=18, 225 | type=int, 226 | help='Depth of resnet (10 | 18 | 34 | 50 | 101)') 227 | parser.add_argument('--conv1_t_size', 228 | default=7, 229 | type=int, 230 | help='Kernel size in t dim of conv1.') 231 | parser.add_argument('--conv1_t_stride', 232 | default=1, 233 | type=int, 234 | help='Stride in t dim of conv1.') 235 | parser.add_argument('--no_max_pool', 236 | action='store_true', 237 | help='If true, the max pooling after conv1 is removed.') 238 | parser.add_argument('--resnet_shortcut', 239 | default='B', 240 | type=str, 241 | help='Shortcut type of resnet (A | B)') 242 | parser.add_argument( 243 | '--resnet_widen_factor', 244 | default=1.0, 245 | type=float, 246 | help='The number of feature maps of resnet is multiplied by this value') 247 | parser.add_argument('--wide_resnet_k', 248 | default=2, 249 | type=int, 250 | help='Wide resnet k') 251 | parser.add_argument('--resnext_cardinality', 252 | default=32, 253 | type=int, 254 | help='ResNeXt cardinality') 255 | parser.add_argument('--input_type', 256 | default='rgb', 257 | type=str, 258 | help='(rgb | flow)') 259 | parser.add_argument('--manual_seed', 260 | default=1, 261 | type=int, 262 | help='Manually set random seed') 263 | parser.add_argument('--accimage', 264 | action='store_true', 265 | help='If true, accimage is used to load images.') 266 | parser.add_argument('--output_topk', 267 | default=5, 268 | type=int, 269 | help='Top-k scores are saved in json file.') 270 | parser.add_argument('--file_type', 271 | default='jpg', 272 | type=str, 273 | help='(jpg | hdf5)') 274 | parser.add_argument('--tensorboard', 275 | action='store_true', 276 | help='If true, output tensorboard log file.') 277 | parser.add_argument( 278 | '--distributed', 279 | action='store_true', 280 | help='Use multi-processing distributed training to launch ' 281 | 'N processes per node, which has N GPUs.') 282 | parser.add_argument('--dist_url', 283 | default='tcp://127.0.0.1:23456', 284 | type=str, 285 | help='url used to set up distributed training') 286 | parser.add_argument('--world_size', 287 | default=-1, 288 | type=int, 289 | help='number of nodes for distributed training') 290 | 291 | parser.add_argument('--wandb', 292 | action='store_true', 293 | help='Use wandb.') 294 | parser.add_argument('--strg', 295 | action='store_true', 296 | help='Use STRG.') 297 | 298 | parser.add_argument('--det_interval', 299 | default=2, 300 | type=int, 301 | help='Detection Interval which should be aligned with' 302 | 'backbone architecture.') 303 | 304 | parser.add_argument('--nrois', 305 | default=10, 306 | type=int, 307 | help='The number of rois') 308 | 309 | args = 
parser.parse_args() 310 | 311 | return args 312 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import json 3 | import random 4 | import os 5 | import pdb 6 | import numpy as np 7 | import wandb 8 | import torch 9 | from torch.nn import CrossEntropyLoss 10 | from torch.optim import SGD, lr_scheduler 11 | import torch.multiprocessing as mp 12 | import torch.distributed as dist 13 | from torch.backends import cudnn 14 | import torchvision 15 | from torchvision.models.detection import fasterrcnn_resnet50_fpn 16 | 17 | from opts import parse_opts 18 | from model import (generate_model, load_pretrained_model, make_data_parallel, 19 | get_fine_tuning_parameters) 20 | from strg import STRG 21 | from mean import get_mean_std 22 | from spatial_transforms import (Compose, Normalize, Resize, CenterCrop, 23 | CornerCrop, MultiScaleCornerCrop, 24 | RandomResizedCrop, RandomHorizontalFlip, 25 | ToTensor, ScaleValue, ColorJitter, 26 | PickFirstChannels) 27 | from temporal_transforms import (LoopPadding, TemporalRandomCrop, 28 | TemporalCenterCrop, TemporalEvenCrop, 29 | SlidingWindow, TemporalSubsampling) 30 | from temporal_transforms import Compose as TemporalCompose 31 | from dataset import get_training_data, get_validation_data, get_inference_data 32 | from utils import Logger, worker_init_fn, get_lr 33 | from training import train_epoch 34 | from validation import val_epoch 35 | import inference 36 | 37 | from rpn import RPN 38 | 39 | 40 | def json_serial(obj): 41 | if isinstance(obj, Path): 42 | return str(obj) 43 | 44 | 45 | def get_opt(): 46 | opt = parse_opts() 47 | 48 | if opt.root_path is not None: 49 | opt.video_path = opt.root_path / opt.video_path 50 | opt.annotation_path = opt.root_path / opt.annotation_path 51 | opt.result_path = opt.root_path / opt.result_path 52 | if opt.resume_path is not None: 53 | opt.resume_path = opt.root_path / opt.resume_path 54 | if opt.pretrain_path is not None: 55 | opt.pretrain_path = opt.root_path / opt.pretrain_path 56 | 57 | if opt.pretrain_path is not None: 58 | opt.n_finetune_classes = opt.n_classes 59 | opt.n_classes = opt.n_pretrain_classes 60 | 61 | if opt.output_topk <= 0: 62 | opt.output_topk = opt.n_classes 63 | 64 | if opt.inference_batch_size == 0: 65 | opt.inference_batch_size = opt.batch_size 66 | 67 | opt.arch = '{}-{}'.format(opt.model, opt.model_depth) 68 | opt.begin_epoch = 1 69 | opt.mean, opt.std = get_mean_std(opt.value_scale, dataset=opt.mean_dataset) 70 | opt.n_input_channels = 3 71 | if opt.input_type == 'flow': 72 | opt.n_input_channels = 2 73 | opt.mean = opt.mean[:2] 74 | opt.std = opt.std[:2] 75 | 76 | if opt.distributed: 77 | opt.dist_rank = 0 #int(os.environ["OMPI_COMM_WORLD_RANK"]) 78 | 79 | if opt.dist_rank == 0: 80 | print(opt) 81 | with (opt.result_path / 'opts.json').open('w') as opt_file: 82 | json.dump(vars(opt), opt_file, default=json_serial) 83 | else: 84 | print(opt) 85 | with (opt.result_path / 'opts.json').open('w') as opt_file: 86 | json.dump(vars(opt), opt_file, default=json_serial) 87 | 88 | return opt 89 | 90 | 91 | def resume_model(resume_path, arch, model): 92 | print('loading checkpoint {} model'.format(resume_path)) 93 | checkpoint = torch.load(resume_path, map_location='cpu') 94 | assert arch == checkpoint['arch'] 95 | 96 | if hasattr(model, 'module'): 97 | model.module.load_state_dict(checkpoint['state_dict']) 98 | else: 99 | 
model.load_state_dict(checkpoint['state_dict']) 100 | 101 | return model 102 | 103 | 104 | def resume_train_utils(resume_path, begin_epoch, optimizer, scheduler): 105 | print('loading checkpoint {} train utils'.format(resume_path)) 106 | checkpoint = torch.load(resume_path, map_location='cpu') 107 | 108 | begin_epoch = checkpoint['epoch'] + 1 109 | if optimizer is not None and 'optimizer' in checkpoint: 110 | optimizer.load_state_dict(checkpoint['optimizer']) 111 | if scheduler is not None and 'scheduler' in checkpoint: 112 | scheduler.load_state_dict(checkpoint['scheduler']) 113 | 114 | return begin_epoch, optimizer, scheduler 115 | 116 | 117 | def get_normalize_method(mean, std, no_mean_norm, no_std_norm): 118 | if no_mean_norm: 119 | if no_std_norm: 120 | return Normalize([0, 0, 0], [1, 1, 1]) 121 | else: 122 | return Normalize([0, 0, 0], std) 123 | else: 124 | if no_std_norm: 125 | return Normalize(mean, [1, 1, 1]) 126 | else: 127 | return Normalize(mean, std) 128 | 129 | 130 | def get_train_utils(opt, model_parameters): 131 | assert opt.train_crop in ['random', 'corner', 'center'] 132 | spatial_transform = [] 133 | if opt.train_crop == 'random': 134 | spatial_transform.append( 135 | RandomResizedCrop( 136 | opt.sample_size, (opt.train_crop_min_scale, 1.0), 137 | (opt.train_crop_min_ratio, 1.0 / opt.train_crop_min_ratio))) 138 | elif opt.train_crop == 'corner': 139 | scales = [1.0] 140 | scale_step = 1 / (2**(1 / 4)) 141 | for _ in range(1, 5): 142 | scales.append(scales[-1] * scale_step) 143 | spatial_transform.append(MultiScaleCornerCrop(opt.sample_size, scales)) 144 | elif opt.train_crop == 'center': 145 | spatial_transform.append(Resize(opt.sample_size)) 146 | spatial_transform.append(CenterCrop(opt.sample_size)) 147 | normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm, 148 | opt.no_std_norm) 149 | if not opt.no_hflip: 150 | spatial_transform.append(RandomHorizontalFlip()) 151 | if opt.colorjitter: 152 | spatial_transform.append(ColorJitter()) 153 | spatial_transform.append(ToTensor()) 154 | if opt.input_type == 'flow': 155 | spatial_transform.append(PickFirstChannels(n=2)) 156 | spatial_transform.append(ScaleValue(opt.value_scale)) 157 | spatial_transform.append(normalize) 158 | spatial_transform = Compose(spatial_transform) 159 | 160 | assert opt.train_t_crop in ['random', 'center'] 161 | temporal_transform = [] 162 | if opt.sample_t_stride > 1: 163 | temporal_transform.append(TemporalSubsampling(opt.sample_t_stride)) 164 | if opt.train_t_crop == 'random': 165 | temporal_transform.append(TemporalRandomCrop(opt.sample_duration)) 166 | elif opt.train_t_crop == 'center': 167 | temporal_transform.append(TemporalCenterCrop(opt.sample_duration)) 168 | temporal_transform = TemporalCompose(temporal_transform) 169 | 170 | train_data = get_training_data(opt.video_path, opt.annotation_path, 171 | opt.dataset, opt.input_type, opt.file_type, 172 | spatial_transform, temporal_transform) 173 | if opt.distributed: 174 | train_sampler = torch.utils.data.distributed.DistributedSampler( 175 | train_data) 176 | else: 177 | train_sampler = None 178 | train_loader = torch.utils.data.DataLoader(train_data, 179 | batch_size=opt.batch_size, 180 | shuffle=(train_sampler is None), 181 | num_workers=opt.n_threads, 182 | pin_memory=True, 183 | sampler=train_sampler, 184 | worker_init_fn=worker_init_fn) 185 | 186 | if opt.is_master_node: 187 | train_logger = Logger(opt.result_path / 'train.log', 188 | ['epoch', 'loss', 'acc', 'lr']) 189 | train_batch_logger = Logger( 190 | opt.result_path / 
'train_batch.log', 191 | ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr']) 192 | else: 193 | train_logger = None 194 | train_batch_logger = None 195 | 196 | if opt.nesterov: 197 | dampening = 0 198 | else: 199 | dampening = opt.dampening 200 | optimizer = SGD(model_parameters, 201 | lr=opt.learning_rate, 202 | momentum=opt.momentum, 203 | dampening=dampening, 204 | weight_decay=opt.weight_decay, 205 | nesterov=opt.nesterov) 206 | 207 | assert opt.lr_scheduler in ['plateau', 'multistep'] 208 | assert not (opt.lr_scheduler == 'plateau' and opt.no_val) 209 | if opt.lr_scheduler == 'plateau': 210 | scheduler = lr_scheduler.ReduceLROnPlateau( 211 | optimizer, 'min', patience=opt.plateau_patience) 212 | else: 213 | scheduler = lr_scheduler.MultiStepLR(optimizer, 214 | opt.multistep_milestones) 215 | 216 | return (train_loader, train_sampler, train_logger, train_batch_logger, 217 | optimizer, scheduler) 218 | 219 | 220 | def get_val_utils(opt): 221 | normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm, 222 | opt.no_std_norm) 223 | spatial_transform = [ 224 | Resize(opt.sample_size), 225 | CenterCrop(opt.sample_size), 226 | ToTensor() 227 | ] 228 | if opt.input_type == 'flow': 229 | spatial_transform.append(PickFirstChannels(n=2)) 230 | spatial_transform.extend([ScaleValue(opt.value_scale), normalize]) 231 | spatial_transform = Compose(spatial_transform) 232 | 233 | temporal_transform = [] 234 | if opt.sample_t_stride > 1: 235 | temporal_transform.append(TemporalSubsampling(opt.sample_t_stride)) 236 | temporal_transform.append( 237 | TemporalEvenCrop(opt.sample_duration, opt.n_val_samples)) 238 | temporal_transform = TemporalCompose(temporal_transform) 239 | 240 | val_data, collate_fn = get_validation_data(opt.video_path, 241 | opt.annotation_path, opt.dataset, 242 | opt.input_type, opt.file_type, 243 | spatial_transform, 244 | temporal_transform) 245 | if opt.distributed: 246 | val_sampler = torch.utils.data.distributed.DistributedSampler( 247 | val_data, shuffle=False) 248 | else: 249 | val_sampler = None 250 | val_loader = torch.utils.data.DataLoader(val_data, 251 | # batch_size=opt.batch_size, 252 | batch_size=(opt.batch_size // 253 | opt.n_val_samples), 254 | shuffle=False, 255 | num_workers=opt.n_threads, 256 | pin_memory=True, 257 | sampler=val_sampler, 258 | worker_init_fn=worker_init_fn, 259 | collate_fn=collate_fn) 260 | 261 | if opt.is_master_node: 262 | val_logger = Logger(opt.result_path / 'val.log', 263 | ['epoch', 'loss', 'acc']) 264 | else: 265 | val_logger = None 266 | 267 | return val_loader, val_logger 268 | 269 | 270 | def get_inference_utils(opt): 271 | assert opt.inference_crop in ['center', 'nocrop'] 272 | 273 | normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm, 274 | opt.no_std_norm) 275 | 276 | spatial_transform = [Resize(opt.sample_size)] 277 | if opt.inference_crop == 'center': 278 | spatial_transform.append(CenterCrop(opt.sample_size)) 279 | spatial_transform.append(ToTensor()) 280 | if opt.input_type == 'flow': 281 | spatial_transform.append(PickFirstChannels(n=2)) 282 | spatial_transform.extend([ScaleValue(opt.value_scale), normalize]) 283 | spatial_transform = Compose(spatial_transform) 284 | 285 | temporal_transform = [] 286 | if opt.sample_t_stride > 1: 287 | temporal_transform.append(TemporalSubsampling(opt.sample_t_stride)) 288 | temporal_transform.append( 289 | SlidingWindow(opt.sample_duration, opt.inference_stride)) 290 | temporal_transform = TemporalCompose(temporal_transform) 291 | 292 | inference_data, collate_fn = 
get_inference_data( 293 | opt.video_path, opt.annotation_path, opt.dataset, opt.input_type, 294 | opt.file_type, opt.inference_subset, spatial_transform, 295 | temporal_transform) 296 | 297 | inference_loader = torch.utils.data.DataLoader( 298 | inference_data, 299 | batch_size=opt.inference_batch_size, 300 | shuffle=False, 301 | num_workers=opt.n_threads, 302 | pin_memory=True, 303 | worker_init_fn=worker_init_fn, 304 | collate_fn=collate_fn) 305 | 306 | return inference_loader, inference_data.class_names 307 | 308 | 309 | def save_checkpoint(save_file_path, epoch, arch, model, optimizer, scheduler): 310 | if hasattr(model, 'module'): 311 | model_state_dict = model.module.state_dict() 312 | else: 313 | model_state_dict = model.state_dict() 314 | save_states = { 315 | 'epoch': epoch, 316 | 'arch': arch, 317 | 'state_dict': model_state_dict, 318 | 'optimizer': optimizer.state_dict(), 319 | 'scheduler': scheduler.state_dict() 320 | } 321 | torch.save(save_states, save_file_path) 322 | 323 | 324 | def main_worker(index, opt): 325 | random.seed(opt.manual_seed) 326 | np.random.seed(opt.manual_seed) 327 | torch.manual_seed(opt.manual_seed) 328 | 329 | if index >= 0 and opt.device.type == 'cuda': 330 | # opt.device = torch.device(f'cuda:{index}') 331 | opt.device = torch.device('cuda:{}'.format(index)) 332 | 333 | if opt.distributed: 334 | opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index 335 | dist.init_process_group(backend='nccl', 336 | init_method=opt.dist_url, 337 | world_size=opt.world_size, 338 | rank=opt.dist_rank) 339 | opt.batch_size = int(opt.batch_size / opt.ngpus_per_node) 340 | opt.n_threads = int( 341 | (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node) 342 | opt.is_master_node = not opt.distributed or opt.dist_rank == 0 343 | 344 | model = generate_model(opt) 345 | if opt.batchnorm_sync: 346 | assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.' 
347 | model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) 348 | if opt.pretrain_path: 349 | model = load_pretrained_model(model, opt.pretrain_path, opt.model, 350 | opt.n_finetune_classes, opt.strg) 351 | 352 | if opt.strg: 353 | model = STRG(model, nclass=opt.n_classes, nrois=opt.nrois) 354 | rpn = RPN(nrois=opt.nrois) 355 | rpn = make_data_parallel(rpn, opt.distributed, opt.device) 356 | else: 357 | rpn = None 358 | 359 | if opt.resume_path is not None: 360 | model = resume_model(opt.resume_path, opt.arch, model) 361 | 362 | model = make_data_parallel(model, opt.distributed, opt.device) 363 | 364 | # if opt.pretrain_path: 365 | # parameters = get_fine_tuning_parameters(model, opt.ft_begin_module) 366 | # else: 367 | parameters = model.parameters() 368 | 369 | if opt.is_master_node: 370 | print(model) 371 | 372 | criterion = CrossEntropyLoss().to(opt.device) 373 | 374 | if not opt.no_train: 375 | (train_loader, train_sampler, train_logger, train_batch_logger, 376 | optimizer, scheduler) = get_train_utils(opt, parameters) 377 | if opt.resume_path is not None: 378 | opt.begin_epoch, optimizer, scheduler = resume_train_utils( 379 | opt.resume_path, opt.begin_epoch, optimizer, scheduler) 380 | if opt.overwrite_milestones: 381 | scheduler.milestones = opt.multistep_milestones 382 | if not opt.no_val: 383 | val_loader, val_logger = get_val_utils(opt) 384 | 385 | if opt.tensorboard and opt.is_master_node: 386 | #from torch.utils.tensorboard import SummaryWriter 387 | from tensorboardX import SummaryWriter 388 | if opt.begin_epoch == 1: 389 | tb_writer = SummaryWriter(log_dir=opt.result_path) 390 | else: 391 | tb_writer = SummaryWriter(log_dir=opt.result_path, 392 | purge_step=opt.begin_epoch) 393 | else: 394 | tb_writer = None 395 | 396 | if opt.wandb: 397 | name = str(opt.result_path) 398 | wandb.init( 399 | project='strg', 400 | name=name, 401 | config=opt, 402 | dir= name, 403 | # resume=str(opt.resume_path) != '', 404 | sync_tensorboard=True) 405 | 406 | 407 | 408 | prev_val_loss = None 409 | for i in range(opt.begin_epoch, opt.n_epochs + 1): 410 | if not opt.no_train: 411 | if opt.distributed: 412 | train_sampler.set_epoch(i) 413 | current_lr = get_lr(optimizer) 414 | train_epoch(i, train_loader, model, criterion, optimizer, 415 | opt.device, current_lr, train_logger, 416 | train_batch_logger, tb_writer, opt.distributed,rpn=rpn, 417 | det_interval=opt.det_interval, nrois=opt.nrois) 418 | 419 | if i % opt.checkpoint == 0 and opt.is_master_node: 420 | save_file_path = opt.result_path / 'save_{}.pth'.format(i) 421 | save_checkpoint(save_file_path, i, opt.arch, model, optimizer, 422 | scheduler) 423 | 424 | if not opt.no_val: 425 | prev_val_loss = val_epoch(i, val_loader, model, criterion, 426 | opt.device, val_logger, tb_writer, 427 | opt.distributed, rpn=rpn, 428 | det_interval=opt.det_interval, nrois=opt.nrois) 429 | 430 | if not opt.no_train and opt.lr_scheduler == 'multistep': 431 | scheduler.step() 432 | elif not opt.no_train and opt.lr_scheduler == 'plateau': 433 | scheduler.step(prev_val_loss) 434 | 435 | if opt.inference: 436 | inference_loader, inference_class_names = get_inference_utils(opt) 437 | inference_result_path = opt.result_path / '{}.json'.format( 438 | opt.inference_subset) 439 | 440 | inference.inference(inference_loader, model, inference_result_path, 441 | inference_class_names, opt.inference_no_average, 442 | opt.output_topk) 443 | 444 | 445 | if __name__ == '__main__': 446 | opt = get_opt() 447 | 448 | opt.device = torch.device('cpu' if opt.no_cuda else 'cuda') 
449 | if not opt.no_cuda: 450 | cudnn.benchmark = True 451 | if opt.accimage: 452 | torchvision.set_image_backend('accimage') 453 | 454 | opt.ngpus_per_node = torch.cuda.device_count() 455 | if opt.distributed: 456 | opt.world_size = opt.ngpus_per_node * opt.world_size 457 | mp.spawn(main_worker, nprocs=opt.ngpus_per_node, args=(opt,)) 458 | else: 459 | main_worker(-1, opt) 460 | --------------------------------------------------------------------------------
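Note: the following is a minimal sketch, not a file in this repository, of how a checkpoint written by save_checkpoint() in main.py can be reloaded for offline use. The script name and the checkpoint path are illustrative only, and it assumes the defaults from parse_opts(), plus the two fields derived below, give generate_model() everything it needs.

# checkpoint_demo.py -- hypothetical helper, not part of the repository.
# Rebuilds the model from the parsed options and restores weights the same
# way resume_model() in main.py does.
import torch

from opts import parse_opts
from model import generate_model

opt = parse_opts()
# get_opt() in main.py derives these fields before building the model;
# they are mirrored here because parse_opts() alone does not set them.
opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
opt.n_input_channels = 2 if opt.input_type == 'flow' else 3

model = generate_model(opt)

# save_checkpoint() writes files named save_<epoch>.pth into opt.result_path;
# the concrete path below is only an example.
checkpoint = torch.load('results/save_200.pth', map_location='cpu')
assert opt.arch == checkpoint['arch']  # same guard as resume_model()

# save_checkpoint() stores the state dict without the DataParallel
# 'module.' prefix, so it loads directly into the bare model.
model.load_state_dict(checkpoint['state_dict'])
model.eval()
print('restored weights from epoch', checkpoint['epoch'])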