├── mmaction ├── core │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ └── dist_utils.py │ └── evaluation │ │ ├── __init__.py │ │ ├── accuracy.py │ │ └── eval_hooks.py ├── models │ ├── tenons │ │ ├── necks │ │ │ └── __init__.py │ │ ├── cls_heads │ │ │ ├── __init__.py │ │ │ └── cls_head.py │ │ ├── segmental_consensuses │ │ │ ├── TODO.md │ │ │ ├── __init__.py │ │ │ ├── simple_consensus.py │ │ │ └── stpp.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── nonlocal_block.py │ │ │ ├── norm.py │ │ │ └── conv_module.py │ │ ├── backbones │ │ │ └── __init__.py │ │ └── spatial_temporal_modules │ │ │ ├── __init__.py │ │ │ ├── simple_spatial_module.py │ │ │ ├── simple_spatial_temporal_module.py │ │ │ ├── avgfusion.py │ │ │ └── non_local.py │ ├── recognizers │ │ ├── __init__.py │ │ ├── base.py │ │ ├── TSN3D.py │ │ └── TSN2D.py │ ├── __init__.py │ ├── registry.py │ └── builder.py ├── __init__.py ├── apis │ ├── __init__.py │ ├── env.py │ └── train.py ├── datasets │ ├── loader │ │ ├── __init__.py │ │ ├── build_loader.py │ │ └── sampler.py │ └── __init__.py ├── losses │ ├── __init__.py │ └── losses.py ├── README.md └── utils │ └── misc.py ├── demo ├── demo_pred.gif └── category.txt ├── docs ├── figures │ ├── empirical.png │ ├── exp_result.png │ └── framework.png ├── assets │ ├── font.css │ └── style.css └── index.html ├── .style.yapf ├── tools ├── dist_train_recognizer.sh ├── dist_test_recognizer.sh ├── extract_backbone_weights.py ├── train_recognizer.py ├── README.md └── test_recognizer.py ├── INSTALL.md ├── data └── README.md ├── .gitignore ├── README.md ├── setup.py ├── config_files ├── sthv1 │ ├── tsm_baseline.py │ └── tsm_tpn.py ├── sthv2 │ ├── tsm_baseline.py │ └── tsm_tpn.py └── kinetics400 │ ├── baseline │ ├── r101f16s4.py │ ├── r101f8s8.py │ ├── r50f8s8.py │ ├── r101f32s2.py │ ├── r50f16s4.py │ └── r50f32s2.py │ └── tpn │ ├── r50f8s8.py │ ├── r101f16s4.py │ ├── r101f8s8.py │ ├── r50f16s4.py │ ├── r101f32s2.py │ └── r50f32s2.py ├── MODELZOO.md └── test_video.py /mmaction/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluation import * 2 | from .utils import * 3 | -------------------------------------------------------------------------------- /demo/demo_pred.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decisionforce/TPN/HEAD/demo/demo_pred.gif -------------------------------------------------------------------------------- /mmaction/models/tenons/necks/__init__.py: -------------------------------------------------------------------------------- 1 | from .tpn import TPN 2 | 3 | __all__ = ['TPN'] 4 | -------------------------------------------------------------------------------- /docs/figures/empirical.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decisionforce/TPN/HEAD/docs/figures/empirical.png -------------------------------------------------------------------------------- /docs/figures/exp_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decisionforce/TPN/HEAD/docs/figures/exp_result.png -------------------------------------------------------------------------------- /docs/figures/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decisionforce/TPN/HEAD/docs/figures/framework.png 
-------------------------------------------------------------------------------- /mmaction/__init__.py: -------------------------------------------------------------------------------- 1 | from .version import __version__, short_version 2 | 3 | __all__ = ['__version__', 'short_version'] 4 | -------------------------------------------------------------------------------- /mmaction/models/tenons/cls_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .cls_head import ClsHead 2 | 3 | __all__ = [ 4 | 'ClsHead', 5 | ] 6 | -------------------------------------------------------------------------------- /mmaction/models/tenons/segmental_consensuses/TODO.md: -------------------------------------------------------------------------------- 1 | ### TODO 2 | 3 | [x] SimpleConsensus 4 | 5 | [ ] STPP 6 | 7 | [ ] TRN 8 | -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | BASED_ON_STYLE = pep8 3 | BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true 4 | SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true 5 | -------------------------------------------------------------------------------- /mmaction/core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .dist_utils import allreduce_grads, DistOptimizerHook 2 | 3 | __all__ = [ 4 | 'allreduce_grads', 'DistOptimizerHook', 5 | ] 6 | -------------------------------------------------------------------------------- /mmaction/models/tenons/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .conv_module import ConvModule 2 | from .norm import build_norm_layer 3 | 4 | __all__ = [ 5 | 'ConvModule', 'build_norm_layer', 6 | ] 7 | -------------------------------------------------------------------------------- /mmaction/models/tenons/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_slow import ResNet_SlowFast 2 | from .resnet import ResNet 3 | 4 | __all__ = [ 5 | 'ResNet_SlowFast', 6 | 'ResNet' 7 | ] 8 | -------------------------------------------------------------------------------- /mmaction/models/recognizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseRecognizer 2 | from .TSN2D import TSN2D 3 | from .TSN3D import TSN3D 4 | 5 | __all__ = [ 6 | 'BaseRecognizer', 'TSN2D', 'TSN3D', 7 | ] 8 | -------------------------------------------------------------------------------- /mmaction/core/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .eval_hooks import (DistEvalHook, DistEvalTopKAccuracyHook, 2 | ) 3 | 4 | __all__ = [ 5 | 'DistEvalHook', 'DistEvalTopKAccuracyHook', 6 | ] 7 | -------------------------------------------------------------------------------- /tools/dist_train_recognizer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PYTHON=${PYTHON:-"python"} 4 | 5 | $PYTHON -m torch.distributed.launch --nproc_per_node=$2 $(dirname "$0")/train_recognizer.py $1 --launcher pytorch ${@:3} 6 | -------------------------------------------------------------------------------- /mmaction/apis/__init__.py: -------------------------------------------------------------------------------- 1 | from .env import 
init_dist, get_root_logger, set_random_seed 2 | from .train import train_network 3 | 4 | __all__ = [ 5 | 'init_dist', 'get_root_logger', 'set_random_seed', 6 | 'train_network', 7 | ] 8 | -------------------------------------------------------------------------------- /mmaction/datasets/loader/__init__.py: -------------------------------------------------------------------------------- 1 | from .build_loader import build_dataloader 2 | from .sampler import GroupSampler, DistributedGroupSampler 3 | 4 | __all__ = [ 5 | 'GroupSampler', 'DistributedGroupSampler', 'build_dataloader' 6 | ] 7 | -------------------------------------------------------------------------------- /tools/dist_test_recognizer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PYTHON=${PYTHON:-"python"} 4 | 5 | CONFIG=$1 6 | CHECKPOINT=$2 7 | GPUS=$3 8 | 9 | $PYTHON -m torch.distributed.launch --nproc_per_node=$GPUS \ 10 | $(dirname "$0")/test_recognizer.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} 11 | -------------------------------------------------------------------------------- /mmaction/models/tenons/utils/nonlocal_block.py: -------------------------------------------------------------------------------- 1 | from ..spatial_temporal_modules.non_local import NonLocalModule 2 | 3 | 4 | def build_nonlocal_block(cfg): 5 | """ Build nonlocal block 6 | 7 | Args: 8 | """ 9 | assert isinstance(cfg, dict) 10 | cfg_ = cfg.copy() 11 | return NonLocalModule(**cfg_) 12 | -------------------------------------------------------------------------------- /mmaction/models/tenons/segmental_consensuses/__init__.py: -------------------------------------------------------------------------------- 1 | from .simple_consensus import SimpleConsensus 2 | from .stpp import parse_stage_config 3 | from .stpp import StructuredTemporalPyramidPooling 4 | 5 | __all__ = [ 6 | 'SimpleConsensus', 7 | 'StructuredTemporalPyramidPooling', 8 | 'parse_stage_config' 9 | ] 10 | -------------------------------------------------------------------------------- /mmaction/models/tenons/spatial_temporal_modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .simple_spatial_module import SimpleSpatialModule 2 | from .simple_spatial_temporal_module import SimpleSpatialTemporalModule 3 | from .avgfusion import AvgFusion 4 | 5 | __all__ = [ 6 | 'SimpleSpatialModule', 7 | 'SimpleSpatialTemporalModule', 8 | 'AvgFusion' 9 | ] 10 | -------------------------------------------------------------------------------- /tools/extract_backbone_weights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import collections 4 | 5 | model = torch.load(sys.argv[1]) 6 | 7 | weight = model['state_dict'] 8 | 9 | out = collections.OrderedDict() 10 | for k, v in weight.items(): 11 | name = k.replace('backbone.', '').replace('cls_head.', '') 12 | out[name] = v.cpu() 13 | print(name) 14 | 15 | torch.save(out, sys.argv[2]) 16 | -------------------------------------------------------------------------------- /mmaction/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .rawframes_dataset import RawFramesDataset 2 | from .utils import get_untrimmed_dataset, get_trimmed_dataset 3 | from .loader import GroupSampler, DistributedGroupSampler, build_dataloader 4 | 5 | __all__ = [ 6 | 'RawFramesDataset', 7 | 'get_trimmed_dataset', 
'get_untrimmed_dataset', 8 | 'GroupSampler', 'DistributedGroupSampler', 'build_dataloader' 9 | ] 10 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ```shell 4 | git clone https://github.com/decisionforce/TPN.git 5 | ``` 6 | 7 | ## Requirements 8 | 9 | - Linux 10 | - Python 3.5+ 11 | - PyTorch 1.0+ 12 | - CUDA 9.0+ 13 | - NCCL 2+ 14 | - GCC 4.9+ 15 | - mmcv 0.2.10 16 | 17 | ## Install MMAction 18 | (a) Install Cython 19 | ```shell 20 | pip install cython 21 | ``` 22 | (b) Install mmaction 23 | ```shell 24 | python setup.py develop 25 | ``` 26 | 27 | -------------------------------------------------------------------------------- /mmaction/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .losses import ( 2 | weighted_nll_loss, weighted_cross_entropy, weighted_binary_cross_entropy, 3 | weighted_smoothl1, accuracy, 4 | weighted_multilabel_binary_cross_entropy, 5 | multilabel_accuracy) 6 | 7 | __all__ = [ 8 | 'weighted_nll_loss', 'weighted_cross_entropy', 9 | 'weighted_binary_cross_entropy', 10 | 'weighted_smoothl1', 'accuracy', 11 | 'weighted_multilabel_binary_cross_entropy', 12 | 'multilabel_accuracy', 13 | 14 | ] 15 | -------------------------------------------------------------------------------- /mmaction/README.md: -------------------------------------------------------------------------------- 1 | # mmaction 2 | 3 | This code is based on [MMAction](https://github.com/open-mmlab/mmaction), which supports modular design and high efficiency. Our TPN will be merged into the latest MMAction in the future. 4 | 5 | Here we briefly introduce the structure of this codebase: 6 | 7 | - `apis`: contains the launcher of the whole codebase and the initializer of the distributed training environment. 8 | - `core`: contains multiple hooks for evaluation, e.g., calculating Top-1/Top-5 accuracy. 9 | - `datasets`: contains `rawframes_dataset` and the data transforms used for training. 10 | - `losses`: contains various cross-entropy losses. 11 | - `models`: contains recognizers and various network submodules, e.g., *backbone*, *neck*, and *head*, under `models/tenons`. 12 | 13 | Such modular design makes it quick and easy to conduct experiments with different modules.
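As a minimal sketch of how this modular design is used (the builder and registry live in `models/builder.py` and `models/registry.py`; the config dict below mirrors the `spatial_temporal_module` entry of `config_files/sthv1/tsm_baseline.py`):

```python
# Minimal sketch of the registry/builder pattern used throughout this codebase.
# Tenon classes register themselves into a registry (e.g. SPATIAL_TEMPORAL_MODULES)
# with the register_module decorator, and config files refer to them by class name.
from mmaction.models import build_spatial_temporal_module

cfg = dict(
    type='SimpleSpatialModule',  # registered class name, looked up in the registry
    spatial_type='avg',
    spatial_size=7)

# The builder pops 'type', fetches the class from the registry's module_dict,
# and instantiates it with the remaining keys as keyword arguments.
module = build_spatial_temporal_module(cfg)
```

Swapping one module for another (e.g. a different backbone, neck, or head) therefore only requires changing the corresponding `type` and its arguments in the config file.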
14 | -------------------------------------------------------------------------------- /mmaction/models/tenons/spatial_temporal_modules/simple_spatial_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ...registry import SPATIAL_TEMPORAL_MODULES 5 | 6 | 7 | @SPATIAL_TEMPORAL_MODULES.register_module 8 | class SimpleSpatialModule(nn.Module): 9 | def __init__(self, spatial_type='avg', spatial_size=7): 10 | super(SimpleSpatialModule, self).__init__() 11 | 12 | assert spatial_type in ['avg'] 13 | self.spatial_type = spatial_type 14 | 15 | self.spatial_size = spatial_size if not isinstance(spatial_size, int) else (spatial_size, spatial_size) 16 | 17 | if self.spatial_type == 'avg': 18 | self.op = nn.AvgPool2d(self.spatial_size, stride=1, padding=0) 19 | 20 | def init_weights(self): 21 | pass 22 | 23 | def forward(self, input): 24 | return self.op(input) 25 | -------------------------------------------------------------------------------- /mmaction/models/tenons/spatial_temporal_modules/simple_spatial_temporal_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ...registry import SPATIAL_TEMPORAL_MODULES 5 | 6 | 7 | @SPATIAL_TEMPORAL_MODULES.register_module 8 | class SimpleSpatialTemporalModule(nn.Module): 9 | def __init__(self, spatial_type='avg', spatial_size=7, temporal_size=1): 10 | super(SimpleSpatialTemporalModule, self).__init__() 11 | 12 | assert spatial_type in ['avg'] 13 | self.spatial_type = spatial_type 14 | 15 | self.spatial_size = spatial_size if not isinstance(spatial_size, int) else (spatial_size, spatial_size) 16 | self.temporal_size = temporal_size 17 | self.pool_size = (self.temporal_size,) + self.spatial_size 18 | 19 | if self.spatial_type == 'avg': 20 | self.op = nn.AvgPool3d(self.pool_size, stride=1, padding=0) 21 | 22 | def init_weights(self): 23 | pass 24 | 25 | def forward(self, input): 26 | return self.op(input) 27 | -------------------------------------------------------------------------------- /mmaction/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tenons.backbones import * 2 | from .tenons.spatial_temporal_modules import * 3 | from .tenons.segmental_consensuses import * 4 | from .tenons.cls_heads import * 5 | from .recognizers import * 6 | from .tenons.necks import * 7 | 8 | from .registry import (BACKBONES, SPATIAL_TEMPORAL_MODULES, SEGMENTAL_CONSENSUSES, HEADS, 9 | RECOGNIZERS, LOCALIZERS, DETECTORS, ARCHITECTURES, 10 | NECKS, ROI_EXTRACTORS) 11 | from .builder import (build_backbone, build_spatial_temporal_module, build_segmental_consensus, 12 | build_head, build_recognizer, build_detector, 13 | build_localizer, build_architecture, 14 | build_neck, build_roi_extractor) 15 | 16 | __all__ = [ 17 | 'BACKBONES', 'SPATIAL_TEMPORAL_MODULES', 'SEGMENTAL_CONSENSUSES', 'HEADS', 18 | 'RECOGNIZERS', 'LOCALIZERS', 'DETECTORS', 'ARCHITECTURES', 19 | 'NECKS', 'ROI_EXTRACTORS', 20 | 'build_backbone', 'build_spatial_temporal_module', 'build_segmental_consensus', 21 | 'build_head', 'build_recognizer', 'build_detector', 22 | 'build_localizer', 'build_architecture', 23 | 'build_neck', 'build_roi_extractor' 24 | ] 25 | -------------------------------------------------------------------------------- /mmaction/models/recognizers/base.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | from abc import ABCMeta, abstractmethod 3 | 4 | import torch.nn as nn 5 | 6 | 7 | class BaseRecognizer(nn.Module): 8 | """Base class for recognizers""" 9 | 10 | __metaclass__ = ABCMeta 11 | 12 | def __init__(self): 13 | super(BaseRecognizer, self).__init__() 14 | 15 | @property 16 | def with_tenon_list(self): 17 | return hasattr(self, 'tenon_list') and self.tenon_list is not None 18 | 19 | @property 20 | def with_cls(self): 21 | return hasattr(self, 'cls_head') and self.cls_head is not None 22 | 23 | @abstractmethod 24 | def forward_train(self, num_modalities, **kwargs): 25 | pass 26 | 27 | @abstractmethod 28 | def forward_test(self, num_modalities, **kwargs): 29 | pass 30 | 31 | def init_weights(self, pretrained=None): 32 | if pretrained is not None: 33 | logger = logging.getLogger() 34 | logger.info("load model from: {}".format(pretrained)) 35 | 36 | def forward(self, num_modalities, img_meta, return_loss=True, **kwargs): 37 | num_modalities = int(num_modalities[0]) 38 | if return_loss: 39 | return self.forward_train(num_modalities, img_meta, **kwargs) 40 | else: 41 | return self.forward_test(num_modalities, img_meta, **kwargs) 42 | -------------------------------------------------------------------------------- /mmaction/core/evaluation/accuracy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import confusion_matrix 3 | 4 | 5 | def softmax(x, dim=1): 6 | """Compute softmax values for each sets of scores in x.""" 7 | e_x = np.exp(x - np.max(x, axis=dim, keepdims=True)) 8 | return e_x / e_x.sum(axis=dim, keepdims=True) 9 | 10 | 11 | def mean_class_accuracy(scores, labels): 12 | pred = np.argmax(scores, axis=1) 13 | cf = confusion_matrix(labels, pred).astype(float) 14 | 15 | cls_cnt = cf.sum(axis=1) 16 | cls_hit = np.diag(cf) 17 | 18 | return np.mean(cls_hit / cls_cnt) 19 | 20 | 21 | def non_mean_class_accuracy(scores, labels): 22 | pred = np.argmax(scores, axis=1) 23 | cf = confusion_matrix(labels, pred).astype(float) 24 | 25 | cls_cnt = cf.sum(axis=1) 26 | cls_hit = np.diag(cf) 27 | 28 | return cls_hit / cls_cnt 29 | 30 | 31 | def top_k_acc(score, lb_set, k=3): 32 | idx = np.argsort(score)[-k:] 33 | return len(lb_set.intersection(idx)), len(lb_set) 34 | 35 | 36 | def top_k_hit(score, lb_set, k=3): 37 | idx = np.argsort(score)[-k:] 38 | return len(lb_set.intersection(idx)) > 0, 1 39 | 40 | 41 | def top_k_accuracy(scores, labels, k=(1,)): 42 | res = [] 43 | for kk in k: 44 | hits = [] 45 | for x, y in zip(scores, labels): 46 | y = [y] if isinstance(y, int) else y 47 | hits.append(top_k_hit(x, set(y), k=kk)[0]) 48 | res.append(np.mean(hits)) 49 | return res 50 | -------------------------------------------------------------------------------- /mmaction/models/tenons/segmental_consensuses/simple_consensus.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ...registry import SEGMENTAL_CONSENSUSES 5 | 6 | 7 | class _SimpleConsensus(torch.autograd.Function): 8 | """Simplest segmental consensus module""" 9 | 10 | def __init__(self, 11 | consensus_type='avg', 12 | dim=1): 13 | super(_SimpleConsensus, self).__init__() 14 | 15 | assert consensus_type in ['avg'] 16 | self.consensus_type = consensus_type 17 | self.dim = dim 18 | self.shape = None 19 | 20 | def forward(self, x): 21 | self.shape = 
x.size() 22 | if self.consensus_type == 'avg': 23 | output = x.mean(dim=self.dim, keepdim=True) 24 | else: 25 | output = None 26 | return output 27 | 28 | def backward(self, grad_output): 29 | if self.consensus_type == 'avg': 30 | grad_in = grad_output.expand(self.shape) / float(self.shape[self.dim]) 31 | else: 32 | grad_in = None 33 | return grad_in 34 | 35 | 36 | @SEGMENTAL_CONSENSUSES.register_module 37 | class SimpleConsensus(nn.Module): 38 | def __init__(self, consensus_type, dim=1): 39 | super(SimpleConsensus, self).__init__() 40 | 41 | assert consensus_type in ['avg'] 42 | self.consensus_type = consensus_type 43 | self.dim = dim 44 | 45 | def init_weights(self): 46 | pass 47 | 48 | def forward(self, input): 49 | return _SimpleConsensus(self.consensus_type, self.dim)(input) 50 | -------------------------------------------------------------------------------- /mmaction/models/registry.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class Registry(object): 5 | 6 | def __init__(self, name): 7 | self._name = name 8 | self._module_dict = dict() 9 | 10 | @property 11 | def name(self): 12 | return self._name 13 | 14 | @property 15 | def module_dict(self): 16 | return self._module_dict 17 | 18 | def _register_module(self, module_class): 19 | """Register a module 20 | 21 | Args: 22 | module (:obj:`nn.Module`): Module to be registered. 23 | """ 24 | if not issubclass(module_class, nn.Module): 25 | raise TypeError( 26 | 'module must be a child of nn.Module, but got {}'.format( 27 | module_class)) 28 | module_name = module_class.__name__ 29 | if module_name in self._module_dict: 30 | raise KeyError('{} is already registered in {}'.format( 31 | module_name, self.name)) 32 | self._module_dict[module_name] = module_class 33 | 34 | def register_module(self, cls): 35 | self._register_module(cls) 36 | return cls 37 | 38 | 39 | BACKBONES = Registry('backbone') 40 | FLOWNETS = Registry('flownet') 41 | SPATIAL_TEMPORAL_MODULES = Registry('spatial_temporal_module') 42 | SEGMENTAL_CONSENSUSES = Registry('segmental_consensus') 43 | HEADS = Registry('head') 44 | RECOGNIZERS = Registry('recognizer') 45 | LOCALIZERS = Registry('localizer') 46 | DETECTORS = Registry('detector') 47 | ARCHITECTURES = Registry('architecture') 48 | NECKS = Registry('neck') 49 | ROI_EXTRACTORS = Registry('roi_extractor') 50 | -------------------------------------------------------------------------------- /mmaction/utils/misc.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import numpy as np 3 | import mmcv 4 | 5 | 6 | def rsetattr(obj, attr, val): 7 | ''' 8 | See: 9 | https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects 10 | ''' 11 | pre, _, post = attr.rpartition('.') 12 | return setattr(rgetattr(obj, pre) if pre else obj, post, val) 13 | 14 | 15 | def rgetattr(obj, attr, *args): 16 | def _getattr(obj, attr): 17 | return getattr(obj, attr, *args) 18 | 19 | return functools.reduce(_getattr, [obj] + attr.split('.')) 20 | 21 | 22 | def rhasattr(obj, attr, *args): 23 | def _hasattr(obj, attr): 24 | if hasattr(obj, attr): 25 | return getattr(obj, attr) 26 | else: 27 | return None 28 | 29 | return functools.reduce(_hasattr, [obj] + attr.split('.')) is not None 30 | 31 | 32 | def tensor2video_snaps(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True): 33 | num_videos = tensor.size(0) 34 | num_frames = tensor.size(2) 35 | mean = np.array(mean, dtype=np.float32) 36 | std = 
np.array(std, dtype=np.float32) 37 | video_snaps = [] 38 | for vid_id in range(num_videos): 39 | img = tensor[vid_id, :, num_frames // 40 | 2, ...].cpu().numpy().transpose(1, 2, 0) 41 | img = mmcv.imdenormalize( 42 | img, mean, std, to_bgr=to_rgb).astype(np.uint8) 43 | video_snaps.append(np.ascontiguousarray(img)) 44 | return video_snaps 45 | 46 | 47 | def multi_apply(func, *args, **kwargs): 48 | pfunc = functools.partial(func, **kwargs) if kwargs else func 49 | map_results = map(pfunc, *args) 50 | return tuple(map(list, zip(*map_results))) 51 | -------------------------------------------------------------------------------- /docs/assets/font.css: -------------------------------------------------------------------------------- 1 | /* Homepage Font */ 2 | 3 | /* latin-ext */ 4 | @font-face { 5 | font-family: 'Lato'; 6 | font-style: normal; 7 | font-weight: 400; 8 | src: local('Lato Regular'), local('Lato-Regular'), url(https://fonts.gstatic.com/s/lato/v16/S6uyw4BMUTPHjxAwXjeu.woff2) format('woff2'); 9 | unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; 10 | } 11 | 12 | /* latin */ 13 | @font-face { 14 | font-family: 'Lato'; 15 | font-style: normal; 16 | font-weight: 400; 17 | src: local('Lato Regular'), local('Lato-Regular'), url(https://fonts.gstatic.com/s/lato/v16/S6uyw4BMUTPHjx4wXg.woff2) format('woff2'); 18 | unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; 19 | } 20 | 21 | /* latin-ext */ 22 | @font-face { 23 | font-family: 'Lato'; 24 | font-style: normal; 25 | font-weight: 700; 26 | src: local('Lato Bold'), local('Lato-Bold'), url(https://fonts.gstatic.com/s/lato/v16/S6u9w4BMUTPHh6UVSwaPGR_p.woff2) format('woff2'); 27 | unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; 28 | } 29 | 30 | /* latin */ 31 | @font-face { 32 | font-family: 'Lato'; 33 | font-style: normal; 34 | font-weight: 700; 35 | src: local('Lato Bold'), local('Lato-Bold'), url(https://fonts.gstatic.com/s/lato/v16/S6u9w4BMUTPHh6UVSwiPGQ.woff2) format('woff2'); 36 | unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; 37 | } 38 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | ## Data Preparation 2 | 3 | ### Notes on Video Data Format 4 | Since the original VideoDataloader of MMAction requires [decord](https://github.com/zhreshold/decord) for efficient video loading, which is non-trivial to compile, this repo only supports the **raw frame** format of videos. Therefore, you have to extract frames from the raw videos first. We will look into other libraries and support a VideoLoader soon. 5 | 6 | ### Supported datasets 7 | The `rawframe_dataset` loads data in a general manner: you prepare a `.txt` annotation file in which each line contains the directory path of a video's frames, the total number of frames in that video, and the ground-truth label. After that, specify `data_root` and `image_tmpl` in the config files.
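Below is a minimal sketch of the dataset-related fields of a config file; the values mirror the `train` entry of `config_files/sthv1/tsm_baseline.py` shown later in this repo, except for `data_root`, which is only an assumed example path and must point to wherever your frames were extracted:

```python
# Sketch of the dataset fields in a config file (based on config_files/sthv1/tsm_baseline.py).
dataset_type = 'RawFramesDataset'
data_root = 'data/sthv1/rawframes/'  # assumed location of the extracted frame folders
data = dict(
    train=dict(
        type=dataset_type,
        ann_file='data/sthv1/train_videofolder.txt',  # the .txt annotation file described above
        img_prefix=data_root,
        image_tmpl='{:05d}.jpg'))  # file-name template of the frames inside each folder
```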
A sample annotation file looks like this: 8 | 9 | ```bash 10 | shot_put/c5-PBp04AQI 299 298 11 | marching/5OEnoefcO1Y 299 192 12 | dancing_ballet/pR1jxLvjcgU 249 84 13 | motorcycling/0dC3o90WYHs 299 199 14 | hoverboarding/RVkof6bxvg0 278 157 15 | playing_piano/H3JzOkvTrJk 297 241 16 | ``` 17 | Such a general loader makes it easy to experiment with other datasets, e.g., UCF101 or a custom dataset. 18 | 19 | ### Prepare annotations 20 | 21 | - [Kinetics400](https://deepmind.com/research/open-source/open-source-datasets/kinetics/) contains ~240k training videos and ~19k validation videos. See the [guide](https://github.com/open-mmlab/mmaction/tree/master/data_tools/kinetics400/PREPARING_KINETICS400.md) of the original MMAction to generate the annotations. 22 | - [Something-Something](https://github.com/TwentyBN) has two versions, both of which you have to apply for on their [website](https://20bn.com/datasets/something-something). See the [guide](https://github.com/mit-han-lab/temporal-shift-module/tree/master/tools) of TSM to generate the annotations. 23 | 24 | Thanks to the original [MMAction](https://github.com/open-mmlab/mmaction) and [TSM](https://github.com/mit-han-lab/temporal-shift-module) repos for kindly providing the preprocessing scripts. 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # cython generated cpp 107 | mmaction/version.py 108 | .vscode 109 | .idea 110 | -------------------------------------------------------------------------------- /mmaction/models/tenons/utils/norm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | norm_cfg = { 4 | # format: layer_type: (abbreviation, module) 5 | 'BN': ('bn', nn.BatchNorm2d), 6 | 'SyncBN': ('bn', None), 7 | 'GN': ('gn', nn.GroupNorm), 8 | # and potentially 'SN' 9 | } 10 | 11 | 12 | def build_norm_layer(cfg, num_features, postfix=''): 13 | """ Build normalization layer 14 | Args: 15 | cfg (dict): cfg should contain: 16 | type (str): identify norm layer type. 17 | layer args: args needed to instantiate a norm layer. 18 | frozen (bool): [optional] whether to stop gradient updates 19 | of the norm layer; it is helpful to set frozen mode 20 | in the backbone's norms. 21 | num_features (int): number of channels from input 22 | postfix (int, str): appended to the norm abbreviation to 23 | create a named layer. 24 | Returns: 25 | name (str): abbreviation + postfix 26 | layer (nn.Module): created norm layer 27 | """ 28 | assert isinstance(cfg, dict) and 'type' in cfg 29 | cfg_ = cfg.copy() 30 | 31 | layer_type = cfg_.pop('type') 32 | if layer_type not in norm_cfg: 33 | raise KeyError('Unrecognized norm type {}'.format(layer_type)) 34 | else: 35 | abbr, norm_layer = norm_cfg[layer_type] 36 | if norm_layer is None: 37 | raise NotImplementedError 38 | 39 | assert isinstance(postfix, (int, str)) 40 | name = abbr + str(postfix) 41 | 42 | frozen = cfg_.pop('frozen', False) 43 | cfg_.setdefault('eps', 1e-5) 44 | if layer_type != 'GN': 45 | layer = norm_layer(num_features, **cfg_) 46 | else: 47 | assert 'num_groups' in cfg_ 48 | layer = norm_layer(num_channels=num_features, **cfg_) 49 | 50 | if frozen: 51 | for param in layer.parameters(): 52 | param.requires_grad = False 53 | 54 | return name, layer 55 | -------------------------------------------------------------------------------- /mmaction/models/tenons/spatial_temporal_modules/avgfusion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ...registry import SPATIAL_TEMPORAL_MODULES 5 | 6 | 7 | @SPATIAL_TEMPORAL_MODULES.register_module 8 | class AvgFusion(nn.Module): 9 | def __init__(self, fusion_type='concat'): 10 | super(AvgFusion, self).__init__() 11 | assert fusion_type in ['add', 'avg', 'concat', 'concatadd', 'concatavg'] 12 | self.fusion_type = fusion_type 13 | 14 | def init_weights(self): 15 | pass 16 | 17 | def forward(self, input): 18 | assert (isinstance(input, tuple)) 19 | after_avgpool = [F.adaptive_avg_pool3d(each, 1) for each in input] 20 | 21 | if self.fusion_type == 'add': 22 | out = torch.sum(torch.cat(after_avgpool, -1), -1, keepdim=True) 23 | 24 | elif self.fusion_type == 'avg': 25 | out = torch.mean(torch.cat(after_avgpool, -1), -1, keepdim=True) 26 | 27 | elif self.fusion_type == 'concat': 28 | out = torch.cat(after_avgpool, 1) 29 | 30 | elif self.fusion_type == 'concatadd': 31 | out_first = torch.cat(after_avgpool[:-1], 1) 32 | out = torch.sum(torch.cat([out_first, after_avgpool[-1]], -1), -1, keepdim=True) 33 | elif self.fusion_type == 'concatavg': 34 | out_first = torch.cat(after_avgpool[:-1], 1) 35 | out = torch.mean(torch.cat([out_first, after_avgpool[-1]], -1), -1, keepdim=True) 36 | else: 37 | raise ValueError 38 | 39 | return out 40 | 41 | 42 | def main(): 43 | res2 = torch.FloatTensor(8, 512, 8, 56, 56).cuda() 44 | res3 = torch.FloatTensor(8, 512, 8, 28, 28).cuda() 45 | res4 = torch.FloatTensor(8, 512, 8, 14, 14).cuda() 46 | res5 = torch.FloatTensor(8, 512, 8, 7, 7).cuda() 47 | feature = tuple([res2, res3, res4, res5]) 48 | model = AvgFusion(fusion_type='add').cuda() 49 | out = model(feature) 50 | print(out.shape) 51 | 52 | 53 | if __name__ == '__main__': 54 | main() 55 | -------------------------------------------------------------------------------- /mmaction/core/utils/dist_utils.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch.distributed as dist 4 | from torch._utils import (_flatten_dense_tensors, _unflatten_dense_tensors, 5 | _take_tensors) 6 | from mmcv.runner import OptimizerHook 7 | 8 | 9 | def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1): 10 | if bucket_size_mb > 0: 11 | bucket_size_bytes = bucket_size_mb * 1024 * 1024 12 | buckets = _take_tensors(tensors, bucket_size_bytes) 13 | 
else: 14 | buckets = OrderedDict() 15 | for tensor in tensors: 16 | tp = tensor.type() 17 | if tp not in buckets: 18 | buckets[tp] = [] 19 | buckets[tp].append(tensor) 20 | buckets = buckets.values() 21 | 22 | for bucket in buckets: 23 | flat_tensors = _flatten_dense_tensors(bucket) 24 | dist.all_reduce(flat_tensors) 25 | flat_tensors.div_(world_size) 26 | for tensor, synced in zip( 27 | bucket, _unflatten_dense_tensors(flat_tensors, bucket)): 28 | tensor.copy_(synced) 29 | 30 | 31 | def allreduce_grads(model, coalesce=True, bucket_size_mb=-1): 32 | grads = [ 33 | param.grad.data for param in model.parameters() 34 | if param.requires_grad and param.grad is not None 35 | ] 36 | world_size = dist.get_world_size() 37 | if coalesce: 38 | _allreduce_coalesced(grads, world_size, bucket_size_mb) 39 | else: 40 | for tensor in grads: 41 | dist.all_reduce(tensor.div_(world_size)) 42 | 43 | 44 | class DistOptimizerHook(OptimizerHook): 45 | 46 | def __init__(self, grad_clip=None, coalesce=True, bucket_size_mb=-1): 47 | self.grad_clip = grad_clip 48 | self.coalesce = coalesce 49 | self.bucket_size_mb = bucket_size_mb 50 | 51 | def after_train_iter(self, runner): 52 | runner.optimizer.zero_grad() 53 | runner.outputs['loss'].backward() 54 | allreduce_grads(runner.model, self.coalesce, self.bucket_size_mb) 55 | if self.grad_clip is not None: 56 | self.clip_grads(runner.model.parameters()) 57 | runner.optimizer.step() 58 | -------------------------------------------------------------------------------- /mmaction/apis/env.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | 5 | import numpy as np 6 | import torch 7 | import torch.distributed as dist 8 | import torch.multiprocessing as mp 9 | from mmcv.runner import get_dist_info 10 | import subprocess 11 | 12 | 13 | def init_dist(launcher, backend='nccl', **kwargs): 14 | if mp.get_start_method(allow_none=True) is None: 15 | mp.set_start_method('spawn') 16 | if launcher == 'pytorch': 17 | _init_dist_pytorch(backend, **kwargs) 18 | elif launcher == 'mpi': 19 | _init_dist_mpi(backend, **kwargs) 20 | elif launcher == 'slurm': 21 | _init_dist_slurm(backend, **kwargs) 22 | else: 23 | raise ValueError('Invalid launcher type: {}'.format(launcher)) 24 | 25 | 26 | def _init_dist_pytorch(backend, **kwargs): 27 | # TODO: use local_rank instead of rank % num_gpus 28 | rank = int(os.environ['RANK']) 29 | num_gpus = torch.cuda.device_count() 30 | torch.cuda.set_device(rank % num_gpus) 31 | dist.init_process_group(backend=backend, **kwargs) 32 | 33 | 34 | def _init_dist_mpi(backend, **kwargs): 35 | raise NotImplementedError 36 | 37 | 38 | def _init_dist_slurm(backend, port=12345, **kwargs): 39 | proc_id = int(os.environ['SLURM_PROCID']) 40 | ntasks = int(os.environ['SLURM_NTASKS']) 41 | node_list = os.environ['SLURM_NODELIST'] 42 | num_gpus = torch.cuda.device_count() 43 | torch.cuda.set_device(proc_id % num_gpus) 44 | addr = subprocess.getoutput( 45 | 'scontrol show hostname {} | head -n1'.format(node_list)) 46 | os.environ['MASTER_PORT'] = str(port) 47 | os.environ['MASTER_ADDR'] = addr 48 | os.environ['WORLD_SIZE'] = str(ntasks) 49 | os.environ['RANK'] = str(proc_id) 50 | dist.init_process_group(backend=backend) 51 | # raise NotImplementedError 52 | 53 | 54 | def set_random_seed(seed): 55 | random.seed(seed) 56 | np.random.seed(seed) 57 | torch.manual_seed(seed) 58 | torch.cuda.manual_seed_all(seed) 59 | 60 | 61 | def get_root_logger(log_level=logging.INFO): 62 | logger = 
logging.getLogger() 63 | if not logger.hasHandlers(): 64 | logging.basicConfig( 65 | format='%(asctime)s - %(levelname)s - %(message)s', 66 | level=log_level) 67 | rank, _ = get_dist_info() 68 | if rank != 0: 69 | logger.setLevel('ERROR') 70 | return logger 71 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Temporal Pyramid Network for Action Recognition 2 | 3 |  4 | [[Paper](https://arxiv.org/pdf/2004.03548.pdf)] 5 | [[Project Page](https://decisionforce.github.io/TPN/)] 6 | 7 | 8 | ## License 9 | The project is released under the [Apache 2.0 license](./LICENSE). 10 | 11 | ## Model Zoo 12 | Results and reference models are available in the [model zoo](./MODELZOO.md). 13 | 14 | ## Installation and Data Preparation 15 | Please refer to [INSTALL](INSTALL.md) for installation and [DATA](./data/README.md) for data preparation. 16 | 17 | ## Get Started 18 | Please refer to [GETTING_STARTED](./tools/README.md) for detailed usage. 19 | 20 | ## Quick Demo 21 | We provide `test_video.py` to run inference on a single video. 22 | Download the checkpoints, put them under `ckpt/`, and run: 23 | ``` 24 | python ./test_video.py ${CONFIG_FILE} ${CHECKPOINT_FILE} --video_file ${VIDEO_NAME} --label_file ${LABEL_FILE} --rendered_output ${RENDERED_NAME} 25 | ``` 26 | Arguments: 27 | - `--video_file`: Path of the demo video, default is `./demo/demo.mp4` 28 | - `--label_file`: The label file for the pretrained model, default is `demo/category.txt` 29 | - `--rendered_output`: The output file name. If specified, the script will render the output video with the predicted label name, default is `demo/demo_pred.webm`. 30 | 31 | For example, we can run prediction on the demo video (download it [here](https://drive.google.com/open?id=14VYS8hGA5i1J70qBqrUqLiDxJq_FgXiW) and put it under `demo/`) by running: 32 | ``` 33 | python ./test_video.py config_files/sthv2/tsm_tpn.py ckpt/sthv2_tpn.pth 34 | ``` 35 | The rendered output video: 36 | 37 |  38 | 39 | ## Acknowledgement 40 | We really appreciate the developers of [MMAction](https://github.com/open-mmlab/mmaction) for such a wonderful codebase. We also thank Yue Zhao for the insightful discussion. 41 | 42 | ## Contact 43 | This repo is currently maintained by Ceyuan Yang ([@limbo0000](https://github.com/limbo0000)) and Yinghao Xu ([@justimyhxu](https://github.com/justimyhxu)).
44 | 45 | ## Bibtex 46 | ``` 47 | @inproceedings{yang2020tpn, 48 | title={Temporal Pyramid Network for Action Recognition}, 49 | author={Yang, Ceyuan and Xu, Yinghao and Shi, Jianping and Dai, Bo and Zhou, Bolei}, 50 | booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, 51 | year={2020}, 52 | } 53 | ``` 54 | -------------------------------------------------------------------------------- /mmaction/models/builder.py: -------------------------------------------------------------------------------- 1 | import mmcv 2 | from torch import nn 3 | 4 | from .registry import (BACKBONES, FLOWNETS, SPATIAL_TEMPORAL_MODULES, 5 | SEGMENTAL_CONSENSUSES, HEADS, 6 | RECOGNIZERS, DETECTORS, LOCALIZERS, ARCHITECTURES, 7 | NECKS, ROI_EXTRACTORS) 8 | 9 | 10 | def _build_module(cfg, registry, default_args): 11 | assert isinstance(cfg, dict) and 'type' in cfg 12 | assert isinstance(default_args, dict) or default_args is None 13 | args = cfg.copy() 14 | obj_type = args.pop('type') 15 | if mmcv.is_str(obj_type): 16 | if obj_type not in registry.module_dict: 17 | raise KeyError('{} is not in the {} registry'.format( 18 | obj_type, registry.name)) 19 | obj_type = registry.module_dict[obj_type] 20 | elif not isinstance(obj_type, type): 21 | raise TypeError('type must be a str or valid type, but got {}'.format( 22 | type(obj_type))) 23 | if default_args is not None: 24 | for name, value in default_args.items(): 25 | args.setdefault(name, value) 26 | return obj_type(**args) 27 | 28 | 29 | def build(cfg, registry, default_args=None): 30 | if isinstance(cfg, list): 31 | modules = [_build_module(cfg_, registry, default_args) for cfg_ in cfg] 32 | return nn.Sequential(*modules) 33 | else: 34 | return _build_module(cfg, registry, default_args) 35 | 36 | 37 | def build_backbone(cfg): 38 | return build(cfg, BACKBONES) 39 | 40 | 41 | def build_flownet(cfg): 42 | return build(cfg, FLOWNETS) 43 | 44 | 45 | def build_spatial_temporal_module(cfg): 46 | return build(cfg, SPATIAL_TEMPORAL_MODULES) 47 | 48 | 49 | def build_segmental_consensus(cfg): 50 | return build(cfg, SEGMENTAL_CONSENSUSES) 51 | 52 | 53 | def build_head(cfg): 54 | return build(cfg, HEADS) 55 | 56 | 57 | def build_recognizer(cfg, train_cfg=None, test_cfg=None): 58 | return build(cfg, RECOGNIZERS, 59 | dict(train_cfg=train_cfg, test_cfg=test_cfg)) 60 | 61 | 62 | def build_localizer(cfg, train_cfg=None, test_cfg=None): 63 | return build(cfg, LOCALIZERS, dict(train_cfg=train_cfg, test_cfg=test_cfg)) 64 | 65 | 66 | def build_detector(cfg, train_cfg=None, test_cfg=None): 67 | return build(cfg, DETECTORS, dict(train_cfg=train_cfg, test_cfg=test_cfg)) 68 | 69 | 70 | def build_architecture(cfg, train_cfg=None, test_cfg=None): 71 | return build(cfg, ARCHITECTURES, 72 | dict(train_cfg=train_cfg, test_cfg=test_cfg)) 73 | 74 | 75 | def build_neck(cfg): 76 | return build(cfg, NECKS) 77 | 78 | 79 | def build_roi_extractor(cfg): 80 | return build(cfg, ROI_EXTRACTORS) 81 | -------------------------------------------------------------------------------- /tools/train_recognizer.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import argparse 4 | from mmcv import Config 5 | 6 | from mmaction import __version__ 7 | from mmaction.datasets import get_trimmed_dataset 8 | from mmaction.apis import (train_network, init_dist, get_root_logger, 9 | set_random_seed) 10 | from mmaction.models import build_recognizer 11 | import torch 12 | 13 | 14 | def parse_args(): 15 | 
parser = argparse.ArgumentParser(description='Train an action recognizer') 16 | parser.add_argument('config', help='train config file path') 17 | parser.add_argument('--work_dir', help='the dir to save logs and models') 18 | parser.add_argument( 19 | '--resume_from', help='the checkpoint file to resume from') 20 | parser.add_argument( 21 | '--validate', 22 | action='store_true', 23 | help='whether to evaluate the checkpoint during training') 24 | parser.add_argument( 25 | '--gpus', 26 | type=int, 27 | default=1, 28 | help='number of gpus to use ' 29 | '(only applicable to non-distributed training)') 30 | parser.add_argument('--seed', type=int, default=None, help='random seed') 31 | parser.add_argument( 32 | '--launcher', 33 | choices=['none', 'pytorch', 'slurm', 'mpi'], 34 | default='none', 35 | help='job launcher') 36 | parser.add_argument('--local_rank', type=int, default=0) 37 | args = parser.parse_args() 38 | 39 | return args 40 | 41 | 42 | def main(): 43 | args = parse_args() 44 | 45 | cfg = Config.fromfile(args.config) 46 | # set cudnn_benchmark 47 | if cfg.get('cudnn_benchmark', False): 48 | torch.backends.cudnn.benchmark = True 49 | # update configs according to CLI args 50 | if args.work_dir is not None: 51 | cfg.work_dir = args.work_dir 52 | if args.resume_from is not None: 53 | cfg.resume_from = args.resume_from 54 | cfg.gpus = args.gpus 55 | if cfg.checkpoint_config is not None: 56 | # save mmaction version in checkpoints as meta data 57 | cfg.checkpoint_config.meta = dict( 58 | mmact_version=__version__, config=cfg.text) 59 | 60 | # init distributed env first, since logger depends on the dist info. 61 | if args.launcher == 'none': 62 | distributed = False 63 | else: 64 | distributed = True 65 | init_dist(args.launcher, **cfg.dist_params) 66 | 67 | # init logger before other steps 68 | logger = get_root_logger(cfg.log_level) 69 | logger.info('Distributed training: {}'.format(distributed)) 70 | 71 | # set random seeds 72 | if args.seed is not None: 73 | logger.info('Set random seed to {}'.format(args.seed)) 74 | set_random_seed(args.seed) 75 | 76 | model = build_recognizer( 77 | cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) 78 | 79 | train_dataset = get_trimmed_dataset(cfg.data.train) 80 | train_network( 81 | model, 82 | train_dataset, 83 | cfg, 84 | distributed=distributed, 85 | validate=args.validate, 86 | logger=logger) 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /mmaction/models/tenons/utils/conv_module.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import torch.nn as nn 4 | from mmcv.cnn import kaiming_init, constant_init 5 | 6 | from .norm import build_norm_layer 7 | 8 | 9 | class ConvModule(nn.Module): 10 | 11 | def __init__(self, 12 | in_channels, 13 | out_channels, 14 | kernel_size, 15 | stride=1, 16 | padding=0, 17 | dilation=1, 18 | groups=1, 19 | bias=True, 20 | normalize=None, 21 | activation='relu', 22 | inplace=True, 23 | activate_last=True): 24 | super(ConvModule, self).__init__() 25 | self.with_norm = normalize is not None 26 | self.with_activatation = activation is not None 27 | self.with_bias = bias 28 | self.activation = activation 29 | self.activate_last = activate_last 30 | 31 | if self.with_norm and self.with_bias: 32 | warnings.warn('ConvModule has norm and bias at the same time') 33 | 34 | self.conv = nn.Conv2d( 35 | in_channels, 36 | out_channels, 37 | kernel_size, 38 | stride, 39 
| padding, 40 | dilation, 41 | groups, 42 | bias=bias) 43 | 44 | self.in_channels = self.conv.in_channels 45 | self.out_channels = self.conv.out_channels 46 | self.kernel_size = self.conv.kernel_size 47 | self.stride = self.conv.stride 48 | self.padding = self.conv.padding 49 | self.dilation = self.conv.dilation 50 | self.transposed = self.conv.transposed 51 | self.output_padding = self.conv.output_padding 52 | self.groups = self.conv.groups 53 | 54 | if self.with_norm: 55 | norm_channels = out_channels if self.activate_last else in_channels 56 | self.norm_name, norm = build_norm_layer(normalize, norm_channels) 57 | self.add_module(self.norm_name, norm) 58 | 59 | if self.with_activatation: 60 | assert activation in ['relu'], 'Only ReLU supported.' 61 | if self.activation == 'relu': 62 | self.activate = nn.ReLU(inplace=inplace) 63 | 64 | # Default using msra init 65 | self.init_weights() 66 | 67 | @property 68 | def norm(self): 69 | return getattr(self, self.norm_name) 70 | 71 | def init_weights(self): 72 | nonlinearity = 'relu' if self.activation is None else self.activation 73 | kaiming_init(self.conv, nonlinearity=nonlinearity) 74 | if self.with_norm: 75 | constant_init(self.norm, 1, bias=0) 76 | 77 | def forward(self, x, activate=True, norm=True): 78 | if self.activate_last: 79 | x = self.conv(x) 80 | if norm and self.with_norm: 81 | x = self.norm(x) 82 | if activate and self.with_activatation: 83 | x = self.activate(x) 84 | else: 85 | if norm and self.with_norm: 86 | x = self.norm(x) 87 | if activate and self.with_activatation: 88 | x = self.activate(x) 89 | x = self.conv(x) 90 | return x 91 | -------------------------------------------------------------------------------- /mmaction/models/tenons/cls_heads/cls_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ...registry import HEADS 5 | 6 | 7 | @HEADS.register_module 8 | class ClsHead(nn.Module): 9 | """Simplest classification head""" 10 | 11 | def __init__(self, 12 | with_avg_pool=True, 13 | temporal_feature_size=1, 14 | spatial_feature_size=7, 15 | dropout_ratio=0.8, 16 | in_channels=2048, 17 | num_classes=101, 18 | fcn_testing=False, 19 | init_std=0.01): 20 | 21 | super(ClsHead, self).__init__() 22 | 23 | self.with_avg_pool = with_avg_pool 24 | self.dropout_ratio = dropout_ratio 25 | self.in_channels = in_channels 26 | self.dropout_ratio = dropout_ratio 27 | self.temporal_feature_size = temporal_feature_size 28 | self.spatial_feature_size = spatial_feature_size 29 | self.init_std = init_std 30 | self.fcn_testing = fcn_testing 31 | 32 | if self.dropout_ratio != 0: 33 | self.dropout = nn.Dropout(p=self.dropout_ratio) 34 | else: 35 | self.dropout = None 36 | # self.with_avg_pool = fcn_testing 37 | if self.with_avg_pool: 38 | self.avg_pool = nn.AvgPool3d((temporal_feature_size, spatial_feature_size, spatial_feature_size), (1, 1, 1), 39 | (0, 0, 0)) 40 | if self.fcn_testing: 41 | self.new_cls = None 42 | self.in_channels = in_channels 43 | self.num_classes = num_classes 44 | self.fc_cls = nn.Linear(in_channels, num_classes) 45 | 46 | def init_weights(self): 47 | nn.init.normal_(self.fc_cls.weight, 0, self.init_std) 48 | nn.init.constant_(self.fc_cls.bias, 0) 49 | 50 | def forward(self, x): 51 | if not self.fcn_testing: 52 | if x.ndimension() == 4: 53 | x = x.unsqueeze(2) 54 | assert x.shape[1] == self.in_channels 55 | assert x.shape[2] == self.temporal_feature_size 56 | assert x.shape[3] == 
self.spatial_feature_size 57 | assert x.shape[4] == self.spatial_feature_size 58 | if self.with_avg_pool: 59 | x = self.avg_pool(x) 60 | if self.dropout is not None: 61 | x = self.dropout(x) 62 | x = x.view(x.size(0), -1) 63 | cls_score = self.fc_cls(x) 64 | return cls_score 65 | else: 66 | if self.with_avg_pool: 67 | x = self.avg_pool(x) 68 | if self.new_cls is None: 69 | self.new_cls = nn.Conv3d(self.in_channels, self.num_classes, 1, 1, 0).cuda() 70 | self.new_cls.weight.copy_(self.fc_cls.weight.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)) 71 | self.new_cls.bias.copy_(self.fc_cls.bias) 72 | self.fc_cls = None 73 | class_map = self.new_cls(x) 74 | # return class_map.mean([2,3,4]) 75 | return class_map 76 | 77 | def loss(self, 78 | cls_score, 79 | labels): 80 | losses = dict() 81 | losses['loss_cls'] = F.cross_entropy(cls_score, labels) 82 | 83 | return losses 84 | -------------------------------------------------------------------------------- /mmaction/datasets/loader/build_loader.py: -------------------------------------------------------------------------------- 1 | # from functools import partial 2 | # 3 | # from mmcv.runner import get_dist_info 4 | # from mmcv.parallel import collate 5 | # from torch.utils.data import DataLoader 6 | # 7 | # from .sampler import GroupSampler, DistributedGroupSampler 8 | # 9 | # # https://github.com/pytorch/pytorch/issues/973 10 | # import resource 11 | # rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 12 | # resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) 13 | # 14 | # 15 | # def build_dataloader(dataset, 16 | # imgs_per_gpu, 17 | # workers_per_gpu, 18 | # num_gpus=1, 19 | # dist=True, 20 | # **kwargs): 21 | # if dist: 22 | # rank, world_size = get_dist_info() 23 | # sampler = DistributedGroupSampler(dataset, imgs_per_gpu, world_size, 24 | # rank) 25 | # batch_size = imgs_per_gpu 26 | # num_workers = workers_per_gpu 27 | # else: 28 | # if not kwargs.get('shuffle', True): 29 | # sampler = None 30 | # else: 31 | # sampler = GroupSampler(dataset, imgs_per_gpu) 32 | # batch_size = num_gpus * imgs_per_gpu 33 | # num_workers = num_gpus * workers_per_gpu 34 | # 35 | # data_loader = DataLoader( 36 | # dataset, 37 | # batch_size=batch_size, 38 | # sampler=sampler, 39 | # num_workers=num_workers, 40 | # collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu), 41 | # pin_memory=False, 42 | # **kwargs) 43 | # 44 | # return data_loader 45 | from functools import partial 46 | 47 | from mmcv.runner import get_dist_info 48 | from mmcv.parallel import collate 49 | from torch.utils.data import DataLoader 50 | 51 | from .sampler import GroupSampler, DistributedGroupSampler, DistributedSampler 52 | 53 | # https://github.com/pytorch/pytorch/issues/973 54 | import resource 55 | 56 | rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 57 | resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) 58 | 59 | 60 | def build_dataloader(dataset, 61 | imgs_per_gpu, 62 | workers_per_gpu, 63 | num_gpus=1, 64 | dist=True, 65 | **kwargs): 66 | shuffle = kwargs.get('shuffle', True) 67 | if dist: 68 | rank, world_size = get_dist_info() 69 | if shuffle: 70 | sampler = DistributedGroupSampler(dataset, imgs_per_gpu, 71 | world_size, rank) 72 | else: 73 | sampler = DistributedSampler( 74 | dataset, imgs_per_gpu, world_size, rank, shuffle=False) 75 | batch_size = imgs_per_gpu 76 | num_workers = workers_per_gpu 77 | else: 78 | sampler = GroupSampler(dataset, imgs_per_gpu) if shuffle else None 79 | batch_size = num_gpus * imgs_per_gpu 80 | num_workers = num_gpus * 
workers_per_gpu 81 | 82 | data_loader = DataLoader( 83 | dataset, 84 | batch_size=batch_size, 85 | sampler=sampler, 86 | num_workers=num_workers, 87 | collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu), 88 | pin_memory=False, 89 | **kwargs) 90 | 91 | return data_loader 92 | -------------------------------------------------------------------------------- /mmaction/core/evaluation/eval_hooks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import logging 4 | import mmcv 5 | import time 6 | import torch 7 | import numpy as np 8 | import torch.distributed as dist 9 | from mmcv.runner import Hook, obj_from_dict 10 | from mmcv.parallel import scatter, collate 11 | from torch.utils.data import Dataset 12 | 13 | from mmaction import datasets 14 | from .accuracy import top_k_accuracy 15 | 16 | 17 | class DistEvalHook(Hook): 18 | def __init__(self, dataset, interval=1): 19 | if isinstance(dataset, Dataset): 20 | self.dataset = dataset 21 | elif isinstance(dataset, dict): 22 | self.dataset = obj_from_dict(dataset, datasets, 23 | {'test_mode': True}) 24 | else: 25 | raise TypeError( 26 | 'dataset must be a Dataset object or a dict, not {}'.format( 27 | type(dataset))) 28 | self.interval = interval 29 | 30 | def after_train_epoch(self, runner): 31 | if not self.every_n_epochs(runner, self.interval): 32 | return 33 | runner.model.eval() 34 | results = [None for _ in range(len(self.dataset))] 35 | if runner.rank == 0: 36 | prog_bar = mmcv.ProgressBar(len(self.dataset)) 37 | for idx in range(runner.rank, len(self.dataset), runner.world_size): 38 | data = self.dataset[idx] 39 | data_gpu = scatter( 40 | collate([data], samples_per_gpu=1), 41 | [torch.cuda.current_device()])[0] 42 | 43 | # compute output 44 | with torch.no_grad(): 45 | result = runner.model( 46 | return_loss=False, rescale=True, **data_gpu) 47 | results[idx] = result 48 | 49 | batch_size = runner.world_size 50 | if runner.rank == 0: 51 | for _ in range(batch_size): 52 | prog_bar.update() 53 | 54 | if runner.rank == 0: 55 | print('\n') 56 | dist.barrier() 57 | for i in range(1, runner.world_size): 58 | tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i)) 59 | tmp_results = mmcv.load(tmp_file) 60 | for idx in range(i, len(results), runner.world_size): 61 | results[idx] = tmp_results[idx] 62 | os.remove(tmp_file) 63 | self.evaluate(runner, results) 64 | else: 65 | tmp_file = osp.join(runner.work_dir, 66 | 'temp_{}.pkl'.format(runner.rank)) 67 | mmcv.dump(results, tmp_file) 68 | dist.barrier() 69 | dist.barrier() 70 | 71 | def evaluate(self): 72 | raise NotImplementedError 73 | 74 | 75 | class DistEvalTopKAccuracyHook(DistEvalHook): 76 | 77 | def __init__(self, 78 | dataset, 79 | k=(1,)): 80 | super(DistEvalTopKAccuracyHook, self).__init__(dataset) 81 | self.k = k 82 | 83 | def evaluate(self, runner, results): 84 | gt_labels = [] 85 | for i in range(len(self.dataset)): 86 | ann = self.dataset.get_ann_info(i) 87 | gt_labels.append(ann['label']) 88 | 89 | results = [res.squeeze() for res in results] 90 | top1, top5 = top_k_accuracy(results, gt_labels, k=self.k) 91 | runner.mode = 'val' 92 | runner.log_buffer.output['top1_acc'] = top1 93 | runner.log_buffer.output['top5_acc'] = top5 94 | runner.log_buffer.ready = True 95 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import time 4 | from 
setuptools import find_packages, setup 5 | 6 | 7 | def readme(): 8 | with open('README.md', encoding='utf-8') as f: 9 | content = f.read() 10 | return content 11 | 12 | 13 | MAJOR = 0 14 | MINOR = 1 15 | PATCH = 'rc0' 16 | SUFFIX = '' 17 | SHORT_VERSION = '{}.{}.{}{}'.format(MAJOR, MINOR, PATCH, SUFFIX) 18 | 19 | version_file = 'mmaction/version.py' 20 | 21 | 22 | def get_git_hash(): 23 | def _minimal_ext_cmd(cmd): 24 | # construct minimal environment 25 | env = {} 26 | for k in ['SYSTEMROOT', 'PATH', 'HOME']: 27 | v = os.environ.get(k) 28 | if v is not None: 29 | env[k] = v 30 | # LANGUAGE is used on win32 31 | env['LANGUAGE'] = 'C' 32 | env['LANG'] = 'C' 33 | env['LC_ALL'] = 'C' 34 | out = subprocess.Popen( 35 | cmd, stdout=subprocess.PIPE, env=env).communicate()[0] 36 | return out 37 | 38 | try: 39 | out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) 40 | sha = out.strip().decode('ascii') 41 | except OSError: 42 | sha = 'unknown' 43 | 44 | return sha 45 | 46 | 47 | def get_hash(): 48 | if os.path.exists('.git'): 49 | sha = get_git_hash()[:7] 50 | elif os.path.exists(version_file): 51 | try: 52 | from mmaction.version import __version__ 53 | sha = __version__.split('+')[-1] 54 | except ImportError: 55 | raise ImportError('Unable to get git version') 56 | else: 57 | sha = 'unknown' 58 | 59 | return sha 60 | 61 | 62 | def write_version_py(): 63 | content = """# GENERATED VERSION FILE 64 | # TIME: {} 65 | 66 | __version__ = '{}' 67 | short_version = '{}' 68 | """ 69 | sha = get_hash() 70 | VERSION = SHORT_VERSION + '+' + sha 71 | 72 | with open(version_file, 'w') as f: 73 | f.write(content.format(time.asctime(), VERSION, SHORT_VERSION)) 74 | 75 | 76 | def get_version(): 77 | with open(version_file, 'r') as f: 78 | exec(compile(f.read(), version_file, 'exec')) 79 | return locals()['__version__'] 80 | 81 | 82 | if __name__ == '__main__': 83 | write_version_py() 84 | setup( 85 | name='mmaction', 86 | version=get_version(), 87 | description='Open MMLab Action Toolbox', 88 | long_description=readme(), 89 | keywords='computer vision, action recognition', 90 | url='https://github.com/open-mmlab/mmaction', 91 | packages=find_packages(exclude=('configs', 'tools', 'demo')), 92 | package_data={'mmaction.ops': ['*/*.so']}, 93 | classifiers=[ 94 | 'Development Status :: 4 - Beta', 95 | 'License :: OSI Approved :: Apache Software License', 96 | 'Operating System :: OS Independent', 97 | 'Programming Language :: Python :: 2', 98 | 'Programming Language :: Python :: 2.7', 99 | 'Programming Language :: Python :: 3', 100 | 'Programming Language :: Python :: 3.4', 101 | 'Programming Language :: Python :: 3.5', 102 | 'Programming Language :: Python :: 3.6', 103 | ], 104 | license='Apache License 2.0', 105 | setup_requires=['pytest-runner'], 106 | tests_require=['pytest'], 107 | install_requires=[ 108 | 'mmcv', 'numpy', 'scipy', 'scikit-learn', 'terminaltables', 'lmdb', 'joblib' 109 | ], 110 | zip_safe=False) 111 | -------------------------------------------------------------------------------- /config_files/sthv1/tsm_baseline.py: -------------------------------------------------------------------------------- 1 | model = dict( 2 | type='TSN2D', 3 | backbone=dict( 4 | type='ResNet', 5 | pretrained='modelzoo://resnet50', 6 | nsegments=8, 7 | depth=50, 8 | out_indices=(3,), 9 | tsm=True, 10 | bn_eval=False, 11 | partial_bn=False), 12 | spatial_temporal_module=dict( 13 | type='SimpleSpatialModule', 14 | spatial_type='avg', 15 | spatial_size=7), 16 | segmental_consensus=dict( 17 | type='SimpleConsensus', 18 | 
consensus_type='avg'), 19 | cls_head=dict( 20 | type='ClsHead', 21 | with_avg_pool=False, 22 | temporal_feature_size=1, 23 | spatial_feature_size=1, 24 | dropout_ratio=0.5, 25 | in_channels=2048, 26 | num_classes=174)) 27 | train_cfg = None 28 | test_cfg = None 29 | # dataset settings 30 | dataset_type = 'RawFramesDataset' 31 | data_root = '' 32 | data_root_val = '' 33 | 34 | img_norm_cfg = dict( 35 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 36 | 37 | data = dict( 38 | videos_per_gpu=8, 39 | workers_per_gpu=8, 40 | train=dict( 41 | type=dataset_type, 42 | ann_file='data/sthv1/train_videofolder.txt', 43 | img_prefix=data_root, 44 | img_norm_cfg=img_norm_cfg, 45 | num_segments=8, 46 | new_length=1, 47 | new_step=1, 48 | random_shift=True, 49 | modality='RGB', 50 | image_tmpl='{:05d}.jpg', 51 | img_scale=256, 52 | input_size=224, 53 | flip_ratio=0.5, 54 | resize_keep_ratio=True, 55 | resize_crop=True, 56 | color_jitter=True, 57 | color_space_aug=True, 58 | oversample=None, 59 | max_distort=1, 60 | test_mode=False), 61 | val=dict( 62 | type=dataset_type, 63 | ann_file='data/sthv1/val_videofolder.txt', 64 | img_prefix=data_root_val, 65 | img_norm_cfg=img_norm_cfg, 66 | num_segments=8, 67 | new_length=1, 68 | new_step=1, 69 | random_shift=False, 70 | modality='RGB', 71 | image_tmpl='{:05d}.jpg', 72 | img_scale=256, 73 | input_size=224, 74 | flip_ratio=0, 75 | resize_keep_ratio=True, 76 | oversample=None, 77 | test_mode=False), 78 | test=dict( 79 | type=dataset_type, 80 | ann_file='data/sthv1/val_videofolder.txt', 81 | img_prefix=data_root_val, 82 | img_norm_cfg=img_norm_cfg, 83 | num_segments=16, 84 | new_length=1, 85 | new_step=1, 86 | random_shift=False, 87 | modality='RGB', 88 | image_tmpl='{:05d}.jpg', 89 | img_scale=256, 90 | input_size=256, 91 | flip_ratio=0, 92 | resize_keep_ratio=True, 93 | oversample="three_crop", 94 | test_mode=True)) 95 | # optimizer 96 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True) 97 | optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2)) 98 | # learning policy 99 | lr_config = dict( 100 | policy='step', 101 | step=[75, 125]) 102 | checkpoint_config = dict(interval=1) 103 | workflow = [('train', 1)] 104 | # yapf:disable 105 | log_config = dict( 106 | interval=20, 107 | hooks=[ 108 | dict(type='TextLoggerHook'), 109 | # dict(type='TensorboardLoggerHook') 110 | ]) 111 | # yapf:enable 112 | # runtime settings 113 | total_epochs = 150 114 | dist_params = dict(backend='nccl') 115 | log_level = 'INFO' 116 | load_from = None 117 | resume_from = None 118 | -------------------------------------------------------------------------------- /config_files/sthv2/tsm_baseline.py: -------------------------------------------------------------------------------- 1 | model = dict( 2 | type='TSN2D', 3 | backbone=dict( 4 | type='ResNet', 5 | pretrained='modelzoo://resnet50', 6 | nsegments=8, 7 | depth=50, 8 | out_indices=(3,), 9 | tsm=True, 10 | bn_eval=False, 11 | partial_bn=False), 12 | spatial_temporal_module=dict( 13 | type='SimpleSpatialModule', 14 | spatial_type='avg', 15 | spatial_size=7), 16 | segmental_consensus=dict( 17 | type='SimpleConsensus', 18 | consensus_type='avg'), 19 | cls_head=dict( 20 | type='ClsHead', 21 | with_avg_pool=False, 22 | temporal_feature_size=1, 23 | spatial_feature_size=1, 24 | dropout_ratio=0.5, 25 | in_channels=2048, 26 | num_classes=174)) 27 | train_cfg = None 28 | test_cfg = None 29 | # dataset settings 30 | dataset_type = 'RawFramesDataset' 31 | data_root = '' 32 
| data_root_val = '' 33 | 34 | img_norm_cfg = dict( 35 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 36 | 37 | data = dict( 38 | videos_per_gpu=8, 39 | workers_per_gpu=8, 40 | train=dict( 41 | type=dataset_type, 42 | ann_file='data/sthv2/train_videofolder.txt', 43 | img_prefix=data_root, 44 | img_norm_cfg=img_norm_cfg, 45 | num_segments=8, 46 | new_length=1, 47 | new_step=1, 48 | random_shift=True, 49 | modality='RGB', 50 | image_tmpl='img_{:05d}.jpg', 51 | img_scale=256, 52 | input_size=224, 53 | flip_ratio=0.5, 54 | resize_keep_ratio=True, 55 | resize_crop=True, 56 | color_jitter=True, 57 | color_space_aug=True, 58 | oversample=None, 59 | max_distort=1, 60 | test_mode=False), 61 | val=dict( 62 | type=dataset_type, 63 | ann_file='data/sthv2/val_videofolder.txt', 64 | img_prefix=data_root_val, 65 | img_norm_cfg=img_norm_cfg, 66 | num_segments=8, 67 | new_length=1, 68 | new_step=1, 69 | random_shift=False, 70 | modality='RGB', 71 | image_tmpl='img_{:05d}.jpg', 72 | img_scale=256, 73 | input_size=224, 74 | flip_ratio=0, 75 | resize_keep_ratio=True, 76 | oversample=None, 77 | test_mode=False), 78 | test=dict( 79 | type=dataset_type, 80 | ann_file='data/sthv2/val_videofolder.txt', 81 | img_prefix=data_root_val, 82 | img_norm_cfg=img_norm_cfg, 83 | num_segments=16, 84 | new_length=1, 85 | new_step=1, 86 | random_shift=False, 87 | modality='RGB', 88 | image_tmpl='img_{:05d}.jpg', 89 | img_scale=256, 90 | input_size=256, 91 | flip_ratio=0, 92 | resize_keep_ratio=True, 93 | oversample="three_crop", 94 | test_mode=True)) 95 | # optimizer 96 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True) 97 | optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2)) 98 | # learning policy 99 | lr_config = dict( 100 | policy='step', 101 | step=[75, 125]) 102 | checkpoint_config = dict(interval=1) 103 | workflow = [('train', 1)] 104 | # yapf:disable 105 | log_config = dict( 106 | interval=20, 107 | hooks=[ 108 | dict(type='TextLoggerHook'), 109 | # dict(type='TensorboardLoggerHook') 110 | ]) 111 | # yapf:enable 112 | # runtime settings 113 | total_epochs = 150 114 | dist_params = dict(backend='nccl') 115 | log_level = 'INFO' 116 | load_from = None 117 | resume_from = None 118 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | This directory provides basic tutorials for the usage of MMAction. 4 | 5 | After installation of codebase and preparation of data, you could use the given scripts for training/evaluating your models. 6 | 7 | ### Test a reference model 8 | Our codebase supports distributed and non-distributed evaluation mode for reference model. Actually, distributed testing is a little faster than non-distributed testing. 9 | ``` 10 | # non-distributed testing 11 | python tools/test_recognizer.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] {--gpus ${GPU_NUM}} --ignore_cache --fcn_testing 12 | 13 | # distributed testing 14 | ./tools/dist_test_recognizer.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --ignore_cache --fcn_testing 15 | ``` 16 | Optional arguments: 17 | - `--ignore_cache`: If specified, the results cache will be ignored. 18 | - `--fcn_testing`: If specified, spatially fully-convolutional testing is performed via 3 crops approximation. 
19 | - `--flip`: If specified, all frames are flipped first and then fed into the model. 20 | 21 | **Important**: some of our models might require a machine with more than 24 GB of memory. 22 | 23 | Examples: 24 | Assume that you have already downloaded the checkpoints to the directory `ckpt/`. 25 | 26 | 1. Test the tpn_f8s8 model in non-distributed evaluation mode on 8 GPUs 27 | ``` 28 | python ./tools/test_recognizer.py config_files/kinetics400/tpn/r50f8s8.py ckpt/kinetics400_tpn_r50f8s8 --gpus 8 --out ckpt/kinetics400_tpn_r50f8s8.pkl --fcn_testing --ignore_cache 29 | ``` 30 | 2. Test the tpn_f8s8 model in distributed evaluation mode on 8 GPUs 31 | ```shell 32 | ./tools/dist_test_recognizer.sh config_files/kinetics400/tpn/r50f8s8.py ckpt/kinetics400_tpn_r50f8s8 8 --out ckpt/kinetics400_tpn_r50f8s8.pkl --fcn_testing --ignore_cache 33 | ``` 34 | 35 | ### Train a model 36 | 37 | Our codebase also supports both distributed and non-distributed training. 38 | 39 | All outputs (log files and checkpoints) will be saved to the working directory, 40 | which is specified by `work_dir` in the config file. 41 | 42 | By default, we evaluate the model on the validation set after each epoch; you can change the evaluation interval by adding the `interval` argument to the training config. 43 | ```python 44 | evaluation = dict(interval=10) # This evaluates the model every 10 epochs. 45 | ``` 46 | 47 | #### Train with a single GPU 48 | ``` 49 | python tools/train_recognizer.py ${CONFIG_FILE} 50 | ``` 51 | If you want to specify the working directory in the command, you can add the argument `--work_dir ${YOUR_WORK_DIR}`. 52 | 53 | #### Train with multiple GPUs 54 | ```shell 55 | ./tools/dist_train_recognizer.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments] 56 | ``` 57 | 58 | Optional arguments: 59 | - `--validate`: Perform evaluation after every epoch during training. 60 | - `--work_dir`: All outputs (log files and checkpoints) will be saved to the working directory. 61 | - `--resume_from`: Resume from a previous checkpoint file. 62 | 63 | Difference between `resume_from` and `load_from`: `resume_from` loads both the model weights and the optimizer state, and the epoch is also inherited from the specified checkpoint; it is usually used to resume a training process that was interrupted accidentally. `load_from` only loads the model weights, and the training epoch starts from 0; it is usually used for finetuning. 64 | 65 | **Important**: The default learning rate in the config files is for 8 GPUs and 8 videos/GPU (batch size = 8*8 = 64). According to the Linear Scaling Rule, you need to set the learning rate proportional to the batch size if you use a different number of GPUs or videos per GPU, e.g., lr=0.01 for 8 GPUs * 8 videos/GPU and lr=0.04 for 32 GPUs * 8 videos/GPU; a minimal sketch of this scaling is given below.
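As a minimal sketch of that scaling rule (the 16-GPU setup below is a hypothetical example, not one of the provided configs), the optimizer entry of a config could be derived as follows:
```python
# Linear Scaling Rule: scale the learning rate with the total batch size.
base_lr = 0.01            # default lr for 8 GPUs * 8 videos/GPU (batch size 64)
base_batch_size = 8 * 8
new_batch_size = 16 * 8   # hypothetical run: 16 GPUs * 8 videos/GPU (batch size 128)
optimizer = dict(
    type='SGD',
    lr=base_lr * new_batch_size / base_batch_size,  # -> 0.02
    momentum=0.9,
    weight_decay=0.0001,
    nesterov=True)
```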
66 | 67 | Here is the example of using 8 GPUs to train Kinetics400_r50_f8s8: 68 | ```shell 69 | ./tools/dist_train_recognizer.sh config_files/kinetics400/tpn/r50f8s8.py 8 --validate 70 | ``` 71 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r101f16s4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet101', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=16, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=16, 56 | new_step=4, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=16, 79 | new_step=4, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=16, 98 | new_step=4, 99 | random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 117 | checkpoint_config = dict(interval=1) 118 | workflow = [('train', 1)] 119 | # yapf:disable 120 | log_config = 
dict( 121 | interval=20, 122 | hooks=[ 123 | dict(type='TextLoggerHook'), 124 | # dict(type='TensorboardLoggerHook') 125 | ]) 126 | # yapf:enable 127 | # runtime settings 128 | total_epochs = 150 129 | dist_params = dict(backend='nccl') 130 | log_level = 'INFO' 131 | load_from = None 132 | resume_from = None 133 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r101f8s8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet101', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=False, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=8, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=8, 56 | new_step=8, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=8, 79 | new_step=8, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=8, 98 | new_step=8, 99 | random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 
117 | checkpoint_config = dict(interval=1) 118 | # workflow = [('train', 5), ('val', 1)] 119 | workflow = [('train', 1)] 120 | # yapf:disable 121 | log_config = dict( 122 | interval=20, 123 | hooks=[ 124 | dict(type='TextLoggerHook'), 125 | # dict(type='TensorboardLoggerHook') 126 | ]) 127 | # yapf:enable 128 | # runtime settings 129 | total_epochs = 150 130 | dist_params = dict(backend='nccl') 131 | log_level = 'INFO' 132 | load_from = None 133 | resume_from = None 134 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r50f8s8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet50', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=False, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=8, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=8, 56 | new_step=8, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=8, 79 | new_step=8, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=8, 98 | new_step=8, 99 | random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | 
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 117 | checkpoint_config = dict(interval=1) 118 | # workflow = [('train', 5), ('val', 1)] 119 | workflow = [('train', 1)] 120 | # yapf:disable 121 | log_config = dict( 122 | interval=20, 123 | hooks=[ 124 | dict(type='TextLoggerHook'), 125 | # dict(type='TensorboardLoggerHook') 126 | ]) 127 | # yapf:enable 128 | # runtime settings 129 | total_epochs = 150 130 | dist_params = dict(backend='nccl') 131 | log_level = 'INFO' 132 | load_from = None 133 | resume_from = None 134 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r101f32s2.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet101', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=32, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=32, 56 | new_step=2, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=32, 79 | new_step=2, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=32, 98 | new_step=2, 99 | random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | 
oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 117 | checkpoint_config = dict(interval=1) 118 | # workflow = [('train', 5), ('val', 1)] 119 | workflow = [('train', 1)] 120 | # yapf:disable 121 | log_config = dict( 122 | interval=20, 123 | hooks=[ 124 | dict(type='TextLoggerHook'), 125 | # dict(type='TensorboardLoggerHook') 126 | ]) 127 | # yapf:enable 128 | # runtime settings 129 | total_epochs = 150 130 | dist_params = dict(backend='nccl') 131 | log_level = 'INFO' 132 | load_from = None 133 | resume_from = None 134 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r50f16s4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet50', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=16, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=16, 56 | new_step=4, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=16, 79 | new_step=4, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=16, 98 | new_step=4, 99 | 
random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 117 | checkpoint_config = dict(interval=1) 118 | # workflow = [('train', 5), ('val', 1)] 119 | workflow = [('train', 1)] 120 | # yapf:disable 121 | log_config = dict( 122 | interval=20, 123 | hooks=[ 124 | dict(type='TextLoggerHook'), 125 | # dict(type='TensorboardLoggerHook') 126 | ]) 127 | # yapf:enable 128 | # runtime settings 129 | total_epochs = 150 130 | dist_params = dict(backend='nccl') 131 | log_level = 'INFO' 132 | load_from = None 133 | resume_from = None 134 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r50f32s2.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet50', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=32, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=32, 56 | new_step=2, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=32, 79 | new_step=2, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | 
ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=32, 98 | new_step=2, 99 | random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 117 | checkpoint_config = dict(interval=1) 118 | # workflow = [('train', 5), ('val', 1)] 119 | workflow = [('train', 1)] 120 | # yapf:disable 121 | log_config = dict( 122 | interval=20, 123 | hooks=[ 124 | dict(type='TextLoggerHook'), 125 | # dict(type='TensorboardLoggerHook') 126 | ]) 127 | # yapf:enable 128 | # runtime settings 129 | total_epochs = 150 130 | dist_params = dict(backend='nccl') 131 | log_level = 'INFO' 132 | load_from = None 133 | resume_from = None 134 | -------------------------------------------------------------------------------- /docs/assets/style.css: -------------------------------------------------------------------------------- 1 | /* Homepage Style */ 2 | 3 | /* Body */ 4 | body { 5 | background: #e3e5e8; 6 | color: #ffffff; 7 | font-family: 'Lato', Verdana, Helvetica, sans-serif; 8 | font-weight: 300; 9 | font-size: 14pt; 10 | } 11 | 12 | /* Headings */ 13 | h1 { 14 | font-size: 30pt; 15 | } 16 | 17 | h2 { 18 | font-size: 22pt; 19 | } 20 | 21 | h3 { 22 | font-size: 14pt; 23 | } 24 | 25 | /* Hyperlinks */ 26 | a { 27 | text-decoration: none; 28 | } 29 | 30 | a:link { 31 | color: #1772d0; 32 | } 33 | 34 | a:visited { 35 | color: #1772d0; 36 | } 37 | 38 | a:active { 39 | color: red; 40 | } 41 | 42 | a:hover { 43 | color: #f09228; 44 | } 45 | 46 | pre { 47 | background: #fcfcfc; 48 | border: 0; 49 | font-size: 12pt; 50 | margin: 5pt auto; 51 | } 52 | 53 | /* Container */ 54 | .container { 55 | width: 768pt; 56 | min-height: 100pt; 57 | margin: 15pt auto; 58 | padding: 20pt; 59 | border: 1pt hidden #000; 60 | text-align: justify; 61 | color: #000000; 62 | background: #ffffff; 63 | } 64 | 65 | .container .title { 66 | text-align: center; 67 | font-size: 22pt; 68 | margin: 5pt auto; 69 | } 70 | 71 | .container .author { 72 | text-align: center; 73 | font-size: 16pt; 74 | margin: 20pt auto; 75 | } 76 | 77 | .container .institution { 78 | text-align: center; 79 | font-size: 16pt; 80 | margin: 20pt auto; 81 | } 82 | 83 | .container .link { 84 | text-align: center; 85 | font-size: 16pt; 86 | margin: 20pt auto; 87 | } 88 | 89 | .container .teaser { 90 | text-align: center; 91 | } 92 | 93 | .container .teaser img { 94 | text-align: center; 95 | margin: 20pt auto; 96 | width: 95%; 97 | } 98 | 99 | .container .body { 100 | text-align: justify; 101 | font-size: 14pt; 102 | margin: 10pt auto; 103 | } 104 | 105 | .container .bibtex { 106 | text-align: left; 107 | font-size: 22pt; 108 | margin: 5pt auto; 109 | } 110 | 111 | .container .ref { 112 | text-align: left; 113 | font-size: 18pt; 114 | font-weight: bold; 115 | margin: 15pt auto; 116 | } 117 | 118 | .container .citation { 119 | margin: 8pt auto; 120 | font-size: 14pt; 121 | clear: both; 122 | } 123 | 124 | .container .citation img { 125 | float: left; 126 | margin: 0 8pt 8pt 0; /*top right bottom left*/ 127 
| width: 120pt; 128 | } 129 | 130 | /* Homepage */ 131 | /* Followings can be removed for single project page. */ 132 | .homepage { 133 | width: 768pt; 134 | min-height: 100pt; 135 | margin: 15pt auto; 136 | padding: 20pt; 137 | border: 1pt hidden #000; 138 | text-align: justify; 139 | color: #000000; 140 | background: #ffffff; 141 | } 142 | 143 | .homepage .header { 144 | margin-top: 30pt; 145 | margin-bottom: 60pt; 146 | margin-right: 70pt; 147 | font-size: 28pt; 148 | text-align: center; 149 | } 150 | 151 | .homepage .header img { 152 | height: 80pt; 153 | float: left; 154 | object-fit: cover; 155 | margin-left: 20pt; 156 | } 157 | 158 | .homepage .section { 159 | text-align: left; 160 | font-size: 25pt; 161 | font-weight: bolder; 162 | margin: 50pt 20pt 20pt 20pt; /*top right bottom left*/ 163 | } 164 | 165 | .homepage .project { 166 | height: 130pt; 167 | outline: thin dotted #666666; 168 | margin: 10pt 20pt 10pt 20pt; /*top right bottom left*/ 169 | } 170 | 171 | .homepage .project .image { 172 | height: 120pt; 173 | width: 160pt; 174 | float: left; 175 | text-align: center; 176 | vertical-align: top; 177 | } 178 | 179 | .homepage .project .image img { 180 | height: 120pt; 181 | width: 160pt; 182 | object-fit: cover; 183 | border-radius: 6pt; 184 | box-shadow: 1pt 1pt 2pt #888888; 185 | -moz-box-shadow: 1pt 1pt 2pt #888888; 186 | -webkit-box-shadow: 1pt 1pt 2pt #888888; 187 | margin: 5pt; 188 | } 189 | 190 | .homepage .project .info { 191 | font-size: 16pt; 192 | text-align: left; 193 | margin: 10pt 20pt 0 180pt; /*top right bottom left*/ 194 | } 195 | 196 | .homepage .avatar { 197 | margin: -10pt 20pt 320pt 0pt; /*top right bottom left*/ 198 | } 199 | 200 | .homepage .avatar table { 201 | float: left; 202 | width: auto; 203 | height: auto; 204 | margin: 10pt auto; 205 | text-align: center; 206 | font-size: 16pt; 207 | border-collapse: separate; 208 | border-spacing: 20pt 10pt; 209 | } 210 | 211 | .homepage .avatar img { 212 | height: 100pt; 213 | width: 100pt; 214 | object-fit: cover; 215 | } 216 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 |
7 | 8 | 9 | 71 | Our TPN achieves 78.9%, 49.0% and 62.0% top-1 accuracy on the mainstream action recognition benchmarks, 72 | i.e., Kinetics-400, Something-Something V1 and V2 respectively, outperforming 73 | other state-of-the-art methods. More detailed comparisons and ablation studies are presented in our paper. 74 |
75 | Per-class Performance Gain vs. Per-class Variance of Visual Tempos: 77 | Figure 4 indicates that the performance gain is clearly positively correlated with the variance of visual 78 | tempos. This study strongly verifies our motivation that TPN brings a significant improvement for 79 | such actions with large variances of visual tempo.
80 | Robustness of TPN to Visual Tempo Variation: 81 | Figure 5 suggests that TPN helps improve the robustness of I3D-50, resulting in a curve with more moderate 82 | fluctuations. More discussion is presented in our experimental section. 83 |
84 | 87 | 88 |
97 | @inproceedings{yang2020tpn,
98 | title = {Temporal Pyramid Network for Action Recognition},
99 | author = {Yang, Ceyuan and Xu, Yinghao and Shi, Jianping and Dai, Bo and Zhou, Bolei},
100 | booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
101 | year = {2020}
102 | }
103 |
104 |
105 |
106 |
107 |
--------------------------------------------------------------------------------
/config_files/sthv1/tsm_tpn.py:
--------------------------------------------------------------------------------
1 | model = dict(
2 | type='TSN2D',
3 | backbone=dict(
4 | type='ResNet',
5 | pretrained='modelzoo://resnet50',
6 | depth=50,
7 | nsegments=8,
8 | out_indices=(2, 3),
9 | tsm=True,
10 | bn_eval=False,
11 | partial_bn=False),
12 | necks=dict(
13 | type='TPN',
14 | in_channels=[1024, 2048],
15 | out_channels=1024,
16 | spatial_modulation_config=dict(
17 | inplanes=[1024, 2048],
18 | planes=2048,
19 | ),
20 | temporal_modulation_config=dict(
21 | scales=(8, 8),
22 | param=dict(
23 | inplanes=-1,
24 | planes=-1,
25 | downsample_scale=-1,
26 | )),
27 | upsampling_config=dict(
28 | scale=(1, 1, 1),
29 | ),
30 | downsampling_config=dict(
31 | scales=(1, 1, 1),
32 | param=dict(
33 | inplanes=-1,
34 | planes=-1,
35 | downsample_scale=-1,
36 | )),
37 | level_fusion_config=dict(
38 | in_channels=[1024, 1024],
39 | mid_channels=[1024, 1024],
40 | out_channels=2048,
41 | ds_scales=[(1, 1, 1), (1, 1, 1)],
42 | ),
43 | aux_head_config=dict(
44 | inplanes=-1,
45 | planes=174,
46 | loss_weight=0.5
47 | ),
48 | ),
49 | spatial_temporal_module=dict(
50 | type='SimpleSpatialModule',
51 | spatial_type='avg',
52 | spatial_size=7),
53 | segmental_consensus=dict(
54 | type='SimpleConsensus',
55 | consensus_type='avg'),
56 | cls_head=dict(
57 | type='ClsHead',
58 | with_avg_pool=False,
59 | temporal_feature_size=1,
60 | spatial_feature_size=1,
61 | dropout_ratio=0.5,
62 | in_channels=2048,
63 | num_classes=174))
64 | train_cfg = None
65 | test_cfg = None
66 | # dataset settings
67 | dataset_type = 'RawFramesDataset'
68 | data_root = ''
69 | data_root_val = ''
70 |
71 | img_norm_cfg = dict(
72 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
73 |
74 | data = dict(
75 | videos_per_gpu=8,
76 | workers_per_gpu=8,
77 | train=dict(
78 | type=dataset_type,
79 | ann_file='data/sthv1/train_videofolder.txt',
80 | img_prefix=data_root,
81 | img_norm_cfg=img_norm_cfg,
82 | num_segments=8,
83 | new_length=1,
84 | new_step=1,
85 | random_shift=True,
86 | modality='RGB',
87 | image_tmpl='{:05d}.jpg',
88 | img_scale=256,
89 | input_size=224,
90 | flip_ratio=0.5,
91 | resize_keep_ratio=True,
92 | resize_crop=True,
93 | color_jitter=True,
94 | color_space_aug=True,
95 | oversample=None,
96 | max_distort=1,
97 | test_mode=False),
98 | val=dict(
99 | type=dataset_type,
100 | ann_file='data/sthv1/val_videofolder.txt',
101 | img_prefix=data_root_val,
102 | img_norm_cfg=img_norm_cfg,
103 | num_segments=8,
104 | new_length=1,
105 | new_step=1,
106 | random_shift=False,
107 | modality='RGB',
108 | image_tmpl='{:05d}.jpg',
109 | img_scale=256,
110 | input_size=224,
111 | flip_ratio=0,
112 | resize_keep_ratio=True,
113 | oversample=None,
114 | test_mode=False),
115 | test=dict(
116 | type=dataset_type,
117 | ann_file='data/sthv1/val_videofolder.txt',
118 | img_prefix=data_root_val,
119 | img_norm_cfg=img_norm_cfg,
120 | num_segments=16,
121 | new_length=1,
122 | new_step=1,
123 | random_shift=False,
124 | modality='RGB',
125 | image_tmpl='{:05d}.jpg',
126 | img_scale=256,
127 | input_size=256,
128 | flip_ratio=0,
129 | resize_keep_ratio=True,
130 | oversample="three_crop",
131 | test_mode=True))
132 | # optimizer
133 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True)
134 | optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
135 | # learning policy
136 | lr_config = dict(
137 | policy='step',
138 | step=[75, 125])
139 | checkpoint_config = dict(interval=1)
140 | workflow = [('train', 1)]
141 | # yapf:disable
142 | log_config = dict(
143 | interval=20,
144 | hooks=[
145 | dict(type='TextLoggerHook'),
146 | # dict(type='TensorboardLoggerHook')
147 | ])
148 | # yapf:enable
149 | # runtime settings
150 | total_epochs = 150
151 | dist_params = dict(backend='nccl')
152 | log_level = 'INFO'
153 | load_from = None
154 | resume_from = None
155 |
--------------------------------------------------------------------------------
/config_files/sthv2/tsm_tpn.py:
--------------------------------------------------------------------------------
1 | model = dict(
2 | type='TSN2D',
3 | backbone=dict(
4 | type='ResNet',
5 | pretrained='modelzoo://resnet50',
6 | depth=50,
7 | nsegments=8,
8 | out_indices=(2, 3),
9 | tsm=True,
10 | bn_eval=False,
11 | partial_bn=False),
12 | necks=dict(
13 | type='TPN',
14 | in_channels=[1024, 2048],
15 | out_channels=1024,
16 | spatial_modulation_config=dict(
17 | inplanes=[1024, 2048],
18 | planes=2048,
19 | ),
20 | temporal_modulation_config=dict(
21 | scales=(8, 8),
22 | param=dict(
23 | inplanes=-1,
24 | planes=-1,
25 | downsample_scale=-1,
26 | )),
27 | upsampling_config=dict(
28 | scale=(1, 1, 1),
29 | ),
30 | downsampling_config=dict(
31 | scales=(1, 1, 1),
32 | param=dict(
33 | inplanes=-1,
34 | planes=-1,
35 | downsample_scale=-1,
36 | )),
37 | level_fusion_config=dict(
38 | in_channels=[1024, 1024],
39 | mid_channels=[1024, 1024],
40 | out_channels=2048,
41 | ds_scales=[(1, 1, 1), (1, 1, 1)],
42 | ),
43 | aux_head_config=dict(
44 | inplanes=-1,
45 | planes=174,
46 | loss_weight=0.5
47 | ),
48 | ),
49 | spatial_temporal_module=dict(
50 | type='SimpleSpatialModule',
51 | spatial_type='avg',
52 | spatial_size=7),
53 | segmental_consensus=dict(
54 | type='SimpleConsensus',
55 | consensus_type='avg'),
56 | cls_head=dict(
57 | type='ClsHead',
58 | with_avg_pool=False,
59 | temporal_feature_size=1,
60 | spatial_feature_size=1,
61 | dropout_ratio=0.5,
62 | in_channels=2048,
63 | num_classes=174))
64 | train_cfg = None
65 | test_cfg = None
66 | # dataset settings
67 | dataset_type = 'RawFramesDataset'
68 | data_root = ''
69 | data_root_val = ''
70 |
71 | img_norm_cfg = dict(
72 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
73 |
74 | data = dict(
75 | videos_per_gpu=8,
76 | workers_per_gpu=8,
77 | train=dict(
78 | type=dataset_type,
79 | ann_file='data/sthv2/train_videofolder.txt',
80 | img_prefix=data_root,
81 | img_norm_cfg=img_norm_cfg,
82 | num_segments=8,
83 | new_length=1,
84 | new_step=1,
85 | random_shift=True,
86 | modality='RGB',
87 | image_tmpl='img_{:05d}.jpg',
88 | img_scale=256,
89 | input_size=224,
90 | flip_ratio=0.5,
91 | resize_keep_ratio=True,
92 | resize_crop=True,
93 | color_jitter=True,
94 | color_space_aug=True,
95 | oversample=None,
96 | max_distort=1,
97 | test_mode=False),
98 | val=dict(
99 | type=dataset_type,
100 | ann_file='data/sthv2/val_videofolder.txt',
101 | img_prefix=data_root_val,
102 | img_norm_cfg=img_norm_cfg,
103 | num_segments=8,
104 | new_length=1,
105 | new_step=1,
106 | random_shift=False,
107 | modality='RGB',
108 | image_tmpl='img_{:05d}.jpg',
109 | img_scale=256,
110 | input_size=224,
111 | flip_ratio=0,
112 | resize_keep_ratio=True,
113 | oversample=None,
114 | test_mode=False),
115 | test=dict(
116 | type=dataset_type,
117 | ann_file='data/sthv2/val_videofolder.txt',
118 | img_prefix=data_root_val,
119 | img_norm_cfg=img_norm_cfg,
120 | num_segments=16,
121 | new_length=1,
122 | new_step=1,
123 | random_shift=False,
124 | modality='RGB',
125 | image_tmpl='img_{:05d}.jpg',
126 | img_scale=256,
127 | input_size=256,
128 | flip_ratio=0,
129 | resize_keep_ratio=True,
130 | oversample="three_crop",
131 | test_mode=True))
132 | # optimizer
133 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True)
134 | optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
135 | # learning policy
136 | lr_config = dict(
137 | policy='step',
138 | step=[75, 125])
139 | checkpoint_config = dict(interval=1)
140 | workflow = [('train', 1)]
141 | # yapf:disable
142 | log_config = dict(
143 | interval=20,
144 | hooks=[
145 | dict(type='TextLoggerHook'),
146 | # dict(type='TensorboardLoggerHook')
147 | ])
148 | # yapf:enable
149 | # runtime settings
150 | total_epochs = 150
151 | dist_params = dict(backend='nccl')
152 | log_level = 'INFO'
153 | load_from = None
154 | resume_from = None
155 |
--------------------------------------------------------------------------------
/config_files/kinetics400/tpn/r50f8s8.py:
--------------------------------------------------------------------------------
1 | # model settings
2 | model = dict(
3 | type='TSN3D',
4 | backbone=dict(
5 | type='ResNet_SlowFast',
6 | pretrained='modelzoo://resnet50',
7 | depth=50,
8 | num_stages=4,
9 | out_indices=[2, 3],
10 | frozen_stages=-1,
11 | inflate_freq=(0, 0, 1, 1),
12 | inflate_style='3x1x1',
13 | conv1_kernel_t=1,
14 | conv1_stride_t=1,
15 | pool1_kernel_t=1,
16 | pool1_stride_t=1,
17 | with_cp=False,
18 | bn_eval=False,
19 | partial_bn=False,
20 | style='pytorch'),
21 | necks=dict(
22 | type='TPN',
23 | in_channels=[1024, 2048],
24 | out_channels=1024,
25 | spatial_modulation_config=dict(
26 | inplanes=[1024, 2048],
27 | planes=2048,
28 | ),
29 | temporal_modulation_config=dict(
30 | scales=(8, 8),
31 | param=dict(
32 | inplanes=-1,
33 | planes=-1,
34 | downsample_scale=-1,
35 | )),
36 | upsampling_config=dict(
37 | scale=(1, 1, 1),
38 | ),
39 | downsampling_config=dict(
40 | scales=(1, 1, 1),
41 | param=dict(
42 | inplanes=-1,
43 | planes=-1,
44 | downsample_scale=-1,
45 | )),
46 | level_fusion_config=dict(
47 | in_channels=[1024, 1024],
48 | mid_channels=[1024, 1024],
49 | out_channels=2048,
50 | ds_scales=[(1, 1, 1), (1, 1, 1)],
51 | ),
52 | aux_head_config=dict(
53 | inplanes=-1,
54 | planes=400,
55 | loss_weight=0.5
56 | ),
57 | ),
58 | spatial_temporal_module=dict(
59 | type='SimpleSpatialTemporalModule',
60 | spatial_type='avg',
61 | temporal_size=1,
62 | spatial_size=7),
63 | segmental_consensus=dict(
64 | type='SimpleConsensus',
65 | consensus_type='avg'),
66 | cls_head=dict(
67 | type='ClsHead',
68 | with_avg_pool=False,
69 | temporal_feature_size=1,
70 | spatial_feature_size=1,
71 | dropout_ratio=0.5,
72 | in_channels=2048,
73 | num_classes=400))
74 | train_cfg = None
75 | test_cfg = None
76 | # dataset settings
77 | dataset_type = 'RawFramesDataset'
78 | data_root = ''
79 | data_root_val = ''
80 | img_norm_cfg = dict(
81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
82 | data = dict(
83 | videos_per_gpu=8,
84 | workers_per_gpu=8,
85 | train=dict(
86 | type=dataset_type,
87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt',
88 | img_prefix=data_root,
89 | img_norm_cfg=img_norm_cfg,
90 | input_format="NCTHW",
91 | num_segments=1,
92 | new_length=8,
93 | new_step=8,
94 | random_shift=True,
95 | modality='RGB',
96 | image_tmpl='img_{:05d}.jpg',
97 | img_scale=256,
98 | resize_keep_ratio=True,
99 | input_size=224,
100 | flip_ratio=0.5,
101 | oversample=None,
102 | resize_crop=True,
103 | color_jitter=True,
104 | color_space_aug=True,
105 | max_distort=0,
106 | test_mode=False,
107 | ),
108 | val=dict(
109 | type=dataset_type,
110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt',
111 | img_prefix=data_root_val,
112 | img_norm_cfg=img_norm_cfg,
113 | input_format="NCTHW",
114 | num_segments=1,
115 | new_length=8,
116 | new_step=8,
117 | random_shift=True,
118 | modality='RGB',
119 | image_tmpl='img_{:05d}.jpg',
120 | img_scale=256,
121 | input_size=224,
122 | flip_ratio=0,
123 | resize_keep_ratio=True,
124 | oversample=None,
125 | test_mode=False,
126 | ),
127 | test=dict(
128 | type=dataset_type,
129 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt',
130 | img_prefix=data_root_val,
131 | img_norm_cfg=img_norm_cfg,
132 | input_format="NCTHW",
133 | num_segments=10,
134 | new_length=8,
135 | new_step=8,
136 | random_shift=True,
137 | modality='RGB',
138 | image_tmpl='img_{:05d}.jpg',
139 | img_scale=256,
140 | input_size=256,
141 | flip_ratio=0,
142 | resize_keep_ratio=True,
143 | oversample='three_crop',
144 | test_mode=True,
145 | ))
146 | # optimizer
147 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True)
148 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
149 | # learning policy
150 | lr_config = dict(
151 | policy='step',
152 | step=[75, 125])
153 |
154 | checkpoint_config = dict(interval=1)
155 | # workflow = [('train', 5), ('val', 1)]
156 | workflow = [('train', 1)]
157 | # yapf:disable
158 | log_config = dict(
159 | interval=20,
160 | hooks=[
161 | dict(type='TextLoggerHook'),
162 | # dict(type='TensorboardLoggerHook')
163 | ])
164 | # yapf:enable
165 | # runtime settings
166 | total_epochs = 150
167 | dist_params = dict(backend='nccl')
168 | log_level = 'INFO'
169 | load_from = None
170 | resume_from = None
171 |
--------------------------------------------------------------------------------
/config_files/kinetics400/tpn/r101f16s4.py:
--------------------------------------------------------------------------------
1 | # model settings
2 | model = dict(
3 | type='TSN3D',
4 | backbone=dict(
5 | type='ResNet_SlowFast',
6 | pretrained='modelzoo://resnet101',
7 | depth=101,
8 | num_stages=4,
9 | out_indices=[2, 3],
10 | frozen_stages=-1,
11 | inflate_freq=(0, 0, 1, 1),
12 | inflate_style='3x1x1',
13 | conv1_kernel_t=1,
14 | conv1_stride_t=1,
15 | pool1_kernel_t=1,
16 | pool1_stride_t=1,
17 | with_cp=True,
18 | bn_eval=False,
19 | partial_bn=False,
20 | style='pytorch'),
21 | necks=dict(
22 | type='TPN',
23 | in_channels=[1024, 2048],
24 | out_channels=1024,
25 | spatial_modulation_config=dict(
26 | inplanes=[1024, 2048],
27 | planes=2048,
28 | ),
29 | temporal_modulation_config=dict(
30 | scales=(8, 16),
31 | param=dict(
32 | inplanes=-1,
33 | planes=-1,
34 | downsample_scale=-1,
35 | )),
36 | upsampling_config=dict(
37 | scale=(1, 1, 1),
38 | ),
39 | downsampling_config=dict(
40 | scales=(2, 1, 1),
41 | param=dict(
42 | inplanes=-1,
43 | planes=-1,
44 | downsample_scale=-1,
45 | )),
46 | level_fusion_config=dict(
47 | in_channels=[1024, 1024],
48 | mid_channels=[1024, 1024],
49 | out_channels=2048,
50 | ds_scales=[(2, 1, 1), (1, 1, 1)],
51 | ),
52 | aux_head_config=dict(
53 | inplanes=-1,
54 | planes=400,
55 | loss_weight=0.5
56 | ),
57 | ),
58 | spatial_temporal_module=dict(
59 | type='SimpleSpatialTemporalModule',
60 | spatial_type='avg',
61 | temporal_size=1,
62 | spatial_size=7),
63 | segmental_consensus=dict(
64 | type='SimpleConsensus',
65 | consensus_type='avg'),
66 | cls_head=dict(
67 | type='ClsHead',
68 | with_avg_pool=False,
69 | temporal_feature_size=1,
70 | spatial_feature_size=1,
71 | dropout_ratio=0.5,
72 | in_channels=2048,
73 | num_classes=400))
74 | train_cfg = None
75 | test_cfg = None
76 | # dataset settings
77 | dataset_type = 'RawFramesDataset'
78 | data_root = ''
79 | data_root_val = ''
80 | img_norm_cfg = dict(
81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
82 | data = dict(
83 | videos_per_gpu=8,
84 | workers_per_gpu=8,
85 | train=dict(
86 | type=dataset_type,
87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt',
88 | img_prefix=data_root,
89 | img_norm_cfg=img_norm_cfg,
90 | input_format="NCTHW",
91 | num_segments=1,
92 | new_length=16,
93 | new_step=4,
94 | random_shift=True,
95 | modality='RGB',
96 | image_tmpl='img_{:05d}.jpg',
97 | img_scale=256,
98 | resize_keep_ratio=True,
99 | input_size=224,
100 | flip_ratio=0.5,
101 | oversample=None,
102 | resize_crop=True,
103 | color_jitter=True,
104 | color_space_aug=True,
105 | max_distort=0,
106 | test_mode=False,
107 | ),
108 | val=dict(
109 | type=dataset_type,
110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt',
111 | img_prefix=data_root_val,
112 | img_norm_cfg=img_norm_cfg,
113 | input_format="NCTHW",
114 | num_segments=1,
115 | new_length=16,
116 | new_step=4,
117 | random_shift=True,
118 | modality='RGB',
119 | image_tmpl='img_{:05d}.jpg',
120 | img_scale=256,
121 | input_size=224,
122 | flip_ratio=0,
123 | resize_keep_ratio=True,
124 | oversample=None,
125 | test_mode=False,
126 | ),
127 | test=dict(
128 | type=dataset_type,
129 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt',
130 | img_prefix=data_root_val,
131 | img_norm_cfg=img_norm_cfg,
132 | input_format="NCTHW",
133 | num_segments=10,
134 | new_length=16,
135 | new_step=4,
136 | random_shift=True,
137 | modality='RGB',
138 | image_tmpl='img_{:05d}.jpg',
139 | img_scale=256,
140 | input_size=256,
141 | flip_ratio=0,
142 | resize_keep_ratio=True,
143 | oversample='three_crop',
144 | test_mode=True,
145 | ))
146 | # optimizer
147 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True)
148 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
149 | # learning policy
150 | lr_config = dict(
151 | policy='step',
152 | step=[75, 125])
153 |
154 | checkpoint_config = dict(interval=1)
155 | # workflow = [('train', 5), ('val', 1)]
156 | workflow = [('train', 1)]
157 | # yapf:disable
158 | log_config = dict(
159 | interval=20,
160 | hooks=[
161 | dict(type='TextLoggerHook'),
162 | # dict(type='TensorboardLoggerHook')
163 | ])
164 | # yapf:enable
165 | # runtime settings
166 | total_epochs = 150
167 | dist_params = dict(backend='nccl')
168 | log_level = 'INFO'
169 | load_from = None
170 | resume_from = None
171 |
--------------------------------------------------------------------------------
/config_files/kinetics400/tpn/r101f8s8.py:
--------------------------------------------------------------------------------
1 | # model settings
2 | model = dict(
3 | type='TSN3D',
4 | backbone=dict(
5 | type='ResNet_SlowFast',
6 | pretrained='modelzoo://resnet101',
7 | depth=101,
8 | num_stages=4,
9 | out_indices=[2, 3],
10 | frozen_stages=-1,
11 | inflate_freq=(0, 0, 1, 1),
12 | inflate_style='3x1x1',
13 | conv1_kernel_t=1,
14 | conv1_stride_t=1,
15 | pool1_kernel_t=1,
16 | pool1_stride_t=1,
17 | with_cp=False,
18 | bn_eval=False,
19 | partial_bn=False,
20 | style='pytorch'),
21 | necks=dict(
22 | type='TPN',
23 | in_channels=[1024, 2048],
24 | out_channels=1024,
25 | spatial_modulation_config=dict(
26 | inplanes=[1024, 2048],
27 | planes=2048,
28 | ),
29 | temporal_modulation_config=dict(
30 | scales=(4, 8),
31 | param=dict(
32 | inplanes=-1,
33 | planes=-1,
34 | downsample_scale=-1,
35 | )),
36 | upsampling_config=dict(
37 | scale=(1, 1, 1),
38 | ),
39 | downsampling_config=dict(
40 | scales=(2, 1, 1),
41 | param=dict(
42 | inplanes=-1,
43 | planes=-1,
44 | downsample_scale=-1,
45 | )),
46 | level_fusion_config=dict(
47 | in_channels=[1024, 1024],
48 | mid_channels=[1024, 1024],
49 | out_channels=2048,
50 | ds_scales=[(2, 1, 1), (1, 1, 1)],
51 | ),
52 | aux_head_config=dict(
53 | inplanes=-1,
54 | planes=400,
55 | loss_weight=0.5
56 | ),
57 | ),
58 | spatial_temporal_module=dict(
59 | type='SimpleSpatialTemporalModule',
60 | spatial_type='avg',
61 | temporal_size=1,
62 | spatial_size=7),
63 | segmental_consensus=dict(
64 | type='SimpleConsensus',
65 | consensus_type='avg'),
66 | cls_head=dict(
67 | type='ClsHead',
68 | with_avg_pool=False,
69 | temporal_feature_size=1,
70 | spatial_feature_size=1,
71 | dropout_ratio=0.5,
72 | in_channels=2048,
73 | num_classes=400))
74 | train_cfg = None
75 | test_cfg = None
76 | # dataset settings
77 | dataset_type = 'RawFramesDataset'
78 | data_root = ''
79 | data_root_val = ''
80 | img_norm_cfg = dict(
81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
82 | data = dict(
83 | videos_per_gpu=8,
84 | workers_per_gpu=8,
85 | train=dict(
86 | type=dataset_type,
87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt',
88 | img_prefix=data_root,
89 | img_norm_cfg=img_norm_cfg,
90 | input_format="NCTHW",
91 | num_segments=1,
92 | new_length=8,
93 | new_step=8,
94 | random_shift=True,
95 | modality='RGB',
96 | image_tmpl='img_{:05d}.jpg',
97 | img_scale=256,
98 | resize_keep_ratio=True,
99 | input_size=224,
100 | flip_ratio=0.5,
101 | oversample=None,
102 | resize_crop=True,
103 | color_jitter=True,
104 | color_space_aug=True,
105 | max_distort=0,
106 | test_mode=False,
107 | ),
108 | val=dict(
109 | type=dataset_type,
110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt',
111 | img_prefix=data_root_val,
112 | img_norm_cfg=img_norm_cfg,
113 | input_format="NCTHW",
114 | num_segments=1,
115 | new_length=8,
116 | new_step=8,
117 | random_shift=True,
118 | modality='RGB',
119 | image_tmpl='img_{:05d}.jpg',
120 | img_scale=256,
121 | input_size=224,
122 | flip_ratio=0,
123 | resize_keep_ratio=True,
124 | oversample=None,
125 | test_mode=False,
126 | ),
127 | test=dict(
128 | type=dataset_type,
129 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt',
130 | img_prefix=data_root_val,
131 | img_norm_cfg=img_norm_cfg,
132 | input_format="NCTHW",
133 | num_segments=10,
134 | new_length=8,
135 | new_step=8,
136 | random_shift=True,
137 | modality='RGB',
138 | image_tmpl='img_{:05d}.jpg',
139 | img_scale=256,
140 | input_size=256,
141 | flip_ratio=0,
142 | resize_keep_ratio=True,
143 | oversample='three_crop',
144 | test_mode=True,
145 | ))
146 | # optimizer
147 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True)
148 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
149 | # learning policy
150 | lr_config = dict(
151 | policy='step',
152 | step=[75, 125])
153 |
154 | checkpoint_config = dict(interval=1)
155 | # workflow = [('train', 5), ('val', 1)]
156 | workflow = [('train', 1)]
157 | # yapf:disable
158 | log_config = dict(
159 | interval=20,
160 | hooks=[
161 | dict(type='TextLoggerHook'),
162 | # dict(type='TensorboardLoggerHook')
163 | ])
164 | # yapf:enable
165 | # runtime settings
166 | total_epochs = 150
167 | dist_params = dict(backend='nccl')
168 | log_level = 'INFO'
169 | load_from = None
170 | resume_from = None
171 |
--------------------------------------------------------------------------------
/config_files/kinetics400/tpn/r50f16s4.py:
--------------------------------------------------------------------------------
1 | # model settings
2 | model = dict(
3 | type='TSN3D',
4 | backbone=dict(
5 | type='ResNet_SlowFast',
6 | pretrained='modelzoo://resnet50',
7 | depth=50,
8 | num_stages=4,
9 | out_indices=[2, 3],
10 | frozen_stages=-1,
11 | inflate_freq=(0, 0, 1, 1),
12 | inflate_style='3x1x1',
13 | conv1_kernel_t=1,
14 | conv1_stride_t=1,
15 | pool1_kernel_t=1,
16 | pool1_stride_t=1,
17 | with_cp=True,
18 | bn_eval=False,
19 | partial_bn=False,
20 | style='pytorch'),
21 | necks=dict(
22 | type='TPN',
23 | in_channels=[1024, 2048],
24 | out_channels=1024,
25 | spatial_modulation_config=dict(
26 | inplanes=[1024, 2048],
27 | planes=2048,
28 | ),
29 | temporal_modulation_config=dict(
30 | scales=(16, 16),
31 | param=dict(
32 | inplanes=-1,
33 | planes=-1,
34 | downsample_scale=-1,
35 | )),
36 | upsampling_config=dict(
37 | scale=(1, 1, 1),
38 | ),
39 | downsampling_config=dict(
40 | scales=(1, 1, 1),
41 | param=dict(
42 | inplanes=-1,
43 | planes=-1,
44 | downsample_scale=-1,
45 | )),
46 | level_fusion_config=dict(
47 | in_channels=[1024, 1024],
48 | mid_channels=[1024, 1024],
49 | out_channels=2048,
50 | ds_scales=[(1, 1, 1), (1, 1, 1)],
51 | ),
52 | aux_head_config=dict(
53 | inplanes=-1,
54 | planes=400,
55 | loss_weight=0.5
56 | ),
57 | ),
58 | spatial_temporal_module=dict(
59 | type='SimpleSpatialTemporalModule',
60 | spatial_type='avg',
61 | temporal_size=1,
62 | spatial_size=7),
63 | segmental_consensus=dict(
64 | type='SimpleConsensus',
65 | consensus_type='avg'),
66 | cls_head=dict(
67 | type='ClsHead',
68 | with_avg_pool=False,
69 | temporal_feature_size=1,
70 | spatial_feature_size=1,
71 | dropout_ratio=0.5,
72 | in_channels=2048,
73 | num_classes=400))
74 | train_cfg = None
75 | test_cfg = None
76 | # dataset settings
77 | dataset_type = 'RawFramesDataset'
78 | data_root = ''
79 | data_root_val = ''
80 | img_norm_cfg = dict(
81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
82 | data = dict(
83 | videos_per_gpu=8,
84 | workers_per_gpu=8,
85 | train=dict(
86 | type=dataset_type,
87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt',
88 | img_prefix=data_root,
89 | img_norm_cfg=img_norm_cfg,
90 | input_format="NCTHW",
91 | num_segments=1,
92 | new_length=16,
93 | new_step=4,
94 | random_shift=True,
95 | modality='RGB',
96 | image_tmpl='img_{:05d}.jpg',
97 | img_scale=256,
98 | resize_keep_ratio=True,
99 | input_size=224,
100 | flip_ratio=0.5,
101 | oversample=None,
102 | resize_crop=True,
103 | color_jitter=True,
104 | color_space_aug=True,
105 | max_distort=0,
106 | test_mode=False,
107 | ),
108 | val=dict(
109 | type=dataset_type,
110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt',
111 | img_prefix=data_root_val,
112 | img_norm_cfg=img_norm_cfg,
113 | input_format="NCTHW",
114 | num_segments=1,
115 | new_length=16,
116 | new_step=4,
117 | random_shift=True,
118 | modality='RGB',
119 | image_tmpl='img_{:05d}.jpg',
120 | img_scale=256,
121 | input_size=224,
122 | flip_ratio=0,
123 | resize_keep_ratio=True,
124 | oversample=None,
125 | test_mode=False,
126 | ),
127 | test=dict(
128 | type=dataset_type,
129 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt',
130 | img_prefix=data_root_val,
131 | img_norm_cfg=img_norm_cfg,
132 | input_format="NCTHW",
133 | num_segments=10,
134 | new_length=16,
135 | new_step=4,
136 | random_shift=True,
137 | modality='RGB',
138 | image_tmpl='img_{:05d}.jpg',
139 | img_scale=256,
140 | input_size=256,
141 | flip_ratio=0,
142 | resize_keep_ratio=True,
143 | oversample='three_crop',
144 | test_mode=True,
145 | ))
146 | # optimizer
147 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True)
148 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
149 | # learning policy
150 | lr_config = dict(
151 | policy='step',
152 | step=[75, 125])
153 |
154 | checkpoint_config = dict(interval=1)
155 | # workflow = [('train', 5), ('val', 1)]
156 | workflow = [('train', 1)]
157 | # yapf:disable
158 | log_config = dict(
159 | interval=20,
160 | hooks=[
161 | dict(type='TextLoggerHook'),
162 | # dict(type='TensorboardLoggerHook')
163 | ])
164 | # yapf:enable
165 | # runtime settings
166 | total_epochs = 150
167 | dist_params = dict(backend='nccl')
168 | log_level = 'INFO'
169 | load_from = None
170 | resume_from = None
171 |
--------------------------------------------------------------------------------
/config_files/kinetics400/tpn/r101f32s2.py:
--------------------------------------------------------------------------------
1 | # model settings
2 | model = dict(
3 | type='TSN3D',
4 | backbone=dict(
5 | type='ResNet_SlowFast',
6 | pretrained='modelzoo://resnet101',
7 | depth=101,
8 | num_stages=4,
9 | out_indices=[2, 3],
10 | frozen_stages=-1,
11 | inflate_freq=(0, 0, 1, 1),
12 | inflate_style='3x1x1',
13 | conv1_kernel_t=1,
14 | conv1_stride_t=1,
15 | pool1_kernel_t=1,
16 | pool1_stride_t=1,
17 | with_cp=True,
18 | bn_eval=False,
19 | partial_bn=False,
20 | style='pytorch'),
21 | necks=dict(
22 | type='TPN',
23 | in_channels=[1024, 2048],
24 | out_channels=1024,
25 | spatial_modulation_config=dict(
26 | inplanes=[1024, 2048],
27 | planes=2048,
28 | ),
29 | temporal_modulation_config=dict(
30 | scales=(16, 32),
31 | param=dict(
32 | inplanes=-1,
33 | planes=-1,
34 | downsample_scale=-1,
35 | )),
36 | upsampling_config=dict(
37 | scale=(1, 1, 1),
38 | ),
39 | downsampling_config=dict(
40 | scales=(2, 1, 1),
41 | param=dict(
42 | inplanes=-1,
43 | planes=-1,
44 | downsample_scale=-1,
45 | )),
46 | level_fusion_config=dict(
47 | in_channels=[1024, 1024],
48 | mid_channels=[1024, 1024],
49 | out_channels=2048,
50 | ds_scales=[(2, 1, 1), (1, 1, 1)],
51 | ),
52 | aux_head_config=dict(
53 | inplanes=-1,
54 | planes=400,
55 | loss_weight=0.5
56 | ),
57 | ),
58 | spatial_temporal_module=dict(
59 | type='SimpleSpatialTemporalModule',
60 | spatial_type='avg',
61 | temporal_size=1,
62 | spatial_size=7),
63 | segmental_consensus=dict(
64 | type='SimpleConsensus',
65 | consensus_type='avg'),
66 | cls_head=dict(
67 | type='ClsHead',
68 | with_avg_pool=False,
69 | temporal_feature_size=1,
70 | spatial_feature_size=1,
71 | dropout_ratio=0.5,
72 | in_channels=2048,
73 | num_classes=400))
74 | train_cfg = None
75 | test_cfg = None
76 | # dataset settings
77 | dataset_type = 'RawFramesDataset'
78 | data_root = ''
79 | data_root_val = ''
80 | img_norm_cfg = dict(
81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
82 | data = dict(
83 | videos_per_gpu=8,
84 | workers_per_gpu=8,
85 | train=dict(
86 | type=dataset_type,
87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt',
88 | img_prefix=data_root,
89 | img_norm_cfg=img_norm_cfg,
90 | input_format="NCTHW",
91 | num_segments=1,
92 | new_length=32,
93 | new_step=2,
94 | random_shift=True,
95 | modality='RGB',
96 | image_tmpl='img_{:05d}.jpg',
97 | img_scale=256,
98 | resize_keep_ratio=True,
99 | input_size=224,
100 | flip_ratio=0.5,
101 | oversample=None,
102 | resize_crop=True,
103 | color_jitter=True,
104 | color_space_aug=True,
105 | max_distort=0,
106 | test_mode=False,
107 | ),
108 | val=dict(
109 | type=dataset_type,
110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt',
111 | img_prefix=data_root_val,
112 | img_norm_cfg=img_norm_cfg,
113 | input_format="NCTHW",
114 | num_segments=1,
115 | new_length=32,
116 | new_step=2,
117 | random_shift=True,
118 | modality='RGB',
119 | image_tmpl='img_{:05d}.jpg',
120 | img_scale=256,
121 | input_size=224,
122 | flip_ratio=0,
123 | resize_keep_ratio=True,
124 | oversample=None,
125 | test_mode=False,
126 | ),
127 | test=dict(
128 | type=dataset_type,
129 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt',
130 | img_prefix=data_root_val,
131 | img_norm_cfg=img_norm_cfg,
132 | input_format="NCTHW",
133 | num_segments=10,
134 | new_length=32,
135 | new_step=2,
136 | random_shift=True,
137 | modality='RGB',
138 | image_tmpl='img_{:05d}.jpg',
139 | img_scale=256,
140 | input_size=256,
141 | flip_ratio=0,
142 | resize_keep_ratio=True,
143 | oversample='three_crop',
144 | test_mode=True,
145 | ))
146 | # optimizer
147 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True)
148 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
149 | # learning policy
150 | lr_config = dict(
151 | policy='step',
152 | step=[75, 125])
153 |
154 | checkpoint_config = dict(interval=1)
155 | # workflow = [('train', 5), ('val', 1)]
156 | workflow = [('train', 1)]
157 | # yapf:disable
158 | log_config = dict(
159 | interval=20,
160 | hooks=[
161 | dict(type='TextLoggerHook'),
162 | # dict(type='TensorboardLoggerHook')
163 | ])
164 | # yapf:enable
165 | # runtime settings
166 | total_epochs = 150
167 | dist_params = dict(backend='nccl')
168 | log_level = 'INFO'
169 | load_from = None
170 | resume_from = None
171 |
--------------------------------------------------------------------------------
/config_files/kinetics400/tpn/r50f32s2.py:
--------------------------------------------------------------------------------
1 | # model settings
2 | model = dict(
3 | type='TSN3D',
4 | backbone=dict(
5 | type='ResNet_SlowFast',
6 | pretrained='modelzoo://resnet50',
7 | depth=50,
8 | num_stages=4,
9 | out_indices=[2, 3],
10 | frozen_stages=-1,
11 | inflate_freq=(0, 0, 1, 1),
12 | inflate_style='3x1x1',
13 | conv1_kernel_t=1,
14 | conv1_stride_t=1,
15 | pool1_kernel_t=1,
16 | pool1_stride_t=1,
17 | with_cp=True,
18 | bn_eval=False,
19 | partial_bn=False,
20 | style='pytorch'),
21 | necks=dict(
22 | type='TPN',
23 | in_channels=[1024, 2048],
24 | out_channels=1024,
25 | spatial_modulation_config=dict(
26 | inplanes=[1024, 2048],
27 | planes=2048,
28 | ),
29 | temporal_modulation_config=dict(
30 | scales=(32, 32),
31 | param=dict(
32 | inplanes=-1,
33 | planes=-1,
34 | downsample_scale=-1,
35 | )),
36 | upsampling_config=dict(
37 | scale=(1, 1, 1),
38 | ),
39 | downsampling_config=dict(
40 | scales=(1, 1, 1),
41 | param=dict(
42 | inplanes=-1,
43 | planes=-1,
44 | downsample_scale=-1,
45 | )),
46 | level_fusion_config=dict(
47 | in_channels=[1024, 1024],
48 | mid_channels=[1024, 1024],
49 | out_channels=2048,
50 | ds_scales=[(1, 1, 1), (1, 1, 1)],
51 | ),
52 | aux_head_config=dict(
53 | inplanes=-1,
54 | planes=400,
55 | loss_weight=0.5
56 | ),
57 | ),
58 | spatial_temporal_module=dict(
59 | type='SimpleSpatialTemporalModule',
60 | spatial_type='avg',
61 | temporal_size=1,
62 | spatial_size=7),
63 | segmental_consensus=dict(
64 | type='SimpleConsensus',
65 | consensus_type='avg'),
66 | cls_head=dict(
67 | type='ClsHead',
68 | with_avg_pool=False,
69 | temporal_feature_size=1,
70 | spatial_feature_size=1,
71 | dropout_ratio=0.5,
72 | in_channels=2048,
73 | num_classes=400))
74 | train_cfg = None
75 | test_cfg = None
76 | # dataset settings
77 | dataset_type = 'RawFramesDataset'
78 | data_root = ''
79 | data_root_val = ''
80 | img_norm_cfg = dict(
81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
82 | data = dict(
83 | videos_per_gpu=8,
84 | workers_per_gpu=8,
85 | train=dict(
86 | type=dataset_type,
87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt',
88 | img_prefix=data_root,
89 | img_norm_cfg=img_norm_cfg,
90 | input_format="NCTHW",
91 | num_segments=1,
92 | new_length=32,
93 | new_step=2,
94 | random_shift=True,
95 | modality='RGB',
96 | image_tmpl='img_{:05d}.jpg',
97 | img_scale=256,
98 | resize_keep_ratio=True,
99 | input_size=224,
100 | flip_ratio=0.5,
101 | oversample=None,
102 | resize_crop=True,
103 | color_jitter=True,
104 | color_space_aug=True,
105 | max_distort=0,
106 | test_mode=False,
107 | ),
108 | val=dict(
109 | type=dataset_type,
110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt',
111 | img_prefix=data_root_val,
112 | img_norm_cfg=img_norm_cfg,
113 | input_format="NCTHW",
114 | num_segments=1,
115 | new_length=32,
116 | new_step=2,
117 | random_shift=True,
118 | modality='RGB',
119 | image_tmpl='img_{:05d}.jpg',
120 | img_scale=256,
121 | input_size=224,
122 | div_255=False,
123 | flip_ratio=0,
124 | resize_keep_ratio=True,
125 | oversample=None,
126 | test_mode=False,
127 | ),
128 | test=dict(
129 | type=dataset_type,
130 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt',
131 | img_prefix=data_root_val,
132 | img_norm_cfg=img_norm_cfg,
133 | input_format="NCTHW",
134 | num_segments=10,
135 | new_length=32,
136 | new_step=2,
137 | random_shift=True,
138 | modality='RGB',
139 | image_tmpl='img_{:05d}.jpg',
140 | img_scale=256,
141 | input_size=256,
142 | flip_ratio=0,
143 | resize_keep_ratio=True,
144 | oversample='three_crop',
145 | test_mode=True,
146 | ))
147 | # optimizer
148 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True)
149 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
150 | # learning policy
151 | lr_config = dict(
152 | policy='step',
153 | step=[75, 125])
154 |
155 | checkpoint_config = dict(interval=1)
156 | # workflow = [('train', 5), ('val', 1)]
157 | workflow = [('train', 1)]
158 | # yapf:disable
159 | log_config = dict(
160 | interval=20,
161 | hooks=[
162 | dict(type='TextLoggerHook'),
163 | # dict(type='TensorboardLoggerHook')
164 | ])
165 | # yapf:enable
166 | # runtime settings
167 | total_epochs = 150
168 | dist_params = dict(backend='nccl')
169 | log_level = 'INFO'
170 | load_from = None
171 | resume_from = None
172 |
--------------------------------------------------------------------------------
/mmaction/models/recognizers/TSN3D.py:
--------------------------------------------------------------------------------
1 | from .base import BaseRecognizer
2 | from .. import builder
3 | from ..registry import RECOGNIZERS
4 |
5 | import torch
6 |
7 |
8 | @RECOGNIZERS.register_module
9 | class TSN3D(BaseRecognizer):
10 |
11 | def __init__(self,
12 | backbone,
13 | necks=None,
14 | spatial_temporal_module=None,
15 | segmental_consensus=None,
16 | fcn_testing=False,
17 | flip=False,
18 | cls_head=None,
19 | train_cfg=None,
20 | test_cfg=None):
21 |
22 | super(TSN3D, self).__init__()
23 | self.backbone = builder.build_backbone(backbone)
24 |
25 | if necks is not None:
26 | self.necks = builder.build_neck(necks)
27 | else:
28 | self.necks = None
29 |
30 | if spatial_temporal_module is not None:
31 | self.spatial_temporal_module = builder.build_spatial_temporal_module(
32 | spatial_temporal_module)
33 | else:
34 | raise NotImplementedError
35 |
36 | if segmental_consensus is not None:
37 | self.segmental_consensus = builder.build_segmental_consensus(
38 | segmental_consensus)
39 | else:
40 | raise NotImplementedError
41 |
42 | if cls_head is not None:
43 | self.cls_head = builder.build_head(cls_head)
44 | else:
45 | raise NotImplementedError
46 |
47 | self.train_cfg = train_cfg
48 | self.test_cfg = test_cfg
49 | self.fcn_testing = fcn_testing
50 | self.flip = flip
51 | self.init_weights()
52 |
53 | @property
54 | def with_spatial_temporal_module(self):
55 | return hasattr(self, 'spatial_temporal_module') and self.spatial_temporal_module is not None
56 |
57 | @property
58 | def with_segmental_consensus(self):
59 | return hasattr(self, 'segmental_consensus') and self.segmental_consensus is not None
60 |
61 | @property
62 | def with_cls_head(self):
63 | return hasattr(self, 'cls_head') and self.cls_head is not None
64 |
65 | def init_weights(self):
66 | super(TSN3D, self).init_weights()
67 | self.backbone.init_weights()
68 |
69 | if self.with_spatial_temporal_module:
70 | self.spatial_temporal_module.init_weights()
71 |
72 | if self.with_segmental_consensus:
73 | self.segmental_consensus.init_weights()
74 |
75 | if self.with_cls_head:
76 | self.cls_head.init_weights()
77 |
78 | if self.necks is not None:
79 | self.necks.init_weights()
80 |
81 | def extract_feat(self, img_group):
82 | x = self.backbone(img_group)
83 | return x
84 |
85 | def forward_train(self,
86 | num_modalities,
87 | img_meta,
88 | gt_label,
89 | **kwargs):
90 | assert num_modalities == 1
91 | img_group = kwargs['img_group_0']
92 |
93 | bs = img_group.shape[0]
94 | img_group = img_group.reshape((-1,) + img_group.shape[2:])
95 | num_seg = img_group.shape[0] // bs
96 |
97 | x = self.extract_feat(img_group)
98 |
99 | if self.necks is not None:
100 | x, aux_losses = self.necks(x, gt_label.squeeze())
101 |
102 | if self.with_spatial_temporal_module:
103 | x = self.spatial_temporal_module(x)
104 | if self.with_segmental_consensus:
105 | x = x.reshape((-1, num_seg) + x.shape[1:])
106 | x = self.segmental_consensus(x)
107 | x = x.squeeze(1)
108 | losses = dict()
109 | if self.with_cls_head:
110 | cls_score = self.cls_head(x)
111 | gt_label = gt_label.squeeze()
112 | loss_cls = self.cls_head.loss(cls_score, gt_label)
113 | losses.update(loss_cls)
114 | if self.necks is not None:
115 | if aux_losses is not None:
116 | losses.update(aux_losses)
117 |
118 | return losses
119 |
120 | def forward_test(self,
121 | num_modalities,
122 | img_meta,
123 | **kwargs):
124 | assert num_modalities == 1
125 | img_group = kwargs['img_group_0']
126 |
127 | bs = img_group.shape[0]
128 | img_group = img_group.reshape((-1,) + img_group.shape[2:])
129 | num_seg = img_group.shape[0] // bs
130 |
131 | if self.flip:
132 |             img_group = torch.flip(img_group, [-1])  # flip the input frames (not extracted features) for flip-augmented testing
133 | x = self.extract_feat(img_group)
134 | if self.necks is not None:
135 | x, _ = self.necks(x)
136 | if self.fcn_testing:
137 | if self.with_cls_head:
138 | x = self.cls_head(x)
139 | prob1 = torch.nn.functional.softmax(x.mean([2, 3, 4]), 1).mean(0, keepdim=True).detach().cpu().numpy()
140 | return prob1
141 |
142 | if self.with_spatial_temporal_module:
143 | x = self.spatial_temporal_module(x)
144 | if self.with_segmental_consensus:
145 | x = x.reshape((-1, num_seg) + x.shape[1:])
146 | x = self.segmental_consensus(x)
147 | x = x.squeeze(1)
148 | if self.with_cls_head:
149 | x = self.cls_head(x)
150 |
151 | return x.cpu().numpy()
152 |
--------------------------------------------------------------------------------
/mmaction/models/tenons/spatial_temporal_modules/non_local.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from mmcv.cnn import constant_init, kaiming_init
5 | from ...registry import SPATIAL_TEMPORAL_MODULES
6 |
7 |
8 | @SPATIAL_TEMPORAL_MODULES.register_module
9 | class NonLocalModule(nn.Module):
10 | def __init__(self, in_channels=1024, nonlocal_type="gaussian", dim=3, embed=True, embed_dim=None, sub_sample=True,
11 | use_bn=True):
12 | super(NonLocalModule, self).__init__()
13 |
14 | assert nonlocal_type in ['gaussian', 'dot', 'concat']
15 | assert dim == 2 or dim == 3
16 | self.nonlocal_type = nonlocal_type
17 | self.embed = embed
18 | self.embed_dim = embed_dim if embed_dim is not None else in_channels // 2
19 | self.sub_sample = sub_sample
20 | self.use_bn = use_bn
21 |
22 | if self.embed:
23 | if dim == 2:
24 | self.theta = nn.Conv2d(in_channels, self.embed_dim, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
25 | self.phi = nn.Conv2d(in_channels, self.embed_dim, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
26 | self.g = nn.Conv2d(in_channels, self.embed_dim, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
27 | elif dim == 3:
28 | self.theta = nn.Conv3d(in_channels, self.embed_dim, kernel_size=(1, 1, 1), stride=(1, 1, 1),
29 | padding=(0, 0, 0))
30 | self.phi = nn.Conv3d(in_channels, self.embed_dim, kernel_size=(1, 1, 1), stride=(1, 1, 1),
31 | padding=(0, 0, 0))
32 | self.g = nn.Conv3d(in_channels, self.embed_dim, kernel_size=(1, 1, 1), stride=(1, 1, 1),
33 | padding=(0, 0, 0))
34 |
35 | if self.nonlocal_type == 'gaussian':
36 | self.softmax = nn.Softmax(dim=2)
37 | elif self.nonlocal_type == 'concat':
38 | if dim == 2:
39 | self.concat_proj = nn.Sequential(
40 | nn.Conv2d(self.embed_dim * 2, 1, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)),
41 | nn.ReLU())
42 | elif dim == 3:
43 | self.concat_proj = nn.Sequential(
44 | nn.Conv3d(self.embed_dim * 2, 1, kernel_size=(1, 1, 1), stride=(1, 1, 1), padding=(0, 0, 0)),
45 | nn.ReLU())
46 |
47 | if sub_sample:
48 | if dim == 2:
49 | self.max_pool = nn.MaxPool2d(kernel_size=(2, 2))
50 | elif dim == 3:
51 | self.max_pool = nn.MaxPool3d(kernel_size=(1, 2, 2))
52 | self.g = nn.Sequential(self.max_pool, self.g)
53 | self.phi = nn.Sequential(self.max_pool, self.phi)
54 |
55 | if dim == 2:
56 | self.W = nn.Conv2d(self.embed_dim, in_channels, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
57 | elif dim == 3:
58 | self.W = nn.Conv3d(self.embed_dim, in_channels, kernel_size=(1, 1, 1), stride=(1, 1, 1), padding=(0, 0, 0))
59 |
60 | if use_bn:
61 | if dim == 2:
62 | self.bn = nn.BatchNorm2d(in_channels, eps=1e-05, momentum=0.9, affine=True)
63 | elif dim == 3:
64 | self.bn = nn.BatchNorm3d(in_channels, eps=1e-05, momentum=0.9, affine=True)
65 | self.W = nn.Sequential(self.W, self.bn)
66 |
67 | def init_weights(self):
68 | for m in self.modules():
69 | if isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv3d):
70 | kaiming_init(m)
71 | elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d):
72 | constant_init(m, 0)
73 |
74 | def forward(self, input):
75 | if self.embed:
76 | theta = self.theta(input)
77 | phi = self.phi(input)
78 | g = self.g(input)
79 | else:
80 | theta = input
81 | phi = input
82 | g = input
83 |
84 | if self.nonlocal_type in ['gaussian', 'dot']:
85 | # reshape [BxC'xTxHxW] to [BxC'x(T)HW]
86 | theta = theta.reshape(theta.shape[:2] + (-1,))
87 | phi = phi.reshape(theta.shape[:2] + (-1,))
88 | g = g.reshape(theta.shape[:2] + (-1,))
89 | theta_phi = torch.matmul(theta.transpose(1, 2), phi)
90 | if self.nonlocal_type == 'gaussian':
91 | p = self.softmax(theta_phi)
92 | elif self.nonlocal_type == 'dot':
93 | N = theta_phi.size(-1)
94 | p = theta_phi / N
95 |         elif self.nonlocal_type == 'concat':
96 | # reshape [BxC'xTxHxW] to [BxC'x(T)HWx1]
97 | theta = theta.reshape(theta.shape[:2] + (-1, 1))
98 | # reshape [BxC'xTxHxW] to [BxC'x1x(T)HW]
99 | phi = phi.reshape(theta.shape[:2] + (1, -1))
100 | theta_x = theta.repeat(1, 1, 1, phi.size(3))
101 | phi_x = phi.repeat(1, 1, theta.size(2), 1)
102 | theta_phi = torch.cat([theta_x, phi_x], dim=1)
103 | theta_phi = self.concat_proj(theta_phi)
104 | theta_phi = theta_phi.squeeze()
105 | N = theta_phi.size(-1)
106 | p = theta_phi / N
107 | else:
108 |             raise NotImplementedError
109 |
110 |         # [BxC'xM] x [BxMxN] => [BxC'xN], where M are the (sub-sampled) phi/g positions and N the full (T)HW theta positions
111 | y = torch.matmul(g, p.transpose(1, 2))
112 | y = y.reshape(y.shape[:2] + input.shape[2:])
113 | z = self.W(y) + input
114 |
115 | return z
116 |
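# Illustrative smoke test (not part of the original module, added as a hedged
# example): a 3D non-local block on a (batch, channels, T, H, W) feature map
# preserves the input shape.
if __name__ == '__main__':
    block = NonLocalModule(in_channels=64, nonlocal_type='gaussian', dim=3)
    block.init_weights()
    feat = torch.randn(2, 64, 4, 14, 14)
    out = block(feat)
    assert out.shape == feat.shape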
--------------------------------------------------------------------------------
/MODELZOO.md:
--------------------------------------------------------------------------------
1 | # Model Zoo
2 |
3 | ## Pretrained Models
4 | All pretrained models can be downloaded from [Google Drive](https://drive.google.com/drive/folders/1UnqZ48doF0UTYjH6iZCXQW3HlDocbBxl). After downloading, put them into `ckpt/`.
5 |
6 | ## Main Results
7 | We report results on Kinetics-400 and on Something-Something V1 and V2. All numbers, for both the baselines and TPN, are obtained via fully-convolutional testing.
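
For reference, fully-convolutional testing is controlled by the `fcn_testing` (and `flip`) arguments of the recognizers in this repository (see `mmaction/models/recognizers/TSN3D.py`). Below is a minimal sketch of how these flags could be switched on in the `model` section of a config; whether your evaluation entry point passes them exactly this way is an assumption:

```python
# sketch only: enable fully-convolutional (and flip-augmented) testing
model = dict(
    type='TSN3D',
    fcn_testing=True,  # average dense per-position class scores instead of pooling a single crop
    flip=True,         # evaluate on horizontally flipped frames
    # backbone / necks / spatial_temporal_module / cls_head as in the config files referenced below
)
```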
8 |
9 | ### Kinetics-400
10 | Since the set of available Kinetics-400 videos changes slightly over time (which might lead to a performance drop), we report all results on our own copy of the dataset. Our data contains 240403 training videos and 19769 validation videos, all rescaled to 240x320 resolution. Note that the trimming of the [Non-Local](https://github.com/facebookresearch/video-nonlocal-net/blob/master/DATASET.md) data and the resolution of the [MMAction](https://github.com/open-mmlab/mmaction/blob/master/MODEL_ZOO.md) data differ from ours, but the improvements brought by TPN are consistent. To ensure reproducibility, we will find a proper way to release our validation set. All the following Kinetics-400 results also use flip-augmented testing (~0.1% fluctuation). We sample F frames with a temporal stride of S frames, denoted F x S.
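
For example, the 8 x 8 setting samples 8 frames that are 8 frames apart; in the config files this maps to `new_length=8` (F) and `new_step=8` (S) under the dataset settings (e.g. `config_files/kinetics400/tpn/r101f8s8.py`).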
11 |
12 |
13 | | Model | Frames | TPN | Top-1 | Weights | Config |
14 | | :---: | :------: | :--------: | :------: | :------: | :------ |
15 | |R50 | 8 x 8 | - | 74.9 | [link](https://drive.google.com/open?id=1uKHvZsY_heFHTBl6RXo02I7-W_aLBhFI) | config_files/kinetics400/baseline/r50f8s8.py |
16 | |R50 | 8 x 8 | Yes | 76.1 | [link](https://drive.google.com/open?id=1KoISwdKDlfzZdEsLItygcvPGkKNwWyR-) | config_files/kinetics400/tpn/r50f8s8.py |
17 | |R50 | 16 x 4 | - | 76.1 | [link](https://drive.google.com/open?id=1Qgck89mUVs9gyUzalbYJPfJPwQEPbyI9) | config_files/kinetics400/baseline/r50f16s4.py |
18 | |R50 | 16 x 4 | Yes | 77.3 | [link](https://drive.google.com/open?id=1TY39uBR-ckUw3aiabeFLNpR9uPSxt--H) | config_files/kinetics400/tpn/r50f16s4.py |
19 | |R50 | 32 x 2 | - | 75.7 | [link](https://drive.google.com/open?id=1oJ1sTzMeLPXHtnutJAAD8gWfm0b3NYpi) | config_files/kinetics400/baseline/r50f32s2.py |
20 | |R50 | 32 x 2 | Yes | 77.7 | [link](https://drive.google.com/open?id=1TjeqcTJ2tReDz4VnLR8ajSHySre9sZDd) | config_files/kinetics400/tpn/r50f32s2.py |
21 | |R101 | 8 x 8 | - | 76.0 | [link](https://drive.google.com/open?id=1dqLWiI3DFHAPIzGtEY_jfI66nthw2GEX) | config_files/kinetics400/baseline/r101f8s8.py |
22 | |R101 | 8 x 8 | Yes | 77.2 | [link](https://drive.google.com/open?id=1B4Vsld-JzQe4QmXeZHd0TolMPNyZypXI) | config_files/kinetics400/tpn/r101f8s8.py |
23 | |R101 | 16 x 4 | - | 77.0 | [link](https://drive.google.com/open?id=1tj2Y0OChKW7RoElXXmBeU63dph40kEyJ) | config_files/kinetics400/baseline/r101f16s4.py |
24 | |R101 | 16 x 4 | Yes | 78.1 | [link](https://drive.google.com/open?id=1mT4kuaYuAGA-Zjagc56vByMQdvx0bE-H) | config_files/kinetics400/tpn/r101f16s4.py |
25 | |R101 | 32 x 2 | - | 77.4 | [link](https://drive.google.com/open?id=1IAobiYS3PhXC1sA_MCdudGCdHRWcWc9J) | config_files/kinetics400/baseline/r101f32s2.py |
26 | |R101 | 32 x 2 | Yes | 78.9 | [link](https://drive.google.com/open?id=1OPudI7CzJzpdeI0YpwLgZB59VCzcoidp) | config_files/kinetics400/tpn/r101f32s2.py |
27 |
28 | We also train TPN on the [MMAction](https://github.com/open-mmlab/mmaction/blob/master/MODEL_ZOO.md) data; performance improves further thanks to the raw resolution and aspect ratio of those videos.
29 |
30 | | Model | Frames | TPN | Top-1 | Weights | Config |
31 | | :---: | :------: | :--------: | :------: | :------: | :------ |
32 | |R50 | 8 x 8 | Yes | 76.7 | [link](https://drive.google.com/open?id=1pCY4oiWK3hs6MwaPZ8QVPMb-qDCV56w5) | config_files/kinetics400/tpn/r50f8s8.py |
33 | |R101 | 8 x 8 | Yes | 78.2 | [link](https://drive.google.com/open?id=1DeVp7cf-dk-x6Um4NouLq5tFniTge0Bd) | config_files/kinetics400/tpn/r101f8s8.py |
34 |
35 | All models are trained on 32 GPUs for 150 epochs. More details can be found in `config_files`.
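
The config files can also be consumed programmatically. Here is a minimal sketch of loading one of them and building the corresponding recognizer, mirroring the calls already used in `test_video.py` (the chosen config path is just an example):

```python
import mmcv
from mmaction.models import build_recognizer

cfg = mmcv.Config.fromfile('config_files/kinetics400/tpn/r50f8s8.py')
model = build_recognizer(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
```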
36 |
37 | ### Something-Something
38 | Something-Something is a more stable benchmark, and the full dataset can be downloaded from the official [website](https://20bn.com/datasets/something-something). We report results on both V1 and V2. All numbers follow the standard protocol, i.e., 3 crops * 2 clips. [TSM](https://github.com/mit-han-lab/temporal-shift-module) serves as our backbone network.
39 | Unlike the original TSM [repo](https://github.com/mit-han-lab/temporal-shift-module), which uses Kinetics pre-training, our implementation is initialized from ImageNet pre-training and trained with a longer schedule. We use **the same** training hyper-parameters for both the baseline and TPN, so the improvements come from the TPN design rather than from other training tricks. We use uniform sampling for both training and validation; a worked example of the testing protocol is given below.
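
As a worked example of this protocol with 8-frame clips: each video contributes 3 crops * 2 clips * 8 frames = 48 frames. In the fully-convolutional testing branch of `mmaction/models/recognizers/TSN2D.py`, these 48 frames are regrouped into 6 crop-clip combinations of 8 segments each, every combination is scored independently, and the 6 softmax score vectors are averaged to produce the final prediction.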
40 |
41 | | Model | Dataset Version | Frames | TPN | Top-1 | Weights | Config |
42 | | :---: | :------: | :------: | :--------: | :------: | :------: | :------ |
43 | |TSM50 | V1 | 8 | - | 48.2 | [link](https://drive.google.com/open?id=1x7iwL2Op0qxaUluyQCPOVVEEH53cavhL) | config_files/sthv1/tsm_baseline.py |
44 | |TSM50 | V1 | 8 | Yes | 50.7 | [link](https://drive.google.com/open?id=1NVjsCYgNXKUKAn33XCxV2YEIaWXlEnLS) | config_files/sthv1/tsm_tpn.py |
45 | |TSM50 | V2 | 8 | - | 62.3 | [link](https://drive.google.com/open?id=1fU1b9WySld5knJ8E2bMXfuyRenoViSEX) | config_files/sthv2/tsm_baseline.py |
46 | |TSM50 | V2 | 8 | Yes | 64.7 | [link](https://drive.google.com/open?id=15HHKGIhksTf0dSmgxrTsoHzZxF6n7eRa) | config_files/sthv2/tsm_tpn.py |
47 |
48 | If you have any problems reproducing our results, please contact Ceyuan Yang (yc019@ie.cuhk.edu.hk) or Yinghao Xu (xy119@ie.cuhk.edu.hk).
49 |
50 |
--------------------------------------------------------------------------------
/mmaction/datasets/loader/sampler.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 |
3 | import math
4 | import torch
5 | import numpy as np
6 |
7 | from torch.distributed import get_world_size, get_rank
8 | from torch.utils.data import Sampler
9 | from torch.utils.data import DistributedSampler as _DistributedSampler
10 |
11 |
12 | class DistributedSampler(_DistributedSampler):
13 |
14 | def __init__(self, dataset, imgs_per_gpu, num_replicas=None, rank=None, shuffle=True):
15 | super().__init__(dataset, num_replicas=num_replicas, rank=rank)
16 | self.shuffle = shuffle
17 | self.samples_per_gpu = imgs_per_gpu
18 |
19 | self.num_samples = int(
20 | math.ceil(len(dataset) * 1.0 / self.samples_per_gpu /
21 | self.num_replicas)) * self.samples_per_gpu
22 | self.total_size = self.num_samples * self.num_replicas
23 |
24 | def __iter__(self):
25 | # deterministically shuffle based on epoch
26 | if self.shuffle:
27 | g = torch.Generator()
28 | g.manual_seed(self.epoch)
29 | indices = torch.randperm(len(self.dataset), generator=g).tolist()
30 | else:
31 | indices = torch.arange(len(self.dataset)).tolist()
32 |
33 | # add extra samples to make it evenly divisible
34 | indices += indices[:(self.total_size - len(indices))]
35 | assert len(indices) == self.total_size
36 |
37 | # subsample
38 | indices = indices[self.rank:self.total_size:self.num_replicas]
39 | assert len(indices) == self.num_samples
40 |
41 | return iter(indices)
42 |
43 |
44 | class GroupSampler(Sampler):
45 |
46 | def __init__(self, dataset, samples_per_gpu=1):
47 | assert hasattr(dataset, 'flag')
48 | self.dataset = dataset
49 | self.samples_per_gpu = samples_per_gpu
50 | self.flag = dataset.flag.astype(np.int64)
51 | self.group_sizes = np.bincount(self.flag)
52 | self.num_samples = 0
53 | for i, size in enumerate(self.group_sizes):
54 | self.num_samples += int(np.ceil(
55 | size / self.samples_per_gpu)) * self.samples_per_gpu
56 |
57 | def __iter__(self):
58 | indices = []
59 | for i, size in enumerate(self.group_sizes):
60 | if size == 0:
61 | continue
62 | indice = np.where(self.flag == i)[0]
63 | assert len(indice) == size
64 | np.random.shuffle(indice)
65 | num_extra = int(np.ceil(size / self.samples_per_gpu)
66 | ) * self.samples_per_gpu - len(indice)
67 | indice = np.concatenate([indice, indice[:num_extra]])
68 | indices.append(indice)
69 | indices = np.concatenate(indices)
70 | indices = [
71 | indices[i * self.samples_per_gpu:(i + 1) * self.samples_per_gpu]
72 | for i in np.random.permutation(
73 | range(len(indices) // self.samples_per_gpu))
74 | ]
75 | indices = np.concatenate(indices)
76 | indices = torch.from_numpy(indices).long()
77 | assert len(indices) == self.num_samples
78 | return iter(indices)
79 |
80 | def __len__(self):
81 | return self.num_samples
82 |
83 |
84 | class DistributedGroupSampler(Sampler):
85 | """Sampler that restricts data loading to a subset of the dataset.
86 | It is especially useful in conjunction with
87 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
88 | process can pass a DistributedSampler instance as a DataLoader sampler,
89 | and load a subset of the original dataset that is exclusive to it.
90 | .. note::
91 | Dataset is assumed to be of constant size.
92 | Arguments:
93 | dataset: Dataset used for sampling.
94 | num_replicas (optional): Number of processes participating in
95 | distributed training.
96 | rank (optional): Rank of the current process within num_replicas.
97 | """
98 |
99 | def __init__(self,
100 | dataset,
101 | samples_per_gpu=1,
102 | num_replicas=None,
103 | rank=None):
104 | if num_replicas is None:
105 | num_replicas = get_world_size()
106 | if rank is None:
107 | rank = get_rank()
108 | self.dataset = dataset
109 | self.samples_per_gpu = samples_per_gpu
110 | self.num_replicas = num_replicas
111 | self.rank = rank
112 | self.epoch = 0
113 |
114 | assert hasattr(self.dataset, 'flag')
115 | self.flag = self.dataset.flag
116 | self.group_sizes = np.bincount(self.flag)
117 |
118 | self.num_samples = 0
119 | for i, j in enumerate(self.group_sizes):
120 | self.num_samples += int(
121 | math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu /
122 | self.num_replicas)) * self.samples_per_gpu
123 | self.total_size = self.num_samples * self.num_replicas
124 |
125 | def __iter__(self):
126 | # deterministically shuffle based on epoch
127 | g = torch.Generator()
128 | g.manual_seed(self.epoch)
129 |
130 | indices = []
131 | for i, size in enumerate(self.group_sizes):
132 | if size > 0:
133 | indice = np.where(self.flag == i)[0]
134 | assert len(indice) == size
135 | indice = indice[list(torch.randperm(int(size),
136 | generator=g))].tolist()
137 | extra = int(
138 | math.ceil(
139 | size * 1.0 / self.samples_per_gpu / self.num_replicas)
140 | ) * self.samples_per_gpu * self.num_replicas - len(indice)
141 | indice += indice[:extra]
142 | indices += indice
143 |
144 | assert len(indices) == self.total_size
145 |
146 | indices = [
147 | indices[j] for i in list(
148 | torch.randperm(
149 | len(indices) // self.samples_per_gpu, generator=g))
150 | for j in range(i * self.samples_per_gpu, (i + 1) *
151 | self.samples_per_gpu)
152 | ]
153 |
154 | # subsample
155 | offset = self.num_samples * self.rank
156 | indices = indices[offset:offset + self.num_samples]
157 | assert len(indices) == self.num_samples
158 |
159 | return iter(indices)
160 |
161 | def __len__(self):
162 | return self.num_samples
163 |
164 | def set_epoch(self, epoch):
165 | self.epoch = epoch
166 |
--------------------------------------------------------------------------------
/test_video.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import cv2
4 | import argparse
5 | import functools
6 | import subprocess
7 | import warnings
8 | from scipy.special import softmax
9 | import moviepy.editor as mpy
10 | import numpy as np
11 | import torch
12 |
13 | import mmcv
14 | from mmcv.runner import load_checkpoint
15 | from mmcv.parallel import collate, scatter
16 |
17 | from mmaction.models import build_recognizer
18 | from mmaction.datasets.transforms import GroupImageTransform
19 |
20 |
21 | def init_recognizer(config, checkpoint=None, label_file=None, device='cuda:0'):
22 | if isinstance(config, str):
23 | config = mmcv.Config.fromfile(config)
24 | elif not isinstance(config, mmcv.Config):
25 | raise TypeError('config must be a filename or Config object, '
26 | 'but got {}'.format(type(config)))
27 | config.model.backbone.pretrained = None
28 | config.model.spatial_temporal_module.spatial_size = 8
29 | model = build_recognizer(
30 | config.model, train_cfg=None, test_cfg=config.test_cfg)
31 | if checkpoint is not None:
32 | checkpoint = load_checkpoint(model, checkpoint)
33 | if label_file is not None:
34 | classes = [line.rstrip() for line in open(label_file, 'r').readlines()]
35 | model.CLASSES = classes
36 | else:
37 | if 'CLASSES' in checkpoint['meta']:
38 | model.CLASSES = checkpoint['meta']['CLASSES']
39 | else:
40 | warnings.warn('Class names are not saved in the checkpoint\'s '
41 | 'meta data, use something-something-v2 classes by default.')
42 |                 model.CLASSES = get_classes('something-something-v2')
43 | model.cfg = config # save the config in the model for convenience
44 | model.to(device)
45 | model.eval()
46 | return model
47 |
48 |
49 | def inference_recognizer(model, frames):
50 | cfg = model.cfg
51 | device = next(model.parameters()).device # model device
52 | # build the data pipeline
53 | test_transform = GroupImageTransform(
54 | crop_size=cfg.data.test.input_size,
55 | oversample=None,
56 | resize_crop=False,
57 | **dict(mean=[123.675, 116.28, 103.53],
58 | std=[58.395, 57.12, 57.375], to_rgb=True))
59 | # prepare data
60 | frames, *l = test_transform(
61 | frames, (cfg.data.test.img_scale, cfg.data.test.img_scale),
62 | crop_history=None,
63 | flip=False,
64 | keep_ratio=False,
65 | div_255=False,
66 | is_flow=False)
67 | data = dict(img_group_0=frames,
68 | num_modalities=1,
69 | img_meta={})
70 | data = scatter(collate([data], samples_per_gpu=1), [device])[0]
71 | # forward the model
72 | with torch.no_grad():
73 | result = model(return_loss=False, rescale=True, **data)
74 | return result
75 |
76 |
77 | def extract_frames(video_file, num_frames=8):
78 | try:
79 | os.makedirs(os.path.join(os.getcwd(), 'frames'))
80 | except OSError:
81 | pass
82 | fps = subprocess.check_output(['ffprobe', '-v', 'error',
83 | '-select_streams',
84 | 'v', '-of', 'default=noprint_wrappers=1:nokey=1',
85 | '-show_entries',
86 |                                    'stream=r_frame_rate',
87 | video_file]).decode('utf-8').strip().split('/')[0]
88 | fps = int(fps)
89 |
90 | output = subprocess.Popen(['ffmpeg', '-i', video_file,
91 | '-loglevel', 'panic',
92 | 'frames/%d.jpg']).communicate()
93 | frame_paths = [os.path.join('frames', frame)
94 | for frame in sorted(os.listdir('frames'), key=lambda x: int(x.split('.')[0]))]
95 |
96 |     seg_frames, raw_frames = load_frames(frame_paths, num_frames)
97 | subprocess.call(['rm', '-rf', 'frames'])
98 |
99 | return seg_frames, raw_frames, fps
100 |
101 |
102 | def load_frames(frame_paths, num_frames=8):
103 | frames = [mmcv.imread(frame) for frame in frame_paths]
104 | if len(frames) >= num_frames:
105 | return frames[::int(np.floor(len(frames) / float(num_frames)))][:num_frames].copy(), frames.copy()
106 | else:
107 | raise ValueError('Video must have at least {} frames'.format(num_frames))
108 |
109 |
110 | def render_frames(frames, prediction):
111 | rendered_frames = []
112 | for frame in frames:
113 | img = np.array(frame[:, :, ::-1])
114 | height, width, _ = img.shape
115 | cv2.putText(img=img, text=prediction, org=(1, int(height / 8)), fontFace=cv2.FONT_HERSHEY_TRIPLEX,
116 | fontScale=0.6, color=(255, 255, 255), lineType=cv2.LINE_8, bottomLeftOrigin=False)
117 | rendered_frames.append(img)
118 | return rendered_frames
119 |
120 |
121 | # options
122 | parser = argparse.ArgumentParser(description="test TPN on a single video")
123 | parser.add_argument('config', type=str, default=None, help='model init config')
124 | parser.add_argument('checkpoint', type=str, default=None)
125 | parser.add_argument('--label_file', type=str, default='demo/category.txt')
126 | parser.add_argument('--video_file', type=str, default='demo/demo.mp4')
127 | parser.add_argument('--frame_folder', type=str, default=None)
128 | parser.add_argument('--rendered_output', type=str, default='demo/demo_pred.mp4')
129 | args = parser.parse_args()
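# Example invocation (the config and checkpoint paths are illustrative only),
# run from the repository root:
#   python test_video.py config_files/sthv2/tsm_tpn.py ckpt/tsm_tpn_sthv2.pth \
#       --video_file demo/demo.mp4 --rendered_output demo/demo_pred.mp4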
130 |
131 | # Obtain video frames
132 | if args.frame_folder is not None:
133 | print('Loading frames in {}'.format(args.frame_folder))
134 | import glob
135 |
136 | # Here, make sure after sorting the frame paths have the correct temporal order
137 | frame_paths = sorted(glob.glob(os.path.join(args.frame_folder, '*.jpg')))
138 | seg_frames, raw_frames = load_frames(frame_paths)
139 | fps = 4
140 | else:
141 | print('Extracting frames using ffmpeg...')
142 | seg_frames, raw_frames, fps = extract_frames(args.video_file, 8)
143 |
144 | model = init_recognizer(args.config, checkpoint=args.checkpoint, label_file=args.label_file)
145 | results = inference_recognizer(model, seg_frames)
146 | prob = softmax(results.squeeze())
147 | idx = np.argsort(-prob)
148 | # Output the prediction.
149 | video_name = args.frame_folder if args.frame_folder is not None else args.video_file
150 | print('RESULT ON ' + video_name)
151 | for i in range(0, 5):
152 | print('{:.3f} -> {}'.format(prob[idx[i]], model.CLASSES[idx[i]]))
153 |
154 | # Render output frames with prediction text.
155 | if args.rendered_output is not None:
156 | prediction = model.CLASSES[idx[0]]
157 | rendered_frames = render_frames(raw_frames, prediction)
158 | clip = mpy.ImageSequenceClip(rendered_frames, fps=fps)
159 | clip.write_videofile(args.rendered_output)
160 |
--------------------------------------------------------------------------------
/mmaction/models/tenons/segmental_consensuses/stpp.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from ...registry import SEGMENTAL_CONSENSUSES
5 | import numpy as np
6 |
7 |
8 | def parse_stage_config(stage_cfg):
9 | if isinstance(stage_cfg, int):
10 | return (stage_cfg,), stage_cfg
11 | elif isinstance(stage_cfg, tuple) or isinstance(stage_cfg, list):
12 | return stage_cfg, sum(stage_cfg)
13 | else:
14 | raise ValueError("Incorrect STPP config {}".format(stage_cfg))
15 |
16 |
17 | @SEGMENTAL_CONSENSUSES.register_module
18 | class StructuredTemporalPyramidPooling(nn.Module):
19 | def __init__(self, standalong_classifier=False, stpp_cfg=(1, (1, 2), 1), num_seg=(2, 5, 2)):
20 | super(StructuredTemporalPyramidPooling, self).__init__()
21 |
22 | self.sc = standalong_classifier
23 |
24 | starting_parts, starting_mult = parse_stage_config(stpp_cfg[0])
25 | course_parts, course_mult = parse_stage_config(stpp_cfg[1])
26 | ending_parts, ending_mult = parse_stage_config(stpp_cfg[2])
27 |
28 | self.feat_multiplier = starting_mult + course_mult + ending_mult
29 | self.parts = (starting_parts, course_parts, ending_parts)
30 | self.norm_num = (starting_mult, course_mult, ending_mult)
31 |
32 | self.num_seg = num_seg
33 |
34 | def init_weights(self):
35 | pass
36 |
37 | def forward(self, input, scaling):
38 | x1 = self.num_seg[0]
39 | x2 = x1 + self.num_seg[1]
40 | n_seg = x2 + self.num_seg[2]
41 |
42 | feat_dim = input.size(1)
43 | src = input.view(-1, n_seg, feat_dim)
44 | num_sample = src.size(0)
45 |
46 | scaling = scaling.view(-1, 2)
47 |
48 | def get_stage_stpp(stage_feat, stage_parts, norm_num, scaling):
49 | stage_stpp = []
50 | stage_len = stage_feat.size(1)
51 | for n_part in stage_parts:
52 | ticks = torch.arange(0, stage_len + 1e-5, stage_len / n_part)
53 | for i in range(n_part):
54 | part_feat = stage_feat[:, int(ticks[i]):int(ticks[i + 1]), :].mean(dim=1) / norm_num
55 | if scaling is not None:
56 | part_feat = part_feat * scaling.view(num_sample, 1)
57 | stage_stpp.append(part_feat)
58 | return stage_stpp
59 |
60 | feature_parts = []
61 | feature_parts.extend(get_stage_stpp(src[:, :x1, :], self.parts[0], self.norm_num[0], scaling[:, 0]))
62 | feature_parts.extend(get_stage_stpp(src[:, x1:x2, :], self.parts[1], self.norm_num[1], None))
63 | feature_parts.extend(get_stage_stpp(src[:, x2:, :], self.parts[2], self.norm_num[2], scaling[:, 1]))
64 | stpp_feat = torch.cat(feature_parts, dim=1)
65 | if not self.sc:
66 | return stpp_feat, stpp_feat
67 | else:
68 | course_feat = src[:, x1:x2, :].mean(dim=1)
69 | return course_feat, stpp_feat
70 |
71 |
72 | @SEGMENTAL_CONSENSUSES.register_module
73 | class STPPReorganized(nn.Module):
74 | def __init__(self, feat_dim, act_score_len,
75 | comp_score_len, reg_score_len,
76 | standalong_classifier=False,
77 | with_regression=True,
78 | stpp_cfg=(1, (1, 2), 1)):
79 | super(STPPReorganized, self).__init__()
80 |
81 | self.sc = standalong_classifier
82 | self.feat_dim = feat_dim
83 | self.act_score_len = act_score_len
84 | self.comp_score_len = comp_score_len
85 | self.reg_score_len = reg_score_len
86 | self.with_regression = with_regression
87 |
88 | starting_parts, starting_mult = parse_stage_config(stpp_cfg[0])
89 | course_parts, course_mult = parse_stage_config(stpp_cfg[1])
90 | ending_parts, ending_mult = parse_stage_config(stpp_cfg[2])
91 |
92 | self.feat_multiplier = starting_mult + course_mult + ending_mult
93 | self.stpp_cfg = (starting_parts, course_parts, ending_parts)
94 |
95 | self.act_slice = slice(0, self.act_score_len if self.sc else (self.act_score_len * self.feat_multiplier))
96 | self.comp_slice = slice(self.act_slice.stop, self.act_slice.stop + self.comp_score_len * self.feat_multiplier)
97 | self.reg_slice = slice(self.comp_slice.stop, self.comp_slice.stop + self.reg_score_len * self.feat_multiplier)
98 |
99 | def init_weights(self):
100 | pass
101 |
102 | def forward(self, input, proposal_ticks, scaling):
103 | assert input.size(1) == self.feat_dim
104 | n_ticks = proposal_ticks.size(0)
105 |
106 | out_act_scores = torch.zeros((n_ticks, self.act_score_len)).type_as(input)
107 | raw_act_scores = input[:, self.act_slice]
108 |
109 | out_comp_scores = torch.zeros((n_ticks, self.comp_score_len)).type_as(input)
110 | raw_comp_scores = input[:, self.comp_slice]
111 |
112 | if self.with_regression:
113 | out_reg_scores = torch.zeros((n_ticks, self.reg_score_len)).type_as(input)
114 | raw_reg_scores = input[:, self.reg_slice]
115 | else:
116 | out_reg_scores = None
117 | raw_reg_scores = None
118 |
119 | def pspool(out_scores, index, raw_scores, ticks, scaling, score_len, stpp_cfg):
120 | offset = 0
121 | for stage_idx, stage_cfg in enumerate(stpp_cfg):
122 | if stage_idx == 0:
123 | s = scaling[0]
124 | elif stage_idx == len(stpp_cfg) - 1:
125 | s = scaling[1]
126 | else:
127 | s = 1.0
128 |
129 | stage_cnt = sum(stage_cfg)
130 | left = ticks[stage_idx]
131 | right = max(ticks[stage_idx] + 1, ticks[stage_idx + 1])
132 |
133 | if right <= 0 or left >= raw_scores.size(0):
134 | offset += stage_cnt
135 | continue
136 | for n_part in stage_cfg:
137 | part_ticks = np.arange(left, right + 1e-5, (right - left) / n_part)
138 | for i in range(n_part):
139 | pl = int(part_ticks[i])
140 | pr = int(part_ticks[i + 1])
141 | if pr - pl >= 1:
142 | out_scores[index, :] += raw_scores[pl:pr,
143 | offset * score_len: (offset + 1) * score_len].mean(dim=0) * s
144 | offset += 1
145 |
146 | for i in range(n_ticks):
147 | ticks = proposal_ticks[i].cpu().numpy()
148 | if self.sc:
149 | out_act_scores[i, :] = raw_act_scores[ticks[1]: max(ticks[1] + 1, ticks[2]), :].mean(dim=0)
150 | else:
151 | pspool(out_act_scores, i, raw_act_scores, ticks, scaling[i], self.act_score_len, self.stpp_cfg)
152 |
153 | pspool(out_comp_scores, i, raw_comp_scores, ticks, scaling[i], self.comp_score_len, self.stpp_cfg)
154 |
155 | if self.with_regression:
156 | pspool(out_reg_scores, i, raw_reg_scores, ticks, scaling[i], self.reg_score_len, self.stpp_cfg)
157 |
158 | return out_act_scores, out_comp_scores, out_reg_scores
159 |
--------------------------------------------------------------------------------
/mmaction/models/recognizers/TSN2D.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | from .base import BaseRecognizer
3 | from .. import builder
4 | from ..registry import RECOGNIZERS
5 | import torch
6 | import numpy as np
7 |
8 |
9 | @RECOGNIZERS.register_module
10 | class TSN2D(BaseRecognizer):
11 |
12 | def __init__(self,
13 | backbone,
14 | necks=None,
15 | modality='RGB',
16 | in_channels=3,
17 | spatial_temporal_module=None,
18 | segmental_consensus=None,
19 | fcn_testing=False,
20 | flip=False,
21 | cls_head=None,
22 | train_cfg=None,
23 | test_cfg=None):
24 |
25 | super(TSN2D, self).__init__()
26 | self.backbone = builder.build_backbone(backbone)
27 | self.modality = modality
28 | self.in_channels = in_channels
29 | if necks is not None:
30 | self.necks = builder.build_neck(necks)
31 | else:
32 | self.necks = None
33 |
34 | if spatial_temporal_module is not None:
35 | self.spatial_temporal_module = builder.build_spatial_temporal_module(
36 | spatial_temporal_module)
37 | else:
38 | raise NotImplementedError
39 |
40 | if segmental_consensus is not None:
41 | self.segmental_consensus = builder.build_segmental_consensus(
42 | segmental_consensus)
43 | else:
44 | raise NotImplementedError
45 |
46 | if cls_head is not None:
47 | self.cls_head = builder.build_head(cls_head)
48 | else:
49 | raise NotImplementedError
50 |
51 | self.train_cfg = train_cfg
52 | self.test_cfg = test_cfg
53 | self.fcn_testing = fcn_testing
54 | self.flip = flip
55 | assert modality in ['RGB', 'Flow', 'RGBDiff']
56 |
57 | self.init_weights()
58 |
59 | @property
60 | def with_spatial_temporal_module(self):
61 | return hasattr(self, 'spatial_temporal_module') and self.spatial_temporal_module is not None
62 |
63 | @property
64 | def with_segmental_consensus(self):
65 | return hasattr(self, 'segmental_consensus') and self.segmental_consensus is not None
66 |
67 | @property
68 | def with_cls_head(self):
69 | return hasattr(self, 'cls_head') and self.cls_head is not None
70 |
71 | def init_weights(self):
72 | super(TSN2D, self).init_weights()
73 | self.backbone.init_weights()
74 |
75 | if self.with_spatial_temporal_module:
76 | self.spatial_temporal_module.init_weights()
77 |
78 | if self.with_segmental_consensus:
79 | self.segmental_consensus.init_weights()
80 |
81 | if self.with_cls_head:
82 | self.cls_head.init_weights()
83 |
84 | if self.necks is not None:
85 | self.necks.init_weights()
86 |
87 | def extract_feat(self, img_group):
88 | x = self.backbone(img_group)
89 | return x
90 |
91 | def forward_train(self,
92 | num_modalities,
93 | img_meta,
94 | gt_label,
95 | **kwargs):
96 | assert num_modalities == 1
97 | img_group = kwargs['img_group_0']
98 |
99 | bs = img_group.shape[0]
100 | img_group = img_group.reshape(
101 | (-1, self.in_channels) + img_group.shape[3:])
102 | num_seg = img_group.shape[0] // bs
103 |
104 | x = self.extract_feat(img_group)
105 | if self.necks is not None:
106 | x = [each.reshape((-1, num_seg) + each.shape[1:]).transpose(1, 2) for each in x]
107 | x, aux_losses = self.necks(x, gt_label.squeeze())
108 | x = x.squeeze(2)
109 | num_seg = 1
110 |
111 | if self.with_spatial_temporal_module:
112 | x = self.spatial_temporal_module(x)
113 | x = x.reshape((-1, num_seg) + x.shape[1:])
114 | if self.with_segmental_consensus:
115 | x = self.segmental_consensus(x)
116 | x = x.squeeze(1)
117 | losses = dict()
118 | if self.with_cls_head:
119 | cls_score = self.cls_head(x)
120 | gt_label = gt_label.squeeze()
121 | loss_cls = self.cls_head.loss(cls_score, gt_label)
122 | losses.update(loss_cls)
123 | if self.necks is not None:
124 | if aux_losses is not None:
125 | losses.update(aux_losses)
126 | return losses
127 |
128 | def forward_test(self,
129 | num_modalities,
130 | img_meta,
131 | **kwargs):
132 | if not self.fcn_testing:
133 | # 1crop * 1clip
134 | assert num_modalities == 1
135 | img_group = kwargs['img_group_0']
136 |
137 | bs = img_group.shape[0]
138 | img_group = img_group.reshape(
139 | (-1, self.in_channels) + img_group.shape[3:])
140 | num_seg = img_group.shape[0] // bs
141 |
142 | x = self.extract_feat(img_group)
143 |
144 | if self.necks is not None:
145 | x = [each.reshape((-1, num_seg) + each.shape[1:]).transpose(1, 2) for each in x]
146 | x, _ = self.necks(x)
147 | x = x.squeeze(2)
148 | num_seg = 1
149 |
150 | if self.with_spatial_temporal_module:
151 | x = self.spatial_temporal_module(x)
152 | x = x.reshape((-1, num_seg) + x.shape[1:])
153 | if self.with_segmental_consensus:
154 | x = self.segmental_consensus(x)
155 | x = x.squeeze(1)
156 | if self.with_cls_head:
157 | x = self.cls_head(x)
158 |
159 | return x.cpu().numpy()
160 | else:
161 | # fcn testing
162 | assert num_modalities == 1
163 | img_group = kwargs['img_group_0']
164 |
165 | bs = img_group.shape[0]
166 | img_group = img_group.reshape(
167 | (-1, self.in_channels) + img_group.shape[3:])
168 | # standard protocol i.e. 3 crops * 2 clips
169 | num_seg = self.backbone.nsegments * 2
170 | # 3 crops to cover full resolution
171 | num_crops = 3
172 | img_group = img_group.reshape((num_crops, num_seg) + img_group.shape[1:])
173 |
174 | x1 = img_group[:, ::2, :, :, :]
175 | x2 = img_group[:, 1::2, :, :, :]
176 | img_group = torch.cat([x1, x2], 0)
177 | num_seg = num_seg // 2
178 | num_clips = img_group.shape[0]
179 | img_group = img_group.view(num_clips * num_seg, img_group.shape[2], img_group.shape[3], img_group.shape[4])
180 |
181 | if self.flip:
182 | img_group = torch.flip(img_group, [-1])  # evaluate on horizontally flipped frames; features are extracted once below
183 | x = self.extract_feat(img_group)
184 | if self.necks is not None:
185 | x = [each.reshape((-1, num_seg) + each.shape[1:]).transpose(1, 2) for each in x]
186 | x, _ = self.necks(x)
187 | else:
188 | x = x.reshape((-1, num_seg) + x.shape[1:]).transpose(1, 2)
189 | x = self.cls_head(x)
190 |
191 | prob = torch.nn.functional.softmax(x.mean([2, 3, 4]), 1).mean(0, keepdim=True).detach().cpu().numpy()
192 | return prob
193 |
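A minimal, self-contained sketch of the tensor bookkeeping performed by the fcn-testing branch above. It is not part of the source tree: a random tensor stands in for the decoded frames, and the crop size (224) and the backbone.nsegments value (8) are illustrative. With the standard 3 spatial crops and 2 temporal clips, the 16 stacked frames per crop are split by even/odd index into two 8-segment clips before feature extraction:

import torch

in_channels, num_crops, nsegments = 3, 3, 8
num_seg = nsegments * 2                                     # 2 clips stacked along the frame axis
img_group = torch.randn(1, num_crops * num_seg, in_channels, 224, 224)

img_group = img_group.reshape((-1, in_channels) + img_group.shape[3:])     # (48, 3, 224, 224)
img_group = img_group.reshape((num_crops, num_seg) + img_group.shape[1:])  # (3, 16, 3, 224, 224)
x1, x2 = img_group[:, ::2], img_group[:, 1::2]              # even/odd frames -> two 8-frame clips
img_group = torch.cat([x1, x2], 0)                          # (6, 8, 3, 224, 224): 3 crops x 2 clips
num_seg = num_seg // 2
num_clips = img_group.shape[0]
img_group = img_group.view(num_clips * num_seg, *img_group.shape[2:])      # back to (48, 3, 224, 224)
print(img_group.shape)                                      # torch.Size([48, 3, 224, 224])

The six crop/clip combinations are scored independently; the final step of forward_test averages their softmax scores (mean over the spatial-temporal dims, then over the six predictions) to obtain one probability vector per video.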
--------------------------------------------------------------------------------
/mmaction/losses/losses.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 |
4 |
5 | def weighted_nll_loss(pred, label, weight, avg_factor=None):
6 | if avg_factor is None:
7 | avg_factor = max(torch.sum(weight > 0).float().item(), 1.)
8 | raw = F.nll_loss(pred, label, reduction='none')
9 | return torch.sum(raw * weight)[None] / avg_factor
10 |
11 |
12 | def weighted_cross_entropy(pred, label, weight, avg_factor=None, reduce=True):
13 | if avg_factor is None:
14 | avg_factor = max(torch.sum(weight > 0).float().item(), 1.)
15 | raw = F.cross_entropy(pred, label, reduction='none')
16 | if reduce:
17 | return torch.sum(raw * weight)[None] / avg_factor
18 | else:
19 | return raw * weight / avg_factor
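# Illustrative note on the weighted losses above: `weight` holds one weight per sample;
# when avg_factor is left as None the weighted sum is divided by the number of samples
# with a positive weight (at least 1), e.g. weight = [1., 1., 0.] averages the first two
# per-sample losses.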
20 |
21 |
22 | def weighted_binary_cross_entropy(pred, label, weight, avg_factor=None):
23 | if pred.dim() != label.dim():
24 | label, weight = _expand_binary_labels(label, weight, pred.size(-1))
25 | if avg_factor is None:
26 | avg_factor = max(torch.sum(weight > 0).float().item(), 1.)
27 | return F.binary_cross_entropy_with_logits(
28 | pred, label.float(), weight.float(),
29 | reduction='sum')[None] / avg_factor
30 |
31 |
32 | def smooth_l1_loss(pred, target, beta=1.0, reduction='mean'):
33 | assert beta > 0
34 | assert pred.size() == target.size() and target.numel() > 0
35 | diff = torch.abs(pred - target)
36 | loss = torch.where(diff < beta, 0.5 * diff * diff / beta,
37 | diff - 0.5 * beta)
38 | reduction_enum = F._Reduction.get_enum(reduction)
39 | # none: 0, mean: 1, sum: 2
40 | if reduction_enum == 0:
41 | return loss
42 | elif reduction_enum == 1:
43 | return loss.sum() / pred.numel()
44 | elif reduction_enum == 2:
45 | return loss.sum()
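# For reference, the piecewise (Huber-style) form computed above is
#   loss(d) = 0.5 * d**2 / beta   if |d| < beta
#           = |d| - 0.5 * beta    otherwise
# e.g. with beta = 1.0 a residual of 0.5 contributes 0.125 and a residual of 2.0
# contributes 1.5 before reduction.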
46 |
47 |
48 | def weighted_smoothl1(pred, target, weight, beta=1.0, avg_factor=None):
49 | if avg_factor is None:
50 | avg_factor = torch.sum(weight > 0).float().item() / 4 + 1e-6
51 | loss = smooth_l1_loss(pred, target, beta, reduction='none')
52 | return torch.sum(loss * weight)[None] / avg_factor
53 |
54 |
55 | def accuracy(pred, target, topk=1):
56 | if isinstance(topk, int):
57 | topk = (topk,)
58 | return_single = True
59 | else:
60 | return_single = False
61 |
62 | maxk = max(topk)
63 | _, pred_label = pred.topk(maxk, 1, True, True)
64 | pred_label = pred_label.t()
65 | correct = pred_label.eq(target.view(1, -1).expand_as(pred_label))
66 |
67 | res = []
68 | for k in topk:
69 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
70 | res.append(correct_k.mul_(100.0 / pred.size(0)))
71 | return res[0] if return_single else res
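# Illustrative usage (not from the source): for an (N, num_classes) score tensor,
#   top1, top5 = accuracy(cls_score, gt_label, topk=(1, 5))
# returns two tensors holding percentages in [0, 100]; passing a plain int for topk
# returns a single tensor instead.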
72 |
73 |
74 | def _expand_binary_labels(labels, label_weights, label_channels):
75 | bin_labels = labels.new_full((labels.size(0), label_channels), 0)
76 | inds = torch.nonzero(labels >= 1).squeeze()
77 | if inds.numel() > 0:
78 | bin_labels[inds, labels[inds] - 1] = 1
79 | bin_label_weights = label_weights.view(-1, 1).expand(
80 | label_weights.size(0), label_channels)
81 | return bin_labels, bin_label_weights
82 |
83 |
84 | def weighted_multilabel_binary_cross_entropy(
85 | pred, label, weight, avg_factor=None):
86 | label, weight = _expand_multilabel_binary_labels(
87 | label, weight, pred.size(-1))
88 | if avg_factor is None:
89 | avg_factor = max(torch.sum(weight > 0).float().item(), 1.)
90 | return F.binary_cross_entropy_with_logits(
91 | pred, label.float(), weight.float(),
92 | reduction='sum')[None] / avg_factor
93 |
94 |
95 | def _expand_multilabel_binary_labels(labels, label_weights, label_channels):
96 | bin_labels = labels.new_full((labels.size(0), label_channels), 0)
97 | inds = torch.nonzero(labels >= 1)
98 | if inds.numel() > 0:
99 | for ind in inds:
100 | # note that labels start from 1
101 | bin_labels[ind[0], labels[ind[0], ind[1]] - 1] = 1
102 | # bin_labels[ind[0], 0] = 1
103 | bin_label_weights = label_weights
104 | return bin_labels, bin_label_weights
105 |
106 |
107 | def multilabel_accuracy(pred, target, topk=1, thr=0.5):
108 | if topk is None:
109 | topk = ()
110 | elif isinstance(topk, int):
111 | topk = (topk,)
112 |
113 | pred = pred.sigmoid()
114 | pred_bin_labels = pred.new_full((pred.size(0),), 0, dtype=torch.long)
115 | pred_vec_labels = pred.new_full(pred.size(), 0, dtype=torch.long)
116 | for i in range(pred.size(0)):
117 | inds = torch.nonzero(pred[i, 1:] > thr).squeeze() + 1
118 | if inds.numel() > 0:
119 | pred_vec_labels[i, inds] = 1
120 | # pred_bin_labels[i] = 1
121 | if pred[i, 0] > thr:
122 | pred_bin_labels[i] = 1
123 | target_bin_labels = target.new_full(
124 | (target.size(0),), 0, dtype=torch.long)
125 | target_vec_labels = target.new_full(target.size(), 0, dtype=torch.long)
126 | for i in range(target.size(0)):
127 | inds = torch.nonzero(target[i, :] >= 1).squeeze()
128 | if inds.numel() > 0:
129 | target_vec_labels[i, target[i, inds]] = 1
130 | target_bin_labels[i] = 1
131 | # overall accuracy
132 | correct = pred_bin_labels.eq(target_bin_labels)
133 | acc = correct.float().sum(0, keepdim=True).mul_(100.0 / correct.size(0))
134 |
135 | # def overlap(tensor1, tensor2):
136 | # indices = tensor1.new_zeros(tensor1).astype(torch.uint8)
137 | # for elem in tensor2:
138 | # indices = indices | (tensor1 == elem)
139 | # return tensor1[indices]
140 |
141 | # recall@thr
142 | recall_thr, prec_thr = recall_prec(pred_vec_labels, target_vec_labels)
143 |
144 | # recall@k
145 | recalls = []
146 | precs = []
147 | for k in topk:
148 | _, pred_label = pred.topk(k, 1, True, True)
149 | pred_vec_labels = pred.new_full(pred.size(), 0, dtype=torch.long)
150 | for i in range(pred.size(0)):
151 | pred_vec_labels[i, pred_label[i]] = 1
152 | recall_k, prec_k = recall_prec(pred_vec_labels, target_vec_labels)
153 | recalls.append(recall_k)
154 | precs.append(prec_k)
155 |
156 | return acc, recall_thr, prec_thr, recalls, precs
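# Note on the convention the function above appears to assume: class ids in `target` are
# used directly as column indices, so they are expected to start from 1, while column 0 of
# `pred` is read only as a binary 'any action' score for the overall accuracy;
# _expand_multilabel_binary_labels above instead shifts label k to column k - 1.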
157 |
158 |
159 | def recall_prec(pred_vec, target_vec):
160 | """
161 | Args:
162 | pred_vec: