├── mmaction ├── core │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ └── dist_utils.py │ └── evaluation │ │ ├── __init__.py │ │ ├── accuracy.py │ │ └── eval_hooks.py ├── models │ ├── tenons │ │ ├── necks │ │ │ └── __init__.py │ │ ├── cls_heads │ │ │ ├── __init__.py │ │ │ └── cls_head.py │ │ ├── segmental_consensuses │ │ │ ├── TODO.md │ │ │ ├── __init__.py │ │ │ ├── simple_consensus.py │ │ │ └── stpp.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── nonlocal_block.py │ │ │ ├── norm.py │ │ │ └── conv_module.py │ │ ├── backbones │ │ │ └── __init__.py │ │ └── spatial_temporal_modules │ │ │ ├── __init__.py │ │ │ ├── simple_spatial_module.py │ │ │ ├── simple_spatial_temporal_module.py │ │ │ ├── avgfusion.py │ │ │ └── non_local.py │ ├── recognizers │ │ ├── __init__.py │ │ ├── base.py │ │ ├── TSN3D.py │ │ └── TSN2D.py │ ├── __init__.py │ ├── registry.py │ └── builder.py ├── __init__.py ├── apis │ ├── __init__.py │ ├── env.py │ └── train.py ├── datasets │ ├── loader │ │ ├── __init__.py │ │ ├── build_loader.py │ │ └── sampler.py │ └── __init__.py ├── losses │ ├── __init__.py │ └── losses.py ├── README.md └── utils │ └── misc.py ├── demo ├── demo_pred.gif └── category.txt ├── docs ├── figures │ ├── empirical.png │ ├── exp_result.png │ └── framework.png ├── assets │ ├── font.css │ └── style.css └── index.html ├── .style.yapf ├── tools ├── dist_train_recognizer.sh ├── dist_test_recognizer.sh ├── extract_backbone_weights.py ├── train_recognizer.py ├── README.md └── test_recognizer.py ├── INSTALL.md ├── data └── README.md ├── .gitignore ├── README.md ├── setup.py ├── config_files ├── sthv1 │ ├── tsm_baseline.py │ └── tsm_tpn.py ├── sthv2 │ ├── tsm_baseline.py │ └── tsm_tpn.py └── kinetics400 │ ├── baseline │ ├── r101f16s4.py │ ├── r101f8s8.py │ ├── r50f8s8.py │ ├── r101f32s2.py │ ├── r50f16s4.py │ └── r50f32s2.py │ └── tpn │ ├── r50f8s8.py │ ├── r101f16s4.py │ ├── r101f8s8.py │ ├── r50f16s4.py │ ├── r101f32s2.py │ └── r50f32s2.py ├── MODELZOO.md └── test_video.py /mmaction/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluation import * 2 | from .utils import * 3 | -------------------------------------------------------------------------------- /demo/demo_pred.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decisionforce/TPN/HEAD/demo/demo_pred.gif -------------------------------------------------------------------------------- /mmaction/models/tenons/necks/__init__.py: -------------------------------------------------------------------------------- 1 | from .tpn import TPN 2 | 3 | __all__ = ['TPN'] 4 | -------------------------------------------------------------------------------- /docs/figures/empirical.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decisionforce/TPN/HEAD/docs/figures/empirical.png -------------------------------------------------------------------------------- /docs/figures/exp_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decisionforce/TPN/HEAD/docs/figures/exp_result.png -------------------------------------------------------------------------------- /docs/figures/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decisionforce/TPN/HEAD/docs/figures/framework.png 
-------------------------------------------------------------------------------- /mmaction/__init__.py: -------------------------------------------------------------------------------- 1 | from .version import __version__, short_version 2 | 3 | __all__ = ['__version__', 'short_version'] 4 | -------------------------------------------------------------------------------- /mmaction/models/tenons/cls_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .cls_head import ClsHead 2 | 3 | __all__ = [ 4 | 'ClsHead', 5 | ] 6 | -------------------------------------------------------------------------------- /mmaction/models/tenons/segmental_consensuses/TODO.md: -------------------------------------------------------------------------------- 1 | ### TODO 2 | 3 | [x] SimpleConsensus 4 | 5 | [ ] STPP 6 | 7 | [ ] TRN 8 | -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | BASED_ON_STYLE = pep8 3 | BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true 4 | SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true 5 | -------------------------------------------------------------------------------- /mmaction/core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .dist_utils import allreduce_grads, DistOptimizerHook 2 | 3 | __all__ = [ 4 | 'allreduce_grads', 'DistOptimizerHook', 5 | ] 6 | -------------------------------------------------------------------------------- /mmaction/models/tenons/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .conv_module import ConvModule 2 | from .norm import build_norm_layer 3 | 4 | __all__ = [ 5 | 'ConvModule', 'build_norm_layer', 6 | ] 7 | -------------------------------------------------------------------------------- /mmaction/models/tenons/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_slow import ResNet_SlowFast 2 | from .resnet import ResNet 3 | 4 | __all__ = [ 5 | 'ResNet_SlowFast', 6 | 'ResNet' 7 | ] 8 | -------------------------------------------------------------------------------- /mmaction/models/recognizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseRecognizer 2 | from .TSN2D import TSN2D 3 | from .TSN3D import TSN3D 4 | 5 | __all__ = [ 6 | 'BaseRecognizer', 'TSN2D', 'TSN3D', 7 | ] 8 | -------------------------------------------------------------------------------- /mmaction/core/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .eval_hooks import (DistEvalHook, DistEvalTopKAccuracyHook, 2 | ) 3 | 4 | __all__ = [ 5 | 'DistEvalHook', 'DistEvalTopKAccuracyHook', 6 | ] 7 | -------------------------------------------------------------------------------- /tools/dist_train_recognizer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PYTHON=${PYTHON:-"python"} 4 | 5 | $PYTHON -m torch.distributed.launch --nproc_per_node=$2 $(dirname "$0")/train_recognizer.py $1 --launcher pytorch ${@:3} 6 | -------------------------------------------------------------------------------- /mmaction/apis/__init__.py: -------------------------------------------------------------------------------- 1 | from .env import 
init_dist, get_root_logger, set_random_seed 2 | from .train import train_network 3 | 4 | __all__ = [ 5 | 'init_dist', 'get_root_logger', 'set_random_seed', 6 | 'train_network', 7 | ] 8 | -------------------------------------------------------------------------------- /mmaction/datasets/loader/__init__.py: -------------------------------------------------------------------------------- 1 | from .build_loader import build_dataloader 2 | from .sampler import GroupSampler, DistributedGroupSampler 3 | 4 | __all__ = [ 5 | 'GroupSampler', 'DistributedGroupSampler', 'build_dataloader' 6 | ] 7 | -------------------------------------------------------------------------------- /tools/dist_test_recognizer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PYTHON=${PYTHON:-"python"} 4 | 5 | CONFIG=$1 6 | CHECKPOINT=$2 7 | GPUS=$3 8 | 9 | $PYTHON -m torch.distributed.launch --nproc_per_node=$GPUS \ 10 | $(dirname "$0")/test_recognizer.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} 11 | -------------------------------------------------------------------------------- /mmaction/models/tenons/utils/nonlocal_block.py: -------------------------------------------------------------------------------- 1 | from ..spatial_temporal_modules.non_local import NonLocalModule 2 | 3 | 4 | def build_nonlocal_block(cfg): 5 | """ Build nonlocal block 6 | 7 | Args: 8 | """ 9 | assert isinstance(cfg, dict) 10 | cfg_ = cfg.copy() 11 | return NonLocalModule(**cfg_) 12 | -------------------------------------------------------------------------------- /mmaction/models/tenons/segmental_consensuses/__init__.py: -------------------------------------------------------------------------------- 1 | from .simple_consensus import SimpleConsensus 2 | from .stpp import parse_stage_config 3 | from .stpp import StructuredTemporalPyramidPooling 4 | 5 | __all__ = [ 6 | 'SimpleConsensus', 7 | 'StructuredTemporalPyramidPooling', 8 | 'parse_stage_config' 9 | ] 10 | -------------------------------------------------------------------------------- /mmaction/models/tenons/spatial_temporal_modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .simple_spatial_module import SimpleSpatialModule 2 | from .simple_spatial_temporal_module import SimpleSpatialTemporalModule 3 | from .avgfusion import AvgFusion 4 | 5 | __all__ = [ 6 | 'SimpleSpatialModule', 7 | 'SimpleSpatialTemporalModule', 8 | 'AvgFusion' 9 | ] 10 | -------------------------------------------------------------------------------- /tools/extract_backbone_weights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import collections 4 | 5 | model = torch.load(sys.argv[1]) 6 | 7 | weight = model['state_dict'] 8 | 9 | out = collections.OrderedDict() 10 | for k, v in weight.items(): 11 | name = k.replace('backbone.', '').replace('cls_head.', '') 12 | out[name] = v.cpu() 13 | print(name) 14 | 15 | torch.save(out, sys.argv[2]) 16 | -------------------------------------------------------------------------------- /mmaction/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .rawframes_dataset import RawFramesDataset 2 | from .utils import get_untrimmed_dataset, get_trimmed_dataset 3 | from .loader import GroupSampler, DistributedGroupSampler, build_dataloader 4 | 5 | __all__ = [ 6 | 'RawFramesDataset', 7 | 'get_trimmed_dataset', 
'get_untrimmed_dataset',
 8 |     'GroupSampler', 'DistributedGroupSampler', 'build_dataloader'
 9 | ]
10 | 
--------------------------------------------------------------------------------
/INSTALL.md:
--------------------------------------------------------------------------------
 1 | # Installation
 2 | 
 3 | ```shell
 4 | git clone https://github.com/decisionforce/TPN.git
 5 | ```
 6 | 
 7 | ## Requirements
 8 | 
 9 | - Linux
10 | - Python 3.5+
11 | - PyTorch 1.0+
12 | - CUDA 9.0+
13 | - NCCL 2+
14 | - GCC 4.9+
15 | - mmcv 0.2.10
16 | 
17 | ## Install MMAction
18 | (a) Install Cython
19 | ```shell
20 | pip install cython
21 | ```
22 | (b) Install mmaction
23 | ```shell
24 | python setup.py develop
25 | ```
26 | 
27 | 
--------------------------------------------------------------------------------
/mmaction/losses/__init__.py:
--------------------------------------------------------------------------------
 1 | from .losses import (
 2 |     weighted_nll_loss, weighted_cross_entropy, weighted_binary_cross_entropy,
 3 |     weighted_smoothl1, accuracy,
 4 |     weighted_multilabel_binary_cross_entropy,
 5 |     multilabel_accuracy)
 6 | 
 7 | __all__ = [
 8 |     'weighted_nll_loss', 'weighted_cross_entropy',
 9 |     'weighted_binary_cross_entropy',
10 |     'weighted_smoothl1', 'accuracy',
11 |     'weighted_multilabel_binary_cross_entropy',
12 |     'multilabel_accuracy',
13 | 
14 | ]
15 | 
--------------------------------------------------------------------------------
/mmaction/README.md:
--------------------------------------------------------------------------------
 1 | # mmaction
 2 | 
 3 | This code is based on [MMAction](https://github.com/open-mmlab/mmaction), which supports modular design and high efficiency. Our TPN will be merged into the latest MMAction in the future.
 4 | 
 5 | Here we briefly introduce the structure of this codebase:
 6 | 
 7 | - `apis`: contains the launcher of the whole codebase and the initializer of the distributed training environment.
 8 | - `core`: contains multiple hooks for evaluation, e.g. calculating the Top-1/Top-5 accuracy.
 9 | - `datasets`: contains `rawframes_dataset` and the transforms used for training.
10 | - `losses`: contains several kinds of cross-entropy loss.
11 | - `models`: contains the recognizers and various submodules of the network, e.g. *backbone*, *neck*, and *head*, under `models/tenons`.
12 | 
13 | Such modular design helps us conduct experiments with different modules quickly and easily; a minimal sketch of how the pieces fit together is given below.
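The sketch below is not part of the codebase; it only illustrates how a new module is registered and how a recognizer is built from a config, following the pattern of `tools/train_recognizer.py` and `models/builder.py`. `MyClsHead` and the chosen config path are purely illustrative assumptions.

```python
# A minimal sketch, assuming the mmaction package of this repo is installed.
# MyClsHead and the config path are illustrative, not part of the codebase.
import torch.nn as nn
from mmcv import Config

from mmaction.models import HEADS, build_recognizer


@HEADS.register_module
class MyClsHead(nn.Module):
    """Hypothetical head; once registered, type='MyClsHead' becomes usable in configs."""

    def __init__(self, in_channels=2048, num_classes=174):
        super(MyClsHead, self).__init__()
        self.fc_cls = nn.Linear(in_channels, num_classes)

    def init_weights(self):
        nn.init.normal_(self.fc_cls.weight, 0, 0.01)
        nn.init.constant_(self.fc_cls.bias, 0)

    def forward(self, x):
        return self.fc_cls(x.view(x.size(0), -1))


# Each sub-dict of cfg.model is looked up in its registry by the 'type' key and
# instantiated with the remaining keys as keyword arguments (see models/builder.py).
cfg = Config.fromfile('config_files/sthv2/tsm_tpn.py')
model = build_recognizer(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
```

Backbones, necks, spatial-temporal modules, and consensus modules are registered the same way through the corresponding registries in `models/registry.py` and built via `models/builder.py`.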
14 | -------------------------------------------------------------------------------- /mmaction/models/tenons/spatial_temporal_modules/simple_spatial_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ...registry import SPATIAL_TEMPORAL_MODULES 5 | 6 | 7 | @SPATIAL_TEMPORAL_MODULES.register_module 8 | class SimpleSpatialModule(nn.Module): 9 | def __init__(self, spatial_type='avg', spatial_size=7): 10 | super(SimpleSpatialModule, self).__init__() 11 | 12 | assert spatial_type in ['avg'] 13 | self.spatial_type = spatial_type 14 | 15 | self.spatial_size = spatial_size if not isinstance(spatial_size, int) else (spatial_size, spatial_size) 16 | 17 | if self.spatial_type == 'avg': 18 | self.op = nn.AvgPool2d(self.spatial_size, stride=1, padding=0) 19 | 20 | def init_weights(self): 21 | pass 22 | 23 | def forward(self, input): 24 | return self.op(input) 25 | -------------------------------------------------------------------------------- /mmaction/models/tenons/spatial_temporal_modules/simple_spatial_temporal_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ...registry import SPATIAL_TEMPORAL_MODULES 5 | 6 | 7 | @SPATIAL_TEMPORAL_MODULES.register_module 8 | class SimpleSpatialTemporalModule(nn.Module): 9 | def __init__(self, spatial_type='avg', spatial_size=7, temporal_size=1): 10 | super(SimpleSpatialTemporalModule, self).__init__() 11 | 12 | assert spatial_type in ['avg'] 13 | self.spatial_type = spatial_type 14 | 15 | self.spatial_size = spatial_size if not isinstance(spatial_size, int) else (spatial_size, spatial_size) 16 | self.temporal_size = temporal_size 17 | self.pool_size = (self.temporal_size,) + self.spatial_size 18 | 19 | if self.spatial_type == 'avg': 20 | self.op = nn.AvgPool3d(self.pool_size, stride=1, padding=0) 21 | 22 | def init_weights(self): 23 | pass 24 | 25 | def forward(self, input): 26 | return self.op(input) 27 | -------------------------------------------------------------------------------- /mmaction/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tenons.backbones import * 2 | from .tenons.spatial_temporal_modules import * 3 | from .tenons.segmental_consensuses import * 4 | from .tenons.cls_heads import * 5 | from .recognizers import * 6 | from .tenons.necks import * 7 | 8 | from .registry import (BACKBONES, SPATIAL_TEMPORAL_MODULES, SEGMENTAL_CONSENSUSES, HEADS, 9 | RECOGNIZERS, LOCALIZERS, DETECTORS, ARCHITECTURES, 10 | NECKS, ROI_EXTRACTORS) 11 | from .builder import (build_backbone, build_spatial_temporal_module, build_segmental_consensus, 12 | build_head, build_recognizer, build_detector, 13 | build_localizer, build_architecture, 14 | build_neck, build_roi_extractor) 15 | 16 | __all__ = [ 17 | 'BACKBONES', 'SPATIAL_TEMPORAL_MODULES', 'SEGMENTAL_CONSENSUSES', 'HEADS', 18 | 'RECOGNIZERS', 'LOCALIZERS', 'DETECTORS', 'ARCHITECTURES', 19 | 'NECKS', 'ROI_EXTRACTORS', 20 | 'build_backbone', 'build_spatial_temporal_module', 'build_segmental_consensus', 21 | 'build_head', 'build_recognizer', 'build_detector', 22 | 'build_localizer', 'build_architecture', 23 | 'build_neck', 'build_roi_extractor' 24 | ] 25 | -------------------------------------------------------------------------------- /mmaction/models/recognizers/base.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | from abc import ABCMeta, abstractmethod 3 | 4 | import torch.nn as nn 5 | 6 | 7 | class BaseRecognizer(nn.Module): 8 | """Base class for recognizers""" 9 | 10 | __metaclass__ = ABCMeta 11 | 12 | def __init__(self): 13 | super(BaseRecognizer, self).__init__() 14 | 15 | @property 16 | def with_tenon_list(self): 17 | return hasattr(self, 'tenon_list') and self.tenon_list is not None 18 | 19 | @property 20 | def with_cls(self): 21 | return hasattr(self, 'cls_head') and self.cls_head is not None 22 | 23 | @abstractmethod 24 | def forward_train(self, num_modalities, **kwargs): 25 | pass 26 | 27 | @abstractmethod 28 | def forward_test(self, num_modalities, **kwargs): 29 | pass 30 | 31 | def init_weights(self, pretrained=None): 32 | if pretrained is not None: 33 | logger = logging.getLogger() 34 | logger.info("load model from: {}".format(pretrained)) 35 | 36 | def forward(self, num_modalities, img_meta, return_loss=True, **kwargs): 37 | num_modalities = int(num_modalities[0]) 38 | if return_loss: 39 | return self.forward_train(num_modalities, img_meta, **kwargs) 40 | else: 41 | return self.forward_test(num_modalities, img_meta, **kwargs) 42 | -------------------------------------------------------------------------------- /mmaction/core/evaluation/accuracy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import confusion_matrix 3 | 4 | 5 | def softmax(x, dim=1): 6 | """Compute softmax values for each sets of scores in x.""" 7 | e_x = np.exp(x - np.max(x, axis=dim, keepdims=True)) 8 | return e_x / e_x.sum(axis=dim, keepdims=True) 9 | 10 | 11 | def mean_class_accuracy(scores, labels): 12 | pred = np.argmax(scores, axis=1) 13 | cf = confusion_matrix(labels, pred).astype(float) 14 | 15 | cls_cnt = cf.sum(axis=1) 16 | cls_hit = np.diag(cf) 17 | 18 | return np.mean(cls_hit / cls_cnt) 19 | 20 | 21 | def non_mean_class_accuracy(scores, labels): 22 | pred = np.argmax(scores, axis=1) 23 | cf = confusion_matrix(labels, pred).astype(float) 24 | 25 | cls_cnt = cf.sum(axis=1) 26 | cls_hit = np.diag(cf) 27 | 28 | return cls_hit / cls_cnt 29 | 30 | 31 | def top_k_acc(score, lb_set, k=3): 32 | idx = np.argsort(score)[-k:] 33 | return len(lb_set.intersection(idx)), len(lb_set) 34 | 35 | 36 | def top_k_hit(score, lb_set, k=3): 37 | idx = np.argsort(score)[-k:] 38 | return len(lb_set.intersection(idx)) > 0, 1 39 | 40 | 41 | def top_k_accuracy(scores, labels, k=(1,)): 42 | res = [] 43 | for kk in k: 44 | hits = [] 45 | for x, y in zip(scores, labels): 46 | y = [y] if isinstance(y, int) else y 47 | hits.append(top_k_hit(x, set(y), k=kk)[0]) 48 | res.append(np.mean(hits)) 49 | return res 50 | -------------------------------------------------------------------------------- /mmaction/models/tenons/segmental_consensuses/simple_consensus.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ...registry import SEGMENTAL_CONSENSUSES 5 | 6 | 7 | class _SimpleConsensus(torch.autograd.Function): 8 | """Simplest segmental consensus module""" 9 | 10 | def __init__(self, 11 | consensus_type='avg', 12 | dim=1): 13 | super(_SimpleConsensus, self).__init__() 14 | 15 | assert consensus_type in ['avg'] 16 | self.consensus_type = consensus_type 17 | self.dim = dim 18 | self.shape = None 19 | 20 | def forward(self, x): 21 | self.shape = 
x.size() 22 | if self.consensus_type == 'avg': 23 | output = x.mean(dim=self.dim, keepdim=True) 24 | else: 25 | output = None 26 | return output 27 | 28 | def backward(self, grad_output): 29 | if self.consensus_type == 'avg': 30 | grad_in = grad_output.expand(self.shape) / float(self.shape[self.dim]) 31 | else: 32 | grad_in = None 33 | return grad_in 34 | 35 | 36 | @SEGMENTAL_CONSENSUSES.register_module 37 | class SimpleConsensus(nn.Module): 38 | def __init__(self, consensus_type, dim=1): 39 | super(SimpleConsensus, self).__init__() 40 | 41 | assert consensus_type in ['avg'] 42 | self.consensus_type = consensus_type 43 | self.dim = dim 44 | 45 | def init_weights(self): 46 | pass 47 | 48 | def forward(self, input): 49 | return _SimpleConsensus(self.consensus_type, self.dim)(input) 50 | -------------------------------------------------------------------------------- /mmaction/models/registry.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class Registry(object): 5 | 6 | def __init__(self, name): 7 | self._name = name 8 | self._module_dict = dict() 9 | 10 | @property 11 | def name(self): 12 | return self._name 13 | 14 | @property 15 | def module_dict(self): 16 | return self._module_dict 17 | 18 | def _register_module(self, module_class): 19 | """Register a module 20 | 21 | Args: 22 | module (:obj:`nn.Module`): Module to be registered. 23 | """ 24 | if not issubclass(module_class, nn.Module): 25 | raise TypeError( 26 | 'module must be a child of nn.Module, but got {}'.format( 27 | module_class)) 28 | module_name = module_class.__name__ 29 | if module_name in self._module_dict: 30 | raise KeyError('{} is already registered in {}'.format( 31 | module_name, self.name)) 32 | self._module_dict[module_name] = module_class 33 | 34 | def register_module(self, cls): 35 | self._register_module(cls) 36 | return cls 37 | 38 | 39 | BACKBONES = Registry('backbone') 40 | FLOWNETS = Registry('flownet') 41 | SPATIAL_TEMPORAL_MODULES = Registry('spatial_temporal_module') 42 | SEGMENTAL_CONSENSUSES = Registry('segmental_consensus') 43 | HEADS = Registry('head') 44 | RECOGNIZERS = Registry('recognizer') 45 | LOCALIZERS = Registry('localizer') 46 | DETECTORS = Registry('detector') 47 | ARCHITECTURES = Registry('architecture') 48 | NECKS = Registry('neck') 49 | ROI_EXTRACTORS = Registry('roi_extractor') 50 | -------------------------------------------------------------------------------- /mmaction/utils/misc.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import numpy as np 3 | import mmcv 4 | 5 | 6 | def rsetattr(obj, attr, val): 7 | ''' 8 | See: 9 | https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects 10 | ''' 11 | pre, _, post = attr.rpartition('.') 12 | return setattr(rgetattr(obj, pre) if pre else obj, post, val) 13 | 14 | 15 | def rgetattr(obj, attr, *args): 16 | def _getattr(obj, attr): 17 | return getattr(obj, attr, *args) 18 | 19 | return functools.reduce(_getattr, [obj] + attr.split('.')) 20 | 21 | 22 | def rhasattr(obj, attr, *args): 23 | def _hasattr(obj, attr): 24 | if hasattr(obj, attr): 25 | return getattr(obj, attr) 26 | else: 27 | return None 28 | 29 | return functools.reduce(_hasattr, [obj] + attr.split('.')) is not None 30 | 31 | 32 | def tensor2video_snaps(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True): 33 | num_videos = tensor.size(0) 34 | num_frames = tensor.size(2) 35 | mean = np.array(mean, dtype=np.float32) 36 | std = 
np.array(std, dtype=np.float32)
37 |     video_snaps = []
38 |     for vid_id in range(num_videos):
39 |         img = tensor[vid_id, :, num_frames //
40 |                      2, ...].cpu().numpy().transpose(1, 2, 0)
41 |         img = mmcv.imdenormalize(
42 |             img, mean, std, to_bgr=to_rgb).astype(np.uint8)
43 |         video_snaps.append(np.ascontiguousarray(img))
44 |     return video_snaps
45 | 
46 | 
47 | def multi_apply(func, *args, **kwargs):
48 |     pfunc = functools.partial(func, **kwargs) if kwargs else func
49 |     map_results = map(pfunc, *args)
50 |     return tuple(map(list, zip(*map_results)))
51 | 
--------------------------------------------------------------------------------
/docs/assets/font.css:
--------------------------------------------------------------------------------
 1 | /* Homepage Font */
 2 | 
 3 | /* latin-ext */
 4 | @font-face {
 5 |   font-family: 'Lato';
 6 |   font-style: normal;
 7 |   font-weight: 400;
 8 |   src: local('Lato Regular'), local('Lato-Regular'), url(https://fonts.gstatic.com/s/lato/v16/S6uyw4BMUTPHjxAwXjeu.woff2) format('woff2');
 9 |   unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF;
10 | }
11 | 
12 | /* latin */
13 | @font-face {
14 |   font-family: 'Lato';
15 |   font-style: normal;
16 |   font-weight: 400;
17 |   src: local('Lato Regular'), local('Lato-Regular'), url(https://fonts.gstatic.com/s/lato/v16/S6uyw4BMUTPHjx4wXg.woff2) format('woff2');
18 |   unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD;
19 | }
20 | 
21 | /* latin-ext */
22 | @font-face {
23 |   font-family: 'Lato';
24 |   font-style: normal;
25 |   font-weight: 700;
26 |   src: local('Lato Bold'), local('Lato-Bold'), url(https://fonts.gstatic.com/s/lato/v16/S6u9w4BMUTPHh6UVSwaPGR_p.woff2) format('woff2');
27 |   unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF;
28 | }
29 | 
30 | /* latin */
31 | @font-face {
32 |   font-family: 'Lato';
33 |   font-style: normal;
34 |   font-weight: 700;
35 |   src: local('Lato Bold'), local('Lato-Bold'), url(https://fonts.gstatic.com/s/lato/v16/S6u9w4BMUTPHh6UVSwiPGQ.woff2) format('woff2');
36 |   unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD;
37 | }
38 | 
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
 1 | ## Data Preparation
 2 | 
 3 | ### Notes on Video Data format
 4 | Since the original VideoDataloader of MMAction requires [decord](https://github.com/zhreshold/decord) for efficient video loading, which is non-trivial to compile, this repo only supports the **raw frame** format of videos. Therefore, you have to extract frames from the raw videos first. We will look into other libraries and support a VideoLoader soon.
 5 | 
 6 | ### Supported datasets
 7 | The `rawframe_dataset` loads data in a general manner: you prepare a `.txt` annotation file in which each line contains the directory path of the extracted frames, the total number of frames of that video, and the ground-truth label. After that, specify the `data_root` and `image_tmpl` in the config files, e.g. as in the abridged config snippet below.
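For instance, the dataset settings of a config such as `config_files/sthv1/tsm_baseline.py` (shown in full later in this listing) tie the annotation file, the frame root, and the frame-name template together roughly as follows; this is an abridged excerpt, and the `data_root` value is only a placeholder for wherever your extracted frames live:

```python
# Abridged dataset section of a config (cf. config_files/sthv1/tsm_baseline.py).
# The data_root path is a placeholder; point it at your own frame directory.
dataset_type = 'RawFramesDataset'
data_root = 'data/sthv1/rawframes/'   # assumed location of the extracted frames

data = dict(
    videos_per_gpu=8,
    workers_per_gpu=8,
    train=dict(
        type=dataset_type,
        ann_file='data/sthv1/train_videofolder.txt',  # the .txt annotation list
        img_prefix=data_root,
        image_tmpl='{:05d}.jpg',      # frame files named like 00001.jpg, 00002.jpg, ...
        num_segments=8,
        modality='RGB'))
```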
The annotation file itself looks like the sample below:
 8 | 
 9 | ```bash
10 | shot_put/c5-PBp04AQI 299 298
11 | marching/5OEnoefcO1Y 299 192
12 | dancing_ballet/pR1jxLvjcgU 249 84
13 | motorcycling/0dC3o90WYHs 299 199
14 | hoverboarding/RVkof6bxvg0 278 157
15 | playing_piano/H3JzOkvTrJk 297 241
16 | ```
17 | Such a general loader should also make it easy to experiment with other datasets, e.g. UCF101 or a custom dataset.
18 | 
19 | ### Prepare annotations
20 | 
21 | - [Kinetics400](https://deepmind.com/research/open-source/open-source-datasets/kinetics/) contains ~240k training videos and ~19k validation videos. See the [guide](https://github.com/open-mmlab/mmaction/tree/master/data_tools/kinetics400/PREPARING_KINETICS400.md) of the original MMAction to generate annotations.
22 | - [Something-Something](https://github.com/TwentyBN) has two versions, both of which you have to apply for on their [website](https://20bn.com/datasets/something-something). See the [guide](https://github.com/mit-han-lab/temporal-shift-module/tree/master/tools) of TSM to generate annotations.
23 | 
24 | We thank the original [MMAction](https://github.com/open-mmlab/mmaction) and [TSM](https://github.com/mit-han-lab/temporal-shift-module) repos for kindly providing the preprocessing scripts.
25 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # cython generated cpp 107 | mmaction/version.py 108 | .vscode 109 | .idea 110 | -------------------------------------------------------------------------------- /mmaction/models/tenons/utils/norm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | norm_cfg = { 4 | # format: layer_type: (abbreviation, module) 5 | 'BN': ('bn', nn.BatchNorm2d), 6 | 'SyncBN': ('bn', None), 7 | 'GN': ('gn', nn.GroupNorm), 8 | # and potentially 'SN' 9 | } 10 | 11 | 12 | def build_norm_layer(cfg, num_features, postfix=''): 13 | """ Build normalization layer 14 | Args: 15 | cfg (dict): cfg should contain: 16 | type (str): identify norm layer type. 17 | layer args: args needed to instantiate a norm layer. 18 | frozen (bool): [optional] whether stop gradient updates 19 | of norm layer, it is helpful to set frozen mode 20 | in backbone's norms. 21 | num_features (int): number of channels from input 22 | postfix (int, str): appended into norm abbreation to 23 | create named layer. 
24 | Returns: 25 | name (str): abbreation + postfix 26 | layer (nn.Module): created norm layer 27 | """ 28 | assert isinstance(cfg, dict) and 'type' in cfg 29 | cfg_ = cfg.copy() 30 | 31 | layer_type = cfg_.pop('type') 32 | if layer_type not in norm_cfg: 33 | raise KeyError('Unrecognized norm type {}'.format(layer_type)) 34 | else: 35 | abbr, norm_layer = norm_cfg[layer_type] 36 | if norm_layer is None: 37 | raise NotImplementedError 38 | 39 | assert isinstance(postfix, (int, str)) 40 | name = abbr + str(postfix) 41 | 42 | frozen = cfg_.pop('frozen', False) 43 | cfg_.setdefault('eps', 1e-5) 44 | if layer_type != 'GN': 45 | layer = norm_layer(num_features, **cfg_) 46 | else: 47 | assert 'num_groups' in cfg_ 48 | layer = norm_layer(num_channels=num_features, **cfg_) 49 | 50 | if frozen: 51 | for param in layer.parameters(): 52 | param.requires_grad = False 53 | 54 | return name, layer 55 | -------------------------------------------------------------------------------- /mmaction/models/tenons/spatial_temporal_modules/avgfusion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ...registry import SPATIAL_TEMPORAL_MODULES 5 | 6 | 7 | @SPATIAL_TEMPORAL_MODULES.register_module 8 | class AvgFusion(nn.Module): 9 | def __init__(self, fusion_type='concat'): 10 | super(AvgFusion, self).__init__() 11 | assert fusion_type in ['add', 'avg', 'concat', 'concatadd', 'concatavg'] 12 | self.fusion_type = fusion_type 13 | 14 | def init_weights(self): 15 | pass 16 | 17 | def forward(self, input): 18 | assert (isinstance(input, tuple)) 19 | after_avgpool = [F.adaptive_avg_pool3d(each, 1) for each in input] 20 | 21 | if self.fusion_type == 'add': 22 | out = torch.sum(torch.cat(after_avgpool, -1), -1, keepdim=True) 23 | 24 | elif self.fusion_type == 'avg': 25 | out = torch.mean(torch.cat(after_avgpool, -1), -1, keepdim=True) 26 | 27 | elif self.fusion_type == 'concat': 28 | out = torch.cat(after_avgpool, 1) 29 | 30 | elif self.fusion_type == 'concatadd': 31 | out_first = torch.cat(after_avgpool[:-1], 1) 32 | out = torch.sum(torch.cat([out_first, after_avgpool[-1]], -1), -1, keepdim=True) 33 | elif self.fusion_type == 'concatavg': 34 | out_first = torch.cat(after_avgpool[:-1], 1) 35 | out = torch.mean(torch.cat([out_first, after_avgpool[-1]], -1), -1, keepdim=True) 36 | else: 37 | raise ValueError 38 | 39 | return out 40 | 41 | 42 | def main(): 43 | res2 = torch.FloatTensor(8, 512, 8, 56, 56).cuda() 44 | res3 = torch.FloatTensor(8, 512, 8, 28, 28).cuda() 45 | res4 = torch.FloatTensor(8, 512, 8, 14, 14).cuda() 46 | res5 = torch.FloatTensor(8, 512, 8, 7, 7).cuda() 47 | feature = tuple([res2, res3, res4, res5]) 48 | model = AvgFusion(fusion_type='add').cuda() 49 | out = model(feature) 50 | print(out.shape) 51 | 52 | 53 | if __name__ == '__main__': 54 | main() 55 | -------------------------------------------------------------------------------- /mmaction/core/utils/dist_utils.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch.distributed as dist 4 | from torch._utils import (_flatten_dense_tensors, _unflatten_dense_tensors, 5 | _take_tensors) 6 | from mmcv.runner import OptimizerHook 7 | 8 | 9 | def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1): 10 | if bucket_size_mb > 0: 11 | bucket_size_bytes = bucket_size_mb * 1024 * 1024 12 | buckets = _take_tensors(tensors, bucket_size_bytes) 13 | 
else: 14 | buckets = OrderedDict() 15 | for tensor in tensors: 16 | tp = tensor.type() 17 | if tp not in buckets: 18 | buckets[tp] = [] 19 | buckets[tp].append(tensor) 20 | buckets = buckets.values() 21 | 22 | for bucket in buckets: 23 | flat_tensors = _flatten_dense_tensors(bucket) 24 | dist.all_reduce(flat_tensors) 25 | flat_tensors.div_(world_size) 26 | for tensor, synced in zip( 27 | bucket, _unflatten_dense_tensors(flat_tensors, bucket)): 28 | tensor.copy_(synced) 29 | 30 | 31 | def allreduce_grads(model, coalesce=True, bucket_size_mb=-1): 32 | grads = [ 33 | param.grad.data for param in model.parameters() 34 | if param.requires_grad and param.grad is not None 35 | ] 36 | world_size = dist.get_world_size() 37 | if coalesce: 38 | _allreduce_coalesced(grads, world_size, bucket_size_mb) 39 | else: 40 | for tensor in grads: 41 | dist.all_reduce(tensor.div_(world_size)) 42 | 43 | 44 | class DistOptimizerHook(OptimizerHook): 45 | 46 | def __init__(self, grad_clip=None, coalesce=True, bucket_size_mb=-1): 47 | self.grad_clip = grad_clip 48 | self.coalesce = coalesce 49 | self.bucket_size_mb = bucket_size_mb 50 | 51 | def after_train_iter(self, runner): 52 | runner.optimizer.zero_grad() 53 | runner.outputs['loss'].backward() 54 | allreduce_grads(runner.model, self.coalesce, self.bucket_size_mb) 55 | if self.grad_clip is not None: 56 | self.clip_grads(runner.model.parameters()) 57 | runner.optimizer.step() 58 | -------------------------------------------------------------------------------- /mmaction/apis/env.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | 5 | import numpy as np 6 | import torch 7 | import torch.distributed as dist 8 | import torch.multiprocessing as mp 9 | from mmcv.runner import get_dist_info 10 | import subprocess 11 | 12 | 13 | def init_dist(launcher, backend='nccl', **kwargs): 14 | if mp.get_start_method(allow_none=True) is None: 15 | mp.set_start_method('spawn') 16 | if launcher == 'pytorch': 17 | _init_dist_pytorch(backend, **kwargs) 18 | elif launcher == 'mpi': 19 | _init_dist_mpi(backend, **kwargs) 20 | elif launcher == 'slurm': 21 | _init_dist_slurm(backend, **kwargs) 22 | else: 23 | raise ValueError('Invalid launcher type: {}'.format(launcher)) 24 | 25 | 26 | def _init_dist_pytorch(backend, **kwargs): 27 | # TODO: use local_rank instead of rank % num_gpus 28 | rank = int(os.environ['RANK']) 29 | num_gpus = torch.cuda.device_count() 30 | torch.cuda.set_device(rank % num_gpus) 31 | dist.init_process_group(backend=backend, **kwargs) 32 | 33 | 34 | def _init_dist_mpi(backend, **kwargs): 35 | raise NotImplementedError 36 | 37 | 38 | def _init_dist_slurm(backend, port=12345, **kwargs): 39 | proc_id = int(os.environ['SLURM_PROCID']) 40 | ntasks = int(os.environ['SLURM_NTASKS']) 41 | node_list = os.environ['SLURM_NODELIST'] 42 | num_gpus = torch.cuda.device_count() 43 | torch.cuda.set_device(proc_id % num_gpus) 44 | addr = subprocess.getoutput( 45 | 'scontrol show hostname {} | head -n1'.format(node_list)) 46 | os.environ['MASTER_PORT'] = str(port) 47 | os.environ['MASTER_ADDR'] = addr 48 | os.environ['WORLD_SIZE'] = str(ntasks) 49 | os.environ['RANK'] = str(proc_id) 50 | dist.init_process_group(backend=backend) 51 | # raise NotImplementedError 52 | 53 | 54 | def set_random_seed(seed): 55 | random.seed(seed) 56 | np.random.seed(seed) 57 | torch.manual_seed(seed) 58 | torch.cuda.manual_seed_all(seed) 59 | 60 | 61 | def get_root_logger(log_level=logging.INFO): 62 | logger = 
logging.getLogger()
63 |     if not logger.hasHandlers():
64 |         logging.basicConfig(
65 |             format='%(asctime)s - %(levelname)s - %(message)s',
66 |             level=log_level)
67 |     rank, _ = get_dist_info()
68 |     if rank != 0:
69 |         logger.setLevel('ERROR')
70 |     return logger
71 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Temporal Pyramid Network for Action Recognition
 2 | 
 3 | ![image](./docs/figures/framework.png)
 4 | [[Paper](https://arxiv.org/pdf/2004.03548.pdf)]
 5 | [[Project Page](https://decisionforce.github.io/TPN/)]
 6 | 
 7 | 
 8 | ## License
 9 | The project is released under the [Apache 2.0 license](./LICENSE).
10 | 
11 | ## Model Zoo
12 | Results and reference models are available in the [model zoo](./MODELZOO.md).
13 | 
14 | ## Installation and Data Preparation
15 | Please refer to [INSTALL](INSTALL.md) for installation and [DATA](./data/README.md) for data preparation.
16 | 
17 | ## Get Started
18 | Please refer to [GETTING_STARTED](./tools/README.md) for detailed usage.
19 | 
20 | ## Quick Demo
21 | We provide `test_video.py` to run inference on a single video.
22 | Download the checkpoints, put them under `ckpt/`, and run:
23 | ```
24 | python ./test_video.py ${CONFIG_FILE} ${CHECKPOINT_FILE} --video_file ${VIDEO_NAME} --label_file ${LABEL_FILE} --rendered_output ${RENDERED_NAME}
25 | ```
26 | Arguments:
27 | - `--video_file`: Path of the demo video, default is `./demo/demo.mp4`
28 | - `--label_file`: The label file for the pretrained model, default is `demo/category.txt`
29 | - `--rendered_output`: The output file name. If specified, the script will render the output video with the predicted label name, default is `demo/demo_pred.webm`.
30 | 
31 | For example, we can run prediction on the demo video (download it [here](https://drive.google.com/open?id=14VYS8hGA5i1J70qBqrUqLiDxJq_FgXiW) and put it under `demo/`) by running:
32 | ```
33 | python ./test_video.py config_files/sthv2/tsm_tpn.py ckpt/sthv2_tpn.pth
34 | ```
35 | The rendered output video:
36 | 
37 | ![image](./demo/demo_pred.gif)
38 | 
39 | ## Acknowledgement
40 | We really appreciate the developers of [MMAction](https://github.com/open-mmlab/mmaction) for such a wonderful codebase. We also thank Yue Zhao for the insightful discussion.
41 | 
42 | ## Contact
43 | This repo is currently maintained by Ceyuan Yang ([@limbo0000](https://github.com/limbo0000)) and Yinghao Xu ([@justimyhxu](https://github.com/justimyhxu)).
44 | 45 | ## Bibtex 46 | ``` 47 | @inproceedings{yang2020tpn, 48 | title={Temporal Pyramid Network for Action Recognition}, 49 | author={Yang, Ceyuan and Xu, Yinghao and Shi, Jianping and Dai, Bo and Zhou, Bolei}, 50 | booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, 51 | year={2020}, 52 | } 53 | ``` 54 | -------------------------------------------------------------------------------- /mmaction/models/builder.py: -------------------------------------------------------------------------------- 1 | import mmcv 2 | from torch import nn 3 | 4 | from .registry import (BACKBONES, FLOWNETS, SPATIAL_TEMPORAL_MODULES, 5 | SEGMENTAL_CONSENSUSES, HEADS, 6 | RECOGNIZERS, DETECTORS, LOCALIZERS, ARCHITECTURES, 7 | NECKS, ROI_EXTRACTORS) 8 | 9 | 10 | def _build_module(cfg, registry, default_args): 11 | assert isinstance(cfg, dict) and 'type' in cfg 12 | assert isinstance(default_args, dict) or default_args is None 13 | args = cfg.copy() 14 | obj_type = args.pop('type') 15 | if mmcv.is_str(obj_type): 16 | if obj_type not in registry.module_dict: 17 | raise KeyError('{} is not in the {} registry'.format( 18 | obj_type, registry.name)) 19 | obj_type = registry.module_dict[obj_type] 20 | elif not isinstance(obj_type, type): 21 | raise TypeError('type must be a str or valid type, but got {}'.format( 22 | type(obj_type))) 23 | if default_args is not None: 24 | for name, value in default_args.items(): 25 | args.setdefault(name, value) 26 | return obj_type(**args) 27 | 28 | 29 | def build(cfg, registry, default_args=None): 30 | if isinstance(cfg, list): 31 | modules = [_build_module(cfg_, registry, default_args) for cfg_ in cfg] 32 | return nn.Sequential(*modules) 33 | else: 34 | return _build_module(cfg, registry, default_args) 35 | 36 | 37 | def build_backbone(cfg): 38 | return build(cfg, BACKBONES) 39 | 40 | 41 | def build_flownet(cfg): 42 | return build(cfg, FLOWNETS) 43 | 44 | 45 | def build_spatial_temporal_module(cfg): 46 | return build(cfg, SPATIAL_TEMPORAL_MODULES) 47 | 48 | 49 | def build_segmental_consensus(cfg): 50 | return build(cfg, SEGMENTAL_CONSENSUSES) 51 | 52 | 53 | def build_head(cfg): 54 | return build(cfg, HEADS) 55 | 56 | 57 | def build_recognizer(cfg, train_cfg=None, test_cfg=None): 58 | return build(cfg, RECOGNIZERS, 59 | dict(train_cfg=train_cfg, test_cfg=test_cfg)) 60 | 61 | 62 | def build_localizer(cfg, train_cfg=None, test_cfg=None): 63 | return build(cfg, LOCALIZERS, dict(train_cfg=train_cfg, test_cfg=test_cfg)) 64 | 65 | 66 | def build_detector(cfg, train_cfg=None, test_cfg=None): 67 | return build(cfg, DETECTORS, dict(train_cfg=train_cfg, test_cfg=test_cfg)) 68 | 69 | 70 | def build_architecture(cfg, train_cfg=None, test_cfg=None): 71 | return build(cfg, ARCHITECTURES, 72 | dict(train_cfg=train_cfg, test_cfg=test_cfg)) 73 | 74 | 75 | def build_neck(cfg): 76 | return build(cfg, NECKS) 77 | 78 | 79 | def build_roi_extractor(cfg): 80 | return build(cfg, ROI_EXTRACTORS) 81 | -------------------------------------------------------------------------------- /tools/train_recognizer.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import argparse 4 | from mmcv import Config 5 | 6 | from mmaction import __version__ 7 | from mmaction.datasets import get_trimmed_dataset 8 | from mmaction.apis import (train_network, init_dist, get_root_logger, 9 | set_random_seed) 10 | from mmaction.models import build_recognizer 11 | import torch 12 | 13 | 14 | def parse_args(): 15 | 
parser = argparse.ArgumentParser(description='Train an action recognizer') 16 | parser.add_argument('config', help='train config file path') 17 | parser.add_argument('--work_dir', help='the dir to save logs and models') 18 | parser.add_argument( 19 | '--resume_from', help='the checkpoint file to resume from') 20 | parser.add_argument( 21 | '--validate', 22 | action='store_true', 23 | help='whether to evaluate the checkpoint during training') 24 | parser.add_argument( 25 | '--gpus', 26 | type=int, 27 | default=1, 28 | help='number of gpus to use ' 29 | '(only applicable to non-distributed training)') 30 | parser.add_argument('--seed', type=int, default=None, help='random seed') 31 | parser.add_argument( 32 | '--launcher', 33 | choices=['none', 'pytorch', 'slurm', 'mpi'], 34 | default='none', 35 | help='job launcher') 36 | parser.add_argument('--local_rank', type=int, default=0) 37 | args = parser.parse_args() 38 | 39 | return args 40 | 41 | 42 | def main(): 43 | args = parse_args() 44 | 45 | cfg = Config.fromfile(args.config) 46 | # set cudnn_benchmark 47 | if cfg.get('cudnn_benchmark', False): 48 | torch.backends.cudnn.benchmark = True 49 | # update configs according to CLI args 50 | if args.work_dir is not None: 51 | cfg.work_dir = args.work_dir 52 | if args.resume_from is not None: 53 | cfg.resume_from = args.resume_from 54 | cfg.gpus = args.gpus 55 | if cfg.checkpoint_config is not None: 56 | # save mmaction version in checkpoints as meta data 57 | cfg.checkpoint_config.meta = dict( 58 | mmact_version=__version__, config=cfg.text) 59 | 60 | # init distributed env first, since logger depends on the dist info. 61 | if args.launcher == 'none': 62 | distributed = False 63 | else: 64 | distributed = True 65 | init_dist(args.launcher, **cfg.dist_params) 66 | 67 | # init logger before other steps 68 | logger = get_root_logger(cfg.log_level) 69 | logger.info('Distributed training: {}'.format(distributed)) 70 | 71 | # set random seeds 72 | if args.seed is not None: 73 | logger.info('Set random seed to {}'.format(args.seed)) 74 | set_random_seed(args.seed) 75 | 76 | model = build_recognizer( 77 | cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) 78 | 79 | train_dataset = get_trimmed_dataset(cfg.data.train) 80 | train_network( 81 | model, 82 | train_dataset, 83 | cfg, 84 | distributed=distributed, 85 | validate=args.validate, 86 | logger=logger) 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /mmaction/models/tenons/utils/conv_module.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import torch.nn as nn 4 | from mmcv.cnn import kaiming_init, constant_init 5 | 6 | from .norm import build_norm_layer 7 | 8 | 9 | class ConvModule(nn.Module): 10 | 11 | def __init__(self, 12 | in_channels, 13 | out_channels, 14 | kernel_size, 15 | stride=1, 16 | padding=0, 17 | dilation=1, 18 | groups=1, 19 | bias=True, 20 | normalize=None, 21 | activation='relu', 22 | inplace=True, 23 | activate_last=True): 24 | super(ConvModule, self).__init__() 25 | self.with_norm = normalize is not None 26 | self.with_activatation = activation is not None 27 | self.with_bias = bias 28 | self.activation = activation 29 | self.activate_last = activate_last 30 | 31 | if self.with_norm and self.with_bias: 32 | warnings.warn('ConvModule has norm and bias at the same time') 33 | 34 | self.conv = nn.Conv2d( 35 | in_channels, 36 | out_channels, 37 | kernel_size, 38 | stride, 39 
| padding, 40 | dilation, 41 | groups, 42 | bias=bias) 43 | 44 | self.in_channels = self.conv.in_channels 45 | self.out_channels = self.conv.out_channels 46 | self.kernel_size = self.conv.kernel_size 47 | self.stride = self.conv.stride 48 | self.padding = self.conv.padding 49 | self.dilation = self.conv.dilation 50 | self.transposed = self.conv.transposed 51 | self.output_padding = self.conv.output_padding 52 | self.groups = self.conv.groups 53 | 54 | if self.with_norm: 55 | norm_channels = out_channels if self.activate_last else in_channels 56 | self.norm_name, norm = build_norm_layer(normalize, norm_channels) 57 | self.add_module(self.norm_name, norm) 58 | 59 | if self.with_activatation: 60 | assert activation in ['relu'], 'Only ReLU supported.' 61 | if self.activation == 'relu': 62 | self.activate = nn.ReLU(inplace=inplace) 63 | 64 | # Default using msra init 65 | self.init_weights() 66 | 67 | @property 68 | def norm(self): 69 | return getattr(self, self.norm_name) 70 | 71 | def init_weights(self): 72 | nonlinearity = 'relu' if self.activation is None else self.activation 73 | kaiming_init(self.conv, nonlinearity=nonlinearity) 74 | if self.with_norm: 75 | constant_init(self.norm, 1, bias=0) 76 | 77 | def forward(self, x, activate=True, norm=True): 78 | if self.activate_last: 79 | x = self.conv(x) 80 | if norm and self.with_norm: 81 | x = self.norm(x) 82 | if activate and self.with_activatation: 83 | x = self.activate(x) 84 | else: 85 | if norm and self.with_norm: 86 | x = self.norm(x) 87 | if activate and self.with_activatation: 88 | x = self.activate(x) 89 | x = self.conv(x) 90 | return x 91 | -------------------------------------------------------------------------------- /mmaction/models/tenons/cls_heads/cls_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ...registry import HEADS 5 | 6 | 7 | @HEADS.register_module 8 | class ClsHead(nn.Module): 9 | """Simplest classification head""" 10 | 11 | def __init__(self, 12 | with_avg_pool=True, 13 | temporal_feature_size=1, 14 | spatial_feature_size=7, 15 | dropout_ratio=0.8, 16 | in_channels=2048, 17 | num_classes=101, 18 | fcn_testing=False, 19 | init_std=0.01): 20 | 21 | super(ClsHead, self).__init__() 22 | 23 | self.with_avg_pool = with_avg_pool 24 | self.dropout_ratio = dropout_ratio 25 | self.in_channels = in_channels 26 | self.dropout_ratio = dropout_ratio 27 | self.temporal_feature_size = temporal_feature_size 28 | self.spatial_feature_size = spatial_feature_size 29 | self.init_std = init_std 30 | self.fcn_testing = fcn_testing 31 | 32 | if self.dropout_ratio != 0: 33 | self.dropout = nn.Dropout(p=self.dropout_ratio) 34 | else: 35 | self.dropout = None 36 | # self.with_avg_pool = fcn_testing 37 | if self.with_avg_pool: 38 | self.avg_pool = nn.AvgPool3d((temporal_feature_size, spatial_feature_size, spatial_feature_size), (1, 1, 1), 39 | (0, 0, 0)) 40 | if self.fcn_testing: 41 | self.new_cls = None 42 | self.in_channels = in_channels 43 | self.num_classes = num_classes 44 | self.fc_cls = nn.Linear(in_channels, num_classes) 45 | 46 | def init_weights(self): 47 | nn.init.normal_(self.fc_cls.weight, 0, self.init_std) 48 | nn.init.constant_(self.fc_cls.bias, 0) 49 | 50 | def forward(self, x): 51 | if not self.fcn_testing: 52 | if x.ndimension() == 4: 53 | x = x.unsqueeze(2) 54 | assert x.shape[1] == self.in_channels 55 | assert x.shape[2] == self.temporal_feature_size 56 | assert x.shape[3] == 
self.spatial_feature_size 57 | assert x.shape[4] == self.spatial_feature_size 58 | if self.with_avg_pool: 59 | x = self.avg_pool(x) 60 | if self.dropout is not None: 61 | x = self.dropout(x) 62 | x = x.view(x.size(0), -1) 63 | cls_score = self.fc_cls(x) 64 | return cls_score 65 | else: 66 | if self.with_avg_pool: 67 | x = self.avg_pool(x) 68 | if self.new_cls is None: 69 | self.new_cls = nn.Conv3d(self.in_channels, self.num_classes, 1, 1, 0).cuda() 70 | self.new_cls.weight.copy_(self.fc_cls.weight.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)) 71 | self.new_cls.bias.copy_(self.fc_cls.bias) 72 | self.fc_cls = None 73 | class_map = self.new_cls(x) 74 | # return class_map.mean([2,3,4]) 75 | return class_map 76 | 77 | def loss(self, 78 | cls_score, 79 | labels): 80 | losses = dict() 81 | losses['loss_cls'] = F.cross_entropy(cls_score, labels) 82 | 83 | return losses 84 | -------------------------------------------------------------------------------- /mmaction/datasets/loader/build_loader.py: -------------------------------------------------------------------------------- 1 | # from functools import partial 2 | # 3 | # from mmcv.runner import get_dist_info 4 | # from mmcv.parallel import collate 5 | # from torch.utils.data import DataLoader 6 | # 7 | # from .sampler import GroupSampler, DistributedGroupSampler 8 | # 9 | # # https://github.com/pytorch/pytorch/issues/973 10 | # import resource 11 | # rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 12 | # resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) 13 | # 14 | # 15 | # def build_dataloader(dataset, 16 | # imgs_per_gpu, 17 | # workers_per_gpu, 18 | # num_gpus=1, 19 | # dist=True, 20 | # **kwargs): 21 | # if dist: 22 | # rank, world_size = get_dist_info() 23 | # sampler = DistributedGroupSampler(dataset, imgs_per_gpu, world_size, 24 | # rank) 25 | # batch_size = imgs_per_gpu 26 | # num_workers = workers_per_gpu 27 | # else: 28 | # if not kwargs.get('shuffle', True): 29 | # sampler = None 30 | # else: 31 | # sampler = GroupSampler(dataset, imgs_per_gpu) 32 | # batch_size = num_gpus * imgs_per_gpu 33 | # num_workers = num_gpus * workers_per_gpu 34 | # 35 | # data_loader = DataLoader( 36 | # dataset, 37 | # batch_size=batch_size, 38 | # sampler=sampler, 39 | # num_workers=num_workers, 40 | # collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu), 41 | # pin_memory=False, 42 | # **kwargs) 43 | # 44 | # return data_loader 45 | from functools import partial 46 | 47 | from mmcv.runner import get_dist_info 48 | from mmcv.parallel import collate 49 | from torch.utils.data import DataLoader 50 | 51 | from .sampler import GroupSampler, DistributedGroupSampler, DistributedSampler 52 | 53 | # https://github.com/pytorch/pytorch/issues/973 54 | import resource 55 | 56 | rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 57 | resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) 58 | 59 | 60 | def build_dataloader(dataset, 61 | imgs_per_gpu, 62 | workers_per_gpu, 63 | num_gpus=1, 64 | dist=True, 65 | **kwargs): 66 | shuffle = kwargs.get('shuffle', True) 67 | if dist: 68 | rank, world_size = get_dist_info() 69 | if shuffle: 70 | sampler = DistributedGroupSampler(dataset, imgs_per_gpu, 71 | world_size, rank) 72 | else: 73 | sampler = DistributedSampler( 74 | dataset, imgs_per_gpu, world_size, rank, shuffle=False) 75 | batch_size = imgs_per_gpu 76 | num_workers = workers_per_gpu 77 | else: 78 | sampler = GroupSampler(dataset, imgs_per_gpu) if shuffle else None 79 | batch_size = num_gpus * imgs_per_gpu 80 | num_workers = num_gpus * 
workers_per_gpu 81 | 82 | data_loader = DataLoader( 83 | dataset, 84 | batch_size=batch_size, 85 | sampler=sampler, 86 | num_workers=num_workers, 87 | collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu), 88 | pin_memory=False, 89 | **kwargs) 90 | 91 | return data_loader 92 | -------------------------------------------------------------------------------- /mmaction/core/evaluation/eval_hooks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import logging 4 | import mmcv 5 | import time 6 | import torch 7 | import numpy as np 8 | import torch.distributed as dist 9 | from mmcv.runner import Hook, obj_from_dict 10 | from mmcv.parallel import scatter, collate 11 | from torch.utils.data import Dataset 12 | 13 | from mmaction import datasets 14 | from .accuracy import top_k_accuracy 15 | 16 | 17 | class DistEvalHook(Hook): 18 | def __init__(self, dataset, interval=1): 19 | if isinstance(dataset, Dataset): 20 | self.dataset = dataset 21 | elif isinstance(dataset, dict): 22 | self.dataset = obj_from_dict(dataset, datasets, 23 | {'test_mode': True}) 24 | else: 25 | raise TypeError( 26 | 'dataset must be a Dataset object or a dict, not {}'.format( 27 | type(dataset))) 28 | self.interval = interval 29 | 30 | def after_train_epoch(self, runner): 31 | if not self.every_n_epochs(runner, self.interval): 32 | return 33 | runner.model.eval() 34 | results = [None for _ in range(len(self.dataset))] 35 | if runner.rank == 0: 36 | prog_bar = mmcv.ProgressBar(len(self.dataset)) 37 | for idx in range(runner.rank, len(self.dataset), runner.world_size): 38 | data = self.dataset[idx] 39 | data_gpu = scatter( 40 | collate([data], samples_per_gpu=1), 41 | [torch.cuda.current_device()])[0] 42 | 43 | # compute output 44 | with torch.no_grad(): 45 | result = runner.model( 46 | return_loss=False, rescale=True, **data_gpu) 47 | results[idx] = result 48 | 49 | batch_size = runner.world_size 50 | if runner.rank == 0: 51 | for _ in range(batch_size): 52 | prog_bar.update() 53 | 54 | if runner.rank == 0: 55 | print('\n') 56 | dist.barrier() 57 | for i in range(1, runner.world_size): 58 | tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i)) 59 | tmp_results = mmcv.load(tmp_file) 60 | for idx in range(i, len(results), runner.world_size): 61 | results[idx] = tmp_results[idx] 62 | os.remove(tmp_file) 63 | self.evaluate(runner, results) 64 | else: 65 | tmp_file = osp.join(runner.work_dir, 66 | 'temp_{}.pkl'.format(runner.rank)) 67 | mmcv.dump(results, tmp_file) 68 | dist.barrier() 69 | dist.barrier() 70 | 71 | def evaluate(self): 72 | raise NotImplementedError 73 | 74 | 75 | class DistEvalTopKAccuracyHook(DistEvalHook): 76 | 77 | def __init__(self, 78 | dataset, 79 | k=(1,)): 80 | super(DistEvalTopKAccuracyHook, self).__init__(dataset) 81 | self.k = k 82 | 83 | def evaluate(self, runner, results): 84 | gt_labels = [] 85 | for i in range(len(self.dataset)): 86 | ann = self.dataset.get_ann_info(i) 87 | gt_labels.append(ann['label']) 88 | 89 | results = [res.squeeze() for res in results] 90 | top1, top5 = top_k_accuracy(results, gt_labels, k=self.k) 91 | runner.mode = 'val' 92 | runner.log_buffer.output['top1_acc'] = top1 93 | runner.log_buffer.output['top5_acc'] = top5 94 | runner.log_buffer.ready = True 95 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import time 4 | from 
setuptools import find_packages, setup 5 | 6 | 7 | def readme(): 8 | with open('README.md', encoding='utf-8') as f: 9 | content = f.read() 10 | return content 11 | 12 | 13 | MAJOR = 0 14 | MINOR = 1 15 | PATCH = 'rc0' 16 | SUFFIX = '' 17 | SHORT_VERSION = '{}.{}.{}{}'.format(MAJOR, MINOR, PATCH, SUFFIX) 18 | 19 | version_file = 'mmaction/version.py' 20 | 21 | 22 | def get_git_hash(): 23 | def _minimal_ext_cmd(cmd): 24 | # construct minimal environment 25 | env = {} 26 | for k in ['SYSTEMROOT', 'PATH', 'HOME']: 27 | v = os.environ.get(k) 28 | if v is not None: 29 | env[k] = v 30 | # LANGUAGE is used on win32 31 | env['LANGUAGE'] = 'C' 32 | env['LANG'] = 'C' 33 | env['LC_ALL'] = 'C' 34 | out = subprocess.Popen( 35 | cmd, stdout=subprocess.PIPE, env=env).communicate()[0] 36 | return out 37 | 38 | try: 39 | out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) 40 | sha = out.strip().decode('ascii') 41 | except OSError: 42 | sha = 'unknown' 43 | 44 | return sha 45 | 46 | 47 | def get_hash(): 48 | if os.path.exists('.git'): 49 | sha = get_git_hash()[:7] 50 | elif os.path.exists(version_file): 51 | try: 52 | from mmaction.version import __version__ 53 | sha = __version__.split('+')[-1] 54 | except ImportError: 55 | raise ImportError('Unable to get git version') 56 | else: 57 | sha = 'unknown' 58 | 59 | return sha 60 | 61 | 62 | def write_version_py(): 63 | content = """# GENERATED VERSION FILE 64 | # TIME: {} 65 | 66 | __version__ = '{}' 67 | short_version = '{}' 68 | """ 69 | sha = get_hash() 70 | VERSION = SHORT_VERSION + '+' + sha 71 | 72 | with open(version_file, 'w') as f: 73 | f.write(content.format(time.asctime(), VERSION, SHORT_VERSION)) 74 | 75 | 76 | def get_version(): 77 | with open(version_file, 'r') as f: 78 | exec(compile(f.read(), version_file, 'exec')) 79 | return locals()['__version__'] 80 | 81 | 82 | if __name__ == '__main__': 83 | write_version_py() 84 | setup( 85 | name='mmaction', 86 | version=get_version(), 87 | description='Open MMLab Action Toolbox', 88 | long_description=readme(), 89 | keywords='computer vision, action recognition', 90 | url='https://github.com/open-mmlab/mmaction', 91 | packages=find_packages(exclude=('configs', 'tools', 'demo')), 92 | package_data={'mmaction.ops': ['*/*.so']}, 93 | classifiers=[ 94 | 'Development Status :: 4 - Beta', 95 | 'License :: OSI Approved :: Apache Software License', 96 | 'Operating System :: OS Independent', 97 | 'Programming Language :: Python :: 2', 98 | 'Programming Language :: Python :: 2.7', 99 | 'Programming Language :: Python :: 3', 100 | 'Programming Language :: Python :: 3.4', 101 | 'Programming Language :: Python :: 3.5', 102 | 'Programming Language :: Python :: 3.6', 103 | ], 104 | license='Apache License 2.0', 105 | setup_requires=['pytest-runner'], 106 | tests_require=['pytest'], 107 | install_requires=[ 108 | 'mmcv', 'numpy', 'scipy', 'scikit-learn', 'terminaltables', 'lmdb', 'joblib' 109 | ], 110 | zip_safe=False) 111 | -------------------------------------------------------------------------------- /config_files/sthv1/tsm_baseline.py: -------------------------------------------------------------------------------- 1 | model = dict( 2 | type='TSN2D', 3 | backbone=dict( 4 | type='ResNet', 5 | pretrained='modelzoo://resnet50', 6 | nsegments=8, 7 | depth=50, 8 | out_indices=(3,), 9 | tsm=True, 10 | bn_eval=False, 11 | partial_bn=False), 12 | spatial_temporal_module=dict( 13 | type='SimpleSpatialModule', 14 | spatial_type='avg', 15 | spatial_size=7), 16 | segmental_consensus=dict( 17 | type='SimpleConsensus', 18 | 
consensus_type='avg'), 19 | cls_head=dict( 20 | type='ClsHead', 21 | with_avg_pool=False, 22 | temporal_feature_size=1, 23 | spatial_feature_size=1, 24 | dropout_ratio=0.5, 25 | in_channels=2048, 26 | num_classes=174)) 27 | train_cfg = None 28 | test_cfg = None 29 | # dataset settings 30 | dataset_type = 'RawFramesDataset' 31 | data_root = '' 32 | data_root_val = '' 33 | 34 | img_norm_cfg = dict( 35 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 36 | 37 | data = dict( 38 | videos_per_gpu=8, 39 | workers_per_gpu=8, 40 | train=dict( 41 | type=dataset_type, 42 | ann_file='data/sthv1/train_videofolder.txt', 43 | img_prefix=data_root, 44 | img_norm_cfg=img_norm_cfg, 45 | num_segments=8, 46 | new_length=1, 47 | new_step=1, 48 | random_shift=True, 49 | modality='RGB', 50 | image_tmpl='{:05d}.jpg', 51 | img_scale=256, 52 | input_size=224, 53 | flip_ratio=0.5, 54 | resize_keep_ratio=True, 55 | resize_crop=True, 56 | color_jitter=True, 57 | color_space_aug=True, 58 | oversample=None, 59 | max_distort=1, 60 | test_mode=False), 61 | val=dict( 62 | type=dataset_type, 63 | ann_file='data/sthv1/val_videofolder.txt', 64 | img_prefix=data_root_val, 65 | img_norm_cfg=img_norm_cfg, 66 | num_segments=8, 67 | new_length=1, 68 | new_step=1, 69 | random_shift=False, 70 | modality='RGB', 71 | image_tmpl='{:05d}.jpg', 72 | img_scale=256, 73 | input_size=224, 74 | flip_ratio=0, 75 | resize_keep_ratio=True, 76 | oversample=None, 77 | test_mode=False), 78 | test=dict( 79 | type=dataset_type, 80 | ann_file='data/sthv1/val_videofolder.txt', 81 | img_prefix=data_root_val, 82 | img_norm_cfg=img_norm_cfg, 83 | num_segments=16, 84 | new_length=1, 85 | new_step=1, 86 | random_shift=False, 87 | modality='RGB', 88 | image_tmpl='{:05d}.jpg', 89 | img_scale=256, 90 | input_size=256, 91 | flip_ratio=0, 92 | resize_keep_ratio=True, 93 | oversample="three_crop", 94 | test_mode=True)) 95 | # optimizer 96 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True) 97 | optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2)) 98 | # learning policy 99 | lr_config = dict( 100 | policy='step', 101 | step=[75, 125]) 102 | checkpoint_config = dict(interval=1) 103 | workflow = [('train', 1)] 104 | # yapf:disable 105 | log_config = dict( 106 | interval=20, 107 | hooks=[ 108 | dict(type='TextLoggerHook'), 109 | # dict(type='TensorboardLoggerHook') 110 | ]) 111 | # yapf:enable 112 | # runtime settings 113 | total_epochs = 150 114 | dist_params = dict(backend='nccl') 115 | log_level = 'INFO' 116 | load_from = None 117 | resume_from = None 118 | -------------------------------------------------------------------------------- /config_files/sthv2/tsm_baseline.py: -------------------------------------------------------------------------------- 1 | model = dict( 2 | type='TSN2D', 3 | backbone=dict( 4 | type='ResNet', 5 | pretrained='modelzoo://resnet50', 6 | nsegments=8, 7 | depth=50, 8 | out_indices=(3,), 9 | tsm=True, 10 | bn_eval=False, 11 | partial_bn=False), 12 | spatial_temporal_module=dict( 13 | type='SimpleSpatialModule', 14 | spatial_type='avg', 15 | spatial_size=7), 16 | segmental_consensus=dict( 17 | type='SimpleConsensus', 18 | consensus_type='avg'), 19 | cls_head=dict( 20 | type='ClsHead', 21 | with_avg_pool=False, 22 | temporal_feature_size=1, 23 | spatial_feature_size=1, 24 | dropout_ratio=0.5, 25 | in_channels=2048, 26 | num_classes=174)) 27 | train_cfg = None 28 | test_cfg = None 29 | # dataset settings 30 | dataset_type = 'RawFramesDataset' 31 | data_root = '' 32 
| data_root_val = '' 33 | 34 | img_norm_cfg = dict( 35 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 36 | 37 | data = dict( 38 | videos_per_gpu=8, 39 | workers_per_gpu=8, 40 | train=dict( 41 | type=dataset_type, 42 | ann_file='data/sthv2/train_videofolder.txt', 43 | img_prefix=data_root, 44 | img_norm_cfg=img_norm_cfg, 45 | num_segments=8, 46 | new_length=1, 47 | new_step=1, 48 | random_shift=True, 49 | modality='RGB', 50 | image_tmpl='img_{:05d}.jpg', 51 | img_scale=256, 52 | input_size=224, 53 | flip_ratio=0.5, 54 | resize_keep_ratio=True, 55 | resize_crop=True, 56 | color_jitter=True, 57 | color_space_aug=True, 58 | oversample=None, 59 | max_distort=1, 60 | test_mode=False), 61 | val=dict( 62 | type=dataset_type, 63 | ann_file='data/sthv2/val_videofolder.txt', 64 | img_prefix=data_root_val, 65 | img_norm_cfg=img_norm_cfg, 66 | num_segments=8, 67 | new_length=1, 68 | new_step=1, 69 | random_shift=False, 70 | modality='RGB', 71 | image_tmpl='img_{:05d}.jpg', 72 | img_scale=256, 73 | input_size=224, 74 | flip_ratio=0, 75 | resize_keep_ratio=True, 76 | oversample=None, 77 | test_mode=False), 78 | test=dict( 79 | type=dataset_type, 80 | ann_file='data/sthv2/val_videofolder.txt', 81 | img_prefix=data_root_val, 82 | img_norm_cfg=img_norm_cfg, 83 | num_segments=16, 84 | new_length=1, 85 | new_step=1, 86 | random_shift=False, 87 | modality='RGB', 88 | image_tmpl='img_{:05d}.jpg', 89 | img_scale=256, 90 | input_size=256, 91 | flip_ratio=0, 92 | resize_keep_ratio=True, 93 | oversample="three_crop", 94 | test_mode=True)) 95 | # optimizer 96 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True) 97 | optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2)) 98 | # learning policy 99 | lr_config = dict( 100 | policy='step', 101 | step=[75, 125]) 102 | checkpoint_config = dict(interval=1) 103 | workflow = [('train', 1)] 104 | # yapf:disable 105 | log_config = dict( 106 | interval=20, 107 | hooks=[ 108 | dict(type='TextLoggerHook'), 109 | # dict(type='TensorboardLoggerHook') 110 | ]) 111 | # yapf:enable 112 | # runtime settings 113 | total_epochs = 150 114 | dist_params = dict(backend='nccl') 115 | log_level = 'INFO' 116 | load_from = None 117 | resume_from = None 118 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | This directory provides basic tutorials for the usage of MMAction. 4 | 5 | After installation of codebase and preparation of data, you could use the given scripts for training/evaluating your models. 6 | 7 | ### Test a reference model 8 | Our codebase supports distributed and non-distributed evaluation mode for reference model. Actually, distributed testing is a little faster than non-distributed testing. 9 | ``` 10 | # non-distributed testing 11 | python tools/test_recognizer.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] {--gpus ${GPU_NUM}} --ignore_cache --fcn_testing 12 | 13 | # distributed testing 14 | ./tools/dist_test_recognizer.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --ignore_cache --fcn_testing 15 | ``` 16 | Optional arguments: 17 | - `--ignore_cache`: If specified, the results cache will be ignored. 18 | - `--fcn_testing`: If specified, spatially fully-convolutional testing is performed via 3 crops approximation. 
19 | - `--flip`: If specified, all frames are flipped before being fed into the model. 20 | 21 | **Important**: some of our models may require a machine with more than 24 GB of memory. 22 | 23 | Examples: 24 | Assume that you have already downloaded the checkpoints to the directory `ckpt/`. 25 | 26 | 1. Test the tpn_f8s8 model in non-distributed evaluation mode on 8 GPUs 27 | ``` 28 | python ./tools/test_recognizer.py config_files/kinetics400/tpn/r50f8s8.py ckpt/kinetics400_tpn_r50f8s8 --gpus 8 --out ckpt/kinetics400_tpn_r50f8s8.pkl --fcn_testing --ignore_cache 29 | ``` 30 | 2. Test the tpn_f8s8 model in distributed evaluation mode on 8 GPUs 31 | ```shell 32 | ./tools/dist_test_recognizer.sh config_files/kinetics400/tpn/r50f8s8.py ckpt/kinetics400_tpn_r50f8s8 8 --out ckpt/kinetics400_tpn_r50f8s8.pkl --fcn_testing --ignore_cache 33 | ``` 34 | 35 | ### Train a model 36 | 37 | Our codebase supports both distributed and non-distributed training. 38 | 39 | All outputs (log files and checkpoints) will be saved to the working directory, 40 | which is specified by `work_dir` in the config file. 41 | 42 | By default, the model is evaluated on the validation set after each epoch; you can change the evaluation interval by adding the `interval` argument to the training config. 43 | ```python 44 | evaluation = dict(interval=10)  # Evaluate the model every 10 epochs. 45 | ``` 46 | 47 | #### Train with a single GPU 48 | ``` 49 | python tools/train_recognizer.py ${CONFIG_FILE} 50 | ``` 51 | If you want to specify the working directory in the command, you can add the argument `--work_dir ${YOUR_WORK_DIR}`. 52 | 53 | #### Train with multiple GPUs 54 | ```shell 55 | ./tools/dist_train_recognizer.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments] 56 | ``` 57 | 58 | Optional arguments: 59 | - `--validate`: Perform evaluation after every epoch during training. 60 | - `--work_dir`: All outputs (log files and checkpoints) will be saved to the working directory. 61 | - `--resume_from`: Resume from a previous checkpoint file. 62 | 63 | Difference between `resume_from` and `load_from`: `resume_from` loads both the model weights and the optimizer state, and the epoch is also inherited from the specified checkpoint. It is typically used to resume training that was interrupted accidentally. `load_from` only loads the model weights, and training starts from epoch 0. It is typically used for finetuning. 64 | 65 | **Important**: The default learning rate in the config files is for 8 GPUs and 8 videos/GPU (batch size = 8*8 = 64). According to the Linear Scaling Rule, you should set the learning rate proportional to the batch size if you use a different number of GPUs or videos per GPU, e.g., lr=0.01 for 8 GPUs * 8 videos/GPU and lr=0.04 for 32 GPUs * 8 videos/GPU.
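For reference, here is a minimal sketch of how the Linear Scaling Rule above translates into a concrete learning rate. It is not part of the codebase; the helper `scaled_lr` is made up purely for illustration and only uses the numbers quoted above.
```python
# Minimal sketch of the Linear Scaling Rule described above (illustrative only, not part of the repo).
BASE_LR = 0.01            # default lr in the provided config files
BASE_BATCH_SIZE = 8 * 8   # 8 GPUs * 8 videos per GPU = 64

def scaled_lr(num_gpus, videos_per_gpu):
    """Scale the learning rate proportionally to the global batch size."""
    return BASE_LR * (num_gpus * videos_per_gpu) / BASE_BATCH_SIZE

print(scaled_lr(8, 8))    # 0.01 -> matches the default configs
print(scaled_lr(32, 8))   # 0.04 -> matches the 32-GPU example above
```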
66 | 67 | Here is the example of using 8 GPUs to train Kinetics400_r50_f8s8: 68 | ```shell 69 | ./tools/dist_train_recognizer.sh config_files/kinetics400/tpn/r50f8s8.py 8 --validate 70 | ``` 71 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r101f16s4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet101', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=16, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=16, 56 | new_step=4, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=16, 79 | new_step=4, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=16, 98 | new_step=4, 99 | random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 117 | checkpoint_config = dict(interval=1) 118 | workflow = [('train', 1)] 119 | # yapf:disable 120 | log_config = 
dict( 121 | interval=20, 122 | hooks=[ 123 | dict(type='TextLoggerHook'), 124 | # dict(type='TensorboardLoggerHook') 125 | ]) 126 | # yapf:enable 127 | # runtime settings 128 | total_epochs = 150 129 | dist_params = dict(backend='nccl') 130 | log_level = 'INFO' 131 | load_from = None 132 | resume_from = None 133 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r101f8s8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet101', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=False, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=8, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=8, 56 | new_step=8, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=8, 79 | new_step=8, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=8, 98 | new_step=8, 99 | random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 
117 | checkpoint_config = dict(interval=1) 118 | # workflow = [('train', 5), ('val', 1)] 119 | workflow = [('train', 1)] 120 | # yapf:disable 121 | log_config = dict( 122 | interval=20, 123 | hooks=[ 124 | dict(type='TextLoggerHook'), 125 | # dict(type='TensorboardLoggerHook') 126 | ]) 127 | # yapf:enable 128 | # runtime settings 129 | total_epochs = 150 130 | dist_params = dict(backend='nccl') 131 | log_level = 'INFO' 132 | load_from = None 133 | resume_from = None 134 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r50f8s8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet50', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=False, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=8, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=8, 56 | new_step=8, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=8, 79 | new_step=8, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=8, 98 | new_step=8, 99 | random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | 
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 117 | checkpoint_config = dict(interval=1) 118 | # workflow = [('train', 5), ('val', 1)] 119 | workflow = [('train', 1)] 120 | # yapf:disable 121 | log_config = dict( 122 | interval=20, 123 | hooks=[ 124 | dict(type='TextLoggerHook'), 125 | # dict(type='TensorboardLoggerHook') 126 | ]) 127 | # yapf:enable 128 | # runtime settings 129 | total_epochs = 150 130 | dist_params = dict(backend='nccl') 131 | log_level = 'INFO' 132 | load_from = None 133 | resume_from = None 134 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r101f32s2.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet101', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=32, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=32, 56 | new_step=2, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=32, 79 | new_step=2, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=32, 98 | new_step=2, 99 | random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | 
oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 117 | checkpoint_config = dict(interval=1) 118 | # workflow = [('train', 5), ('val', 1)] 119 | workflow = [('train', 1)] 120 | # yapf:disable 121 | log_config = dict( 122 | interval=20, 123 | hooks=[ 124 | dict(type='TextLoggerHook'), 125 | # dict(type='TensorboardLoggerHook') 126 | ]) 127 | # yapf:enable 128 | # runtime settings 129 | total_epochs = 150 130 | dist_params = dict(backend='nccl') 131 | log_level = 'INFO' 132 | load_from = None 133 | resume_from = None 134 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r50f16s4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet50', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=16, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=16, 56 | new_step=4, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=16, 79 | new_step=4, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=16, 98 | new_step=4, 99 | 
random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 117 | checkpoint_config = dict(interval=1) 118 | # workflow = [('train', 5), ('val', 1)] 119 | workflow = [('train', 1)] 120 | # yapf:disable 121 | log_config = dict( 122 | interval=20, 123 | hooks=[ 124 | dict(type='TextLoggerHook'), 125 | # dict(type='TensorboardLoggerHook') 126 | ]) 127 | # yapf:enable 128 | # runtime settings 129 | total_epochs = 150 130 | dist_params = dict(backend='nccl') 131 | log_level = 'INFO' 132 | load_from = None 133 | resume_from = None 134 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r50f32s2.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet50', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=32, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=32, 56 | new_step=2, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=32, 79 | new_step=2, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | 
ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=32, 98 | new_step=2, 99 | random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 117 | checkpoint_config = dict(interval=1) 118 | # workflow = [('train', 5), ('val', 1)] 119 | workflow = [('train', 1)] 120 | # yapf:disable 121 | log_config = dict( 122 | interval=20, 123 | hooks=[ 124 | dict(type='TextLoggerHook'), 125 | # dict(type='TensorboardLoggerHook') 126 | ]) 127 | # yapf:enable 128 | # runtime settings 129 | total_epochs = 150 130 | dist_params = dict(backend='nccl') 131 | log_level = 'INFO' 132 | load_from = None 133 | resume_from = None 134 | -------------------------------------------------------------------------------- /docs/assets/style.css: -------------------------------------------------------------------------------- 1 | /* Homepage Style */ 2 | 3 | /* Body */ 4 | body { 5 | background: #e3e5e8; 6 | color: #ffffff; 7 | font-family: 'Lato', Verdana, Helvetica, sans-serif; 8 | font-weight: 300; 9 | font-size: 14pt; 10 | } 11 | 12 | /* Headings */ 13 | h1 { 14 | font-size: 30pt; 15 | } 16 | 17 | h2 { 18 | font-size: 22pt; 19 | } 20 | 21 | h3 { 22 | font-size: 14pt; 23 | } 24 | 25 | /* Hyperlinks */ 26 | a { 27 | text-decoration: none; 28 | } 29 | 30 | a:link { 31 | color: #1772d0; 32 | } 33 | 34 | a:visited { 35 | color: #1772d0; 36 | } 37 | 38 | a:active { 39 | color: red; 40 | } 41 | 42 | a:hover { 43 | color: #f09228; 44 | } 45 | 46 | pre { 47 | background: #fcfcfc; 48 | border: 0; 49 | font-size: 12pt; 50 | margin: 5pt auto; 51 | } 52 | 53 | /* Container */ 54 | .container { 55 | width: 768pt; 56 | min-height: 100pt; 57 | margin: 15pt auto; 58 | padding: 20pt; 59 | border: 1pt hidden #000; 60 | text-align: justify; 61 | color: #000000; 62 | background: #ffffff; 63 | } 64 | 65 | .container .title { 66 | text-align: center; 67 | font-size: 22pt; 68 | margin: 5pt auto; 69 | } 70 | 71 | .container .author { 72 | text-align: center; 73 | font-size: 16pt; 74 | margin: 20pt auto; 75 | } 76 | 77 | .container .institution { 78 | text-align: center; 79 | font-size: 16pt; 80 | margin: 20pt auto; 81 | } 82 | 83 | .container .link { 84 | text-align: center; 85 | font-size: 16pt; 86 | margin: 20pt auto; 87 | } 88 | 89 | .container .teaser { 90 | text-align: center; 91 | } 92 | 93 | .container .teaser img { 94 | text-align: center; 95 | margin: 20pt auto; 96 | width: 95%; 97 | } 98 | 99 | .container .body { 100 | text-align: justify; 101 | font-size: 14pt; 102 | margin: 10pt auto; 103 | } 104 | 105 | .container .bibtex { 106 | text-align: left; 107 | font-size: 22pt; 108 | margin: 5pt auto; 109 | } 110 | 111 | .container .ref { 112 | text-align: left; 113 | font-size: 18pt; 114 | font-weight: bold; 115 | margin: 15pt auto; 116 | } 117 | 118 | .container .citation { 119 | margin: 8pt auto; 120 | font-size: 14pt; 121 | clear: both; 122 | } 123 | 124 | .container .citation img { 125 | float: left; 126 | margin: 0 8pt 8pt 0; /*top right bottom left*/ 127 
| width: 120pt; 128 | } 129 | 130 | /* Homepage */ 131 | /* Followings can be removed for single project page. */ 132 | .homepage { 133 | width: 768pt; 134 | min-height: 100pt; 135 | margin: 15pt auto; 136 | padding: 20pt; 137 | border: 1pt hidden #000; 138 | text-align: justify; 139 | color: #000000; 140 | background: #ffffff; 141 | } 142 | 143 | .homepage .header { 144 | margin-top: 30pt; 145 | margin-bottom: 60pt; 146 | margin-right: 70pt; 147 | font-size: 28pt; 148 | text-align: center; 149 | } 150 | 151 | .homepage .header img { 152 | height: 80pt; 153 | float: left; 154 | object-fit: cover; 155 | margin-left: 20pt; 156 | } 157 | 158 | .homepage .section { 159 | text-align: left; 160 | font-size: 25pt; 161 | font-weight: bolder; 162 | margin: 50pt 20pt 20pt 20pt; /*top right bottom left*/ 163 | } 164 | 165 | .homepage .project { 166 | height: 130pt; 167 | outline: thin dotted #666666; 168 | margin: 10pt 20pt 10pt 20pt; /*top right bottom left*/ 169 | } 170 | 171 | .homepage .project .image { 172 | height: 120pt; 173 | width: 160pt; 174 | float: left; 175 | text-align: center; 176 | vertical-align: top; 177 | } 178 | 179 | .homepage .project .image img { 180 | height: 120pt; 181 | width: 160pt; 182 | object-fit: cover; 183 | border-radius: 6pt; 184 | box-shadow: 1pt 1pt 2pt #888888; 185 | -moz-box-shadow: 1pt 1pt 2pt #888888; 186 | -webkit-box-shadow: 1pt 1pt 2pt #888888; 187 | margin: 5pt; 188 | } 189 | 190 | .homepage .project .info { 191 | font-size: 16pt; 192 | text-align: left; 193 | margin: 10pt 20pt 0 180pt; /*top right bottom left*/ 194 | } 195 | 196 | .homepage .avatar { 197 | margin: -10pt 20pt 320pt 0pt; /*top right bottom left*/ 198 | } 199 | 200 | .homepage .avatar table { 201 | float: left; 202 | width: auto; 203 | height: auto; 204 | margin: 10pt auto; 205 | text-align: center; 206 | font-size: 16pt; 207 | border-collapse: separate; 208 | border-spacing: 20pt 10pt; 209 | } 210 | 211 | .homepage .avatar img { 212 | height: 100pt; 213 | width: 100pt; 214 | object-fit: cover; 215 | } 216 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | TPN 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 |
23 |
24 | Temporal Pyramid Network for Action Recognition 25 |
26 |
27 | Ceyuan Yang*,1,  28 | Yinghao Xu*,1,  29 | Jianping Shi2,  30 | Bo Dai1,  31 | Bolei Zhou1  32 |
33 |
34 | 1The Chinese University of Hong Kong, 35 | 2SenseTime Group Limited
36 |
37 | 41 |
42 | 43 |
44 |
45 | 46 | 47 | 48 | 49 |
50 |
Overview
51 |
52 | Visual tempo characterizes the dynamics and the temporal scale of an action, i.e., how fast 53 | an action goes. 54 | Modeling the visual tempos of different actions facilitates their recognition. 55 | In this work we propose a generic Temporal Pyramid Network (TPN) at the feature level, which can be flexibly 56 | integrated into 2D or 3D backbone networks in a plug-and-play manner. 57 | TPN also shows consistent improvements over other challenging baselines on several action recognition datasets. 58 | A further analysis reveals that TPN gains most of its improvements on action classes that have large 59 | variances in their visual tempos, validating the effectiveness of TPN. 60 |
61 |
62 | 63 | 64 | 65 | 66 |
67 |
Results
68 |
69 |
  • Quantitative Results
  • 70 |

    71 | Our TPN achieves 78.9%, 49.0% and 62.0% top-1 accuracy on the mainstream benchmarks of action 72 | recognition, i.e., Kinetics-400, Something-Something V1 and V2 respectively, which outperforms 73 | other state-of-the-art methods. More detailed comparisons and ablation studies are presented in our paper.

    75 |
  • Empirical Study
  • 76 |

    Per-class Performance Gain vs. Per-class Variance of Visual Tempos: 77 | Figure 4 indicates that the performance gain is clearly positively correlated with the variance of visual 78 | tempos. This study strongly verifies our motivation that TPN brings a significant improvement for 79 | actions with large variances of visual tempo.

    80 |

    Robustness of TPN to Visual Tempo Variation: 81 | Figure 5 suggests that TPN helps improve the robustness of I3D-50, resulting in a curve with milder 82 | fluctuations. More discussion is presented in our experimental section. 83 |

    84 |
    85 | 86 |
    87 | 88 |
    89 |
    90 | 91 | 92 | 93 | 94 |
    95 |
    Bibtex
    96 |
     97 | @inproceedings{yang2020tpn,
     98 |   title   = {Temporal Pyramid Network for Action Recognition},
     99 |   author  = {Yang, Ceyuan and Xu, Yinghao and Shi, Jianping and Dai, Bo and Zhou, Bolei},
    100 |   booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
    101 |   year    = {2020}
    102 | }
    103 | 
    104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /config_files/sthv1/tsm_tpn.py: -------------------------------------------------------------------------------- 1 | model = dict( 2 | type='TSN2D', 3 | backbone=dict( 4 | type='ResNet', 5 | pretrained='modelzoo://resnet50', 6 | depth=50, 7 | nsegments=8, 8 | out_indices=(2, 3), 9 | tsm=True, 10 | bn_eval=False, 11 | partial_bn=False), 12 | necks=dict( 13 | type='TPN', 14 | in_channels=[1024, 2048], 15 | out_channels=1024, 16 | spatial_modulation_config=dict( 17 | inplanes=[1024, 2048], 18 | planes=2048, 19 | ), 20 | temporal_modulation_config=dict( 21 | scales=(8, 8), 22 | param=dict( 23 | inplanes=-1, 24 | planes=-1, 25 | downsample_scale=-1, 26 | )), 27 | upsampling_config=dict( 28 | scale=(1, 1, 1), 29 | ), 30 | downsampling_config=dict( 31 | scales=(1, 1, 1), 32 | param=dict( 33 | inplanes=-1, 34 | planes=-1, 35 | downsample_scale=-1, 36 | )), 37 | level_fusion_config=dict( 38 | in_channels=[1024, 1024], 39 | mid_channels=[1024, 1024], 40 | out_channels=2048, 41 | ds_scales=[(1, 1, 1), (1, 1, 1)], 42 | ), 43 | aux_head_config=dict( 44 | inplanes=-1, 45 | planes=174, 46 | loss_weight=0.5 47 | ), 48 | ), 49 | spatial_temporal_module=dict( 50 | type='SimpleSpatialModule', 51 | spatial_type='avg', 52 | spatial_size=7), 53 | segmental_consensus=dict( 54 | type='SimpleConsensus', 55 | consensus_type='avg'), 56 | cls_head=dict( 57 | type='ClsHead', 58 | with_avg_pool=False, 59 | temporal_feature_size=1, 60 | spatial_feature_size=1, 61 | dropout_ratio=0.5, 62 | in_channels=2048, 63 | num_classes=174)) 64 | train_cfg = None 65 | test_cfg = None 66 | # dataset settings 67 | dataset_type = 'RawFramesDataset' 68 | data_root = '' 69 | data_root_val = '' 70 | 71 | img_norm_cfg = dict( 72 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 73 | 74 | data = dict( 75 | videos_per_gpu=8, 76 | workers_per_gpu=8, 77 | train=dict( 78 | type=dataset_type, 79 | ann_file='data/sthv1/train_videofolder.txt', 80 | img_prefix=data_root, 81 | img_norm_cfg=img_norm_cfg, 82 | num_segments=8, 83 | new_length=1, 84 | new_step=1, 85 | random_shift=True, 86 | modality='RGB', 87 | image_tmpl='{:05d}.jpg', 88 | img_scale=256, 89 | input_size=224, 90 | flip_ratio=0.5, 91 | resize_keep_ratio=True, 92 | resize_crop=True, 93 | color_jitter=True, 94 | color_space_aug=True, 95 | oversample=None, 96 | max_distort=1, 97 | test_mode=False), 98 | val=dict( 99 | type=dataset_type, 100 | ann_file='data/sthv1/val_videofolder.txt', 101 | img_prefix=data_root_val, 102 | img_norm_cfg=img_norm_cfg, 103 | num_segments=8, 104 | new_length=1, 105 | new_step=1, 106 | random_shift=False, 107 | modality='RGB', 108 | image_tmpl='{:05d}.jpg', 109 | img_scale=256, 110 | input_size=224, 111 | flip_ratio=0, 112 | resize_keep_ratio=True, 113 | oversample=None, 114 | test_mode=False), 115 | test=dict( 116 | type=dataset_type, 117 | ann_file='data/sthv1/val_videofolder.txt', 118 | img_prefix=data_root_val, 119 | img_norm_cfg=img_norm_cfg, 120 | num_segments=16, 121 | new_length=1, 122 | new_step=1, 123 | random_shift=False, 124 | modality='RGB', 125 | image_tmpl='{:05d}.jpg', 126 | img_scale=256, 127 | input_size=256, 128 | flip_ratio=0, 129 | resize_keep_ratio=True, 130 | oversample="three_crop", 131 | test_mode=True)) 132 | # optimizer 133 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True) 134 | optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2)) 135 | # learning 
policy 136 | lr_config = dict( 137 | policy='step', 138 | step=[75, 125]) 139 | checkpoint_config = dict(interval=1) 140 | workflow = [('train', 1)] 141 | # yapf:disable 142 | log_config = dict( 143 | interval=20, 144 | hooks=[ 145 | dict(type='TextLoggerHook'), 146 | # dict(type='TensorboardLoggerHook') 147 | ]) 148 | # yapf:enable 149 | # runtime settings 150 | total_epochs = 150 151 | dist_params = dict(backend='nccl') 152 | log_level = 'INFO' 153 | load_from = None 154 | resume_from = None 155 | -------------------------------------------------------------------------------- /config_files/sthv2/tsm_tpn.py: -------------------------------------------------------------------------------- 1 | model = dict( 2 | type='TSN2D', 3 | backbone=dict( 4 | type='ResNet', 5 | pretrained='modelzoo://resnet50', 6 | depth=50, 7 | nsegments=8, 8 | out_indices=(2, 3), 9 | tsm=True, 10 | bn_eval=False, 11 | partial_bn=False), 12 | necks=dict( 13 | type='TPN', 14 | in_channels=[1024, 2048], 15 | out_channels=1024, 16 | spatial_modulation_config=dict( 17 | inplanes=[1024, 2048], 18 | planes=2048, 19 | ), 20 | temporal_modulation_config=dict( 21 | scales=(8, 8), 22 | param=dict( 23 | inplanes=-1, 24 | planes=-1, 25 | downsample_scale=-1, 26 | )), 27 | upsampling_config=dict( 28 | scale=(1, 1, 1), 29 | ), 30 | downsampling_config=dict( 31 | scales=(1, 1, 1), 32 | param=dict( 33 | inplanes=-1, 34 | planes=-1, 35 | downsample_scale=-1, 36 | )), 37 | level_fusion_config=dict( 38 | in_channels=[1024, 1024], 39 | mid_channels=[1024, 1024], 40 | out_channels=2048, 41 | ds_scales=[(1, 1, 1), (1, 1, 1)], 42 | ), 43 | aux_head_config=dict( 44 | inplanes=-1, 45 | planes=174, 46 | loss_weight=0.5 47 | ), 48 | ), 49 | spatial_temporal_module=dict( 50 | type='SimpleSpatialModule', 51 | spatial_type='avg', 52 | spatial_size=7), 53 | segmental_consensus=dict( 54 | type='SimpleConsensus', 55 | consensus_type='avg'), 56 | cls_head=dict( 57 | type='ClsHead', 58 | with_avg_pool=False, 59 | temporal_feature_size=1, 60 | spatial_feature_size=1, 61 | dropout_ratio=0.5, 62 | in_channels=2048, 63 | num_classes=174)) 64 | train_cfg = None 65 | test_cfg = None 66 | # dataset settings 67 | dataset_type = 'RawFramesDataset' 68 | data_root = '' 69 | data_root_val = '' 70 | 71 | img_norm_cfg = dict( 72 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 73 | 74 | data = dict( 75 | videos_per_gpu=8, 76 | workers_per_gpu=8, 77 | train=dict( 78 | type=dataset_type, 79 | ann_file='data/sthv2/train_videofolder.txt', 80 | img_prefix=data_root, 81 | img_norm_cfg=img_norm_cfg, 82 | num_segments=8, 83 | new_length=1, 84 | new_step=1, 85 | random_shift=True, 86 | modality='RGB', 87 | image_tmpl='img_{:05d}.jpg', 88 | img_scale=256, 89 | input_size=224, 90 | flip_ratio=0.5, 91 | resize_keep_ratio=True, 92 | resize_crop=True, 93 | color_jitter=True, 94 | color_space_aug=True, 95 | oversample=None, 96 | max_distort=1, 97 | test_mode=False), 98 | val=dict( 99 | type=dataset_type, 100 | ann_file='data/sthv2/val_videofolder.txt', 101 | img_prefix=data_root_val, 102 | img_norm_cfg=img_norm_cfg, 103 | num_segments=8, 104 | new_length=1, 105 | new_step=1, 106 | random_shift=False, 107 | modality='RGB', 108 | image_tmpl='img_{:05d}.jpg', 109 | img_scale=256, 110 | input_size=224, 111 | flip_ratio=0, 112 | resize_keep_ratio=True, 113 | oversample=None, 114 | test_mode=False), 115 | test=dict( 116 | type=dataset_type, 117 | ann_file='data/sthv2/val_videofolder.txt', 118 | img_prefix=data_root_val, 119 | img_norm_cfg=img_norm_cfg, 120 | 
num_segments=16, 121 | new_length=1, 122 | new_step=1, 123 | random_shift=False, 124 | modality='RGB', 125 | image_tmpl='img_{:05d}.jpg', 126 | img_scale=256, 127 | input_size=256, 128 | flip_ratio=0, 129 | resize_keep_ratio=True, 130 | oversample="three_crop", 131 | test_mode=True)) 132 | # optimizer 133 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True) 134 | optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2)) 135 | # learning policy 136 | lr_config = dict( 137 | policy='step', 138 | step=[75, 125]) 139 | checkpoint_config = dict(interval=1) 140 | workflow = [('train', 1)] 141 | # yapf:disable 142 | log_config = dict( 143 | interval=20, 144 | hooks=[ 145 | dict(type='TextLoggerHook'), 146 | # dict(type='TensorboardLoggerHook') 147 | ]) 148 | # yapf:enable 149 | # runtime settings 150 | total_epochs = 150 151 | dist_params = dict(backend='nccl') 152 | log_level = 'INFO' 153 | load_from = None 154 | resume_from = None 155 | -------------------------------------------------------------------------------- /config_files/kinetics400/tpn/r50f8s8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet50', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=[2, 3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=False, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | necks=dict( 22 | type='TPN', 23 | in_channels=[1024, 2048], 24 | out_channels=1024, 25 | spatial_modulation_config=dict( 26 | inplanes=[1024, 2048], 27 | planes=2048, 28 | ), 29 | temporal_modulation_config=dict( 30 | scales=(8, 8), 31 | param=dict( 32 | inplanes=-1, 33 | planes=-1, 34 | downsample_scale=-1, 35 | )), 36 | upsampling_config=dict( 37 | scale=(1, 1, 1), 38 | ), 39 | downsampling_config=dict( 40 | scales=(1, 1, 1), 41 | param=dict( 42 | inplanes=-1, 43 | planes=-1, 44 | downsample_scale=-1, 45 | )), 46 | level_fusion_config=dict( 47 | in_channels=[1024, 1024], 48 | mid_channels=[1024, 1024], 49 | out_channels=2048, 50 | ds_scales=[(1, 1, 1), (1, 1, 1)], 51 | ), 52 | aux_head_config=dict( 53 | inplanes=-1, 54 | planes=400, 55 | loss_weight=0.5 56 | ), 57 | ), 58 | spatial_temporal_module=dict( 59 | type='SimpleSpatialTemporalModule', 60 | spatial_type='avg', 61 | temporal_size=1, 62 | spatial_size=7), 63 | segmental_consensus=dict( 64 | type='SimpleConsensus', 65 | consensus_type='avg'), 66 | cls_head=dict( 67 | type='ClsHead', 68 | with_avg_pool=False, 69 | temporal_feature_size=1, 70 | spatial_feature_size=1, 71 | dropout_ratio=0.5, 72 | in_channels=2048, 73 | num_classes=400)) 74 | train_cfg = None 75 | test_cfg = None 76 | # dataset settings 77 | dataset_type = 'RawFramesDataset' 78 | data_root = '' 79 | data_root_val = '' 80 | img_norm_cfg = dict( 81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 82 | data = dict( 83 | videos_per_gpu=8, 84 | workers_per_gpu=8, 85 | train=dict( 86 | type=dataset_type, 87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt', 88 | img_prefix=data_root, 89 | img_norm_cfg=img_norm_cfg, 90 | input_format="NCTHW", 91 | num_segments=1, 92 | new_length=8, 93 | new_step=8, 94 | random_shift=True, 95 | modality='RGB', 96 | image_tmpl='img_{:05d}.jpg', 97 | img_scale=256, 98 
| resize_keep_ratio=True, 99 | input_size=224, 100 | flip_ratio=0.5, 101 | oversample=None, 102 | resize_crop=True, 103 | color_jitter=True, 104 | color_space_aug=True, 105 | max_distort=0, 106 | test_mode=False, 107 | ), 108 | val=dict( 109 | type=dataset_type, 110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 111 | img_prefix=data_root_val, 112 | img_norm_cfg=img_norm_cfg, 113 | input_format="NCTHW", 114 | num_segments=1, 115 | new_length=8, 116 | new_step=8, 117 | random_shift=True, 118 | modality='RGB', 119 | image_tmpl='img_{:05d}.jpg', 120 | img_scale=256, 121 | input_size=224, 122 | flip_ratio=0, 123 | resize_keep_ratio=True, 124 | oversample=None, 125 | test_mode=False, 126 | ), 127 | test=dict( 128 | type=dataset_type, 129 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 130 | img_prefix=data_root_val, 131 | img_norm_cfg=img_norm_cfg, 132 | input_format="NCTHW", 133 | num_segments=10, 134 | new_length=8, 135 | new_step=8, 136 | random_shift=True, 137 | modality='RGB', 138 | image_tmpl='img_{:05d}.jpg', 139 | img_scale=256, 140 | input_size=256, 141 | flip_ratio=0, 142 | resize_keep_ratio=True, 143 | oversample='three_crop', 144 | test_mode=True, 145 | )) 146 | # optimizer 147 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 148 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 149 | # learning policy 150 | lr_config = dict( 151 | policy='step', 152 | step=[75, 125]) 153 | 154 | checkpoint_config = dict(interval=1) 155 | # workflow = [('train', 5), ('val', 1)] 156 | workflow = [('train', 1)] 157 | # yapf:disable 158 | log_config = dict( 159 | interval=20, 160 | hooks=[ 161 | dict(type='TextLoggerHook'), 162 | # dict(type='TensorboardLoggerHook') 163 | ]) 164 | # yapf:enable 165 | # runtime settings 166 | total_epochs = 150 167 | dist_params = dict(backend='nccl') 168 | log_level = 'INFO' 169 | load_from = None 170 | resume_from = None 171 | -------------------------------------------------------------------------------- /config_files/kinetics400/tpn/r101f16s4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet101', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=[2, 3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | necks=dict( 22 | type='TPN', 23 | in_channels=[1024, 2048], 24 | out_channels=1024, 25 | spatial_modulation_config=dict( 26 | inplanes=[1024, 2048], 27 | planes=2048, 28 | ), 29 | temporal_modulation_config=dict( 30 | scales=(8, 16), 31 | param=dict( 32 | inplanes=-1, 33 | planes=-1, 34 | downsample_scale=-1, 35 | )), 36 | upsampling_config=dict( 37 | scale=(1, 1, 1), 38 | ), 39 | downsampling_config=dict( 40 | scales=(2, 1, 1), 41 | param=dict( 42 | inplanes=-1, 43 | planes=-1, 44 | downsample_scale=-1, 45 | )), 46 | level_fusion_config=dict( 47 | in_channels=[1024, 1024], 48 | mid_channels=[1024, 1024], 49 | out_channels=2048, 50 | ds_scales=[(2, 1, 1), (1, 1, 1)], 51 | ), 52 | aux_head_config=dict( 53 | inplanes=-1, 54 | planes=400, 55 | loss_weight=0.5 56 | ), 57 | ), 58 | spatial_temporal_module=dict( 59 | type='SimpleSpatialTemporalModule', 60 | spatial_type='avg', 61 | 
temporal_size=1, 62 | spatial_size=7), 63 | segmental_consensus=dict( 64 | type='SimpleConsensus', 65 | consensus_type='avg'), 66 | cls_head=dict( 67 | type='ClsHead', 68 | with_avg_pool=False, 69 | temporal_feature_size=1, 70 | spatial_feature_size=1, 71 | dropout_ratio=0.5, 72 | in_channels=2048, 73 | num_classes=400)) 74 | train_cfg = None 75 | test_cfg = None 76 | # dataset settings 77 | dataset_type = 'RawFramesDataset' 78 | data_root = '' 79 | data_root_val = '' 80 | img_norm_cfg = dict( 81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 82 | data = dict( 83 | videos_per_gpu=8, 84 | workers_per_gpu=8, 85 | train=dict( 86 | type=dataset_type, 87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt', 88 | img_prefix=data_root, 89 | img_norm_cfg=img_norm_cfg, 90 | input_format="NCTHW", 91 | num_segments=1, 92 | new_length=16, 93 | new_step=4, 94 | random_shift=True, 95 | modality='RGB', 96 | image_tmpl='img_{:05d}.jpg', 97 | img_scale=256, 98 | resize_keep_ratio=True, 99 | input_size=224, 100 | flip_ratio=0.5, 101 | oversample=None, 102 | resize_crop=True, 103 | color_jitter=True, 104 | color_space_aug=True, 105 | max_distort=0, 106 | test_mode=False, 107 | ), 108 | val=dict( 109 | type=dataset_type, 110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 111 | img_prefix=data_root_val, 112 | img_norm_cfg=img_norm_cfg, 113 | input_format="NCTHW", 114 | num_segments=1, 115 | new_length=16, 116 | new_step=4, 117 | random_shift=True, 118 | modality='RGB', 119 | image_tmpl='img_{:05d}.jpg', 120 | img_scale=256, 121 | input_size=224, 122 | flip_ratio=0, 123 | resize_keep_ratio=True, 124 | oversample=None, 125 | test_mode=False, 126 | ), 127 | test=dict( 128 | type=dataset_type, 129 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 130 | img_prefix=data_root_val, 131 | img_norm_cfg=img_norm_cfg, 132 | input_format="NCTHW", 133 | num_segments=10, 134 | new_length=16, 135 | new_step=4, 136 | random_shift=True, 137 | modality='RGB', 138 | image_tmpl='img_{:05d}.jpg', 139 | img_scale=256, 140 | input_size=256, 141 | flip_ratio=0, 142 | resize_keep_ratio=True, 143 | oversample='three_crop', 144 | test_mode=True, 145 | )) 146 | # optimizer 147 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 148 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 149 | # learning policy 150 | lr_config = dict( 151 | policy='step', 152 | step=[75, 125]) 153 | 154 | checkpoint_config = dict(interval=1) 155 | # workflow = [('train', 5), ('val', 1)] 156 | workflow = [('train', 1)] 157 | # yapf:disable 158 | log_config = dict( 159 | interval=20, 160 | hooks=[ 161 | dict(type='TextLoggerHook'), 162 | # dict(type='TensorboardLoggerHook') 163 | ]) 164 | # yapf:enable 165 | # runtime settings 166 | total_epochs = 150 167 | dist_params = dict(backend='nccl') 168 | log_level = 'INFO' 169 | load_from = None 170 | resume_from = None 171 | -------------------------------------------------------------------------------- /config_files/kinetics400/tpn/r101f8s8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet101', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=[2, 3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | 
pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=False, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | necks=dict( 22 | type='TPN', 23 | in_channels=[1024, 2048], 24 | out_channels=1024, 25 | spatial_modulation_config=dict( 26 | inplanes=[1024, 2048], 27 | planes=2048, 28 | ), 29 | temporal_modulation_config=dict( 30 | scales=(4, 8), 31 | param=dict( 32 | inplanes=-1, 33 | planes=-1, 34 | downsample_scale=-1, 35 | )), 36 | upsampling_config=dict( 37 | scale=(1, 1, 1), 38 | ), 39 | downsampling_config=dict( 40 | scales=(2, 1, 1), 41 | param=dict( 42 | inplanes=-1, 43 | planes=-1, 44 | downsample_scale=-1, 45 | )), 46 | level_fusion_config=dict( 47 | in_channels=[1024, 1024], 48 | mid_channels=[1024, 1024], 49 | out_channels=2048, 50 | ds_scales=[(2, 1, 1), (1, 1, 1)], 51 | ), 52 | aux_head_config=dict( 53 | inplanes=-1, 54 | planes=400, 55 | loss_weight=0.5 56 | ), 57 | ), 58 | spatial_temporal_module=dict( 59 | type='SimpleSpatialTemporalModule', 60 | spatial_type='avg', 61 | temporal_size=1, 62 | spatial_size=7), 63 | segmental_consensus=dict( 64 | type='SimpleConsensus', 65 | consensus_type='avg'), 66 | cls_head=dict( 67 | type='ClsHead', 68 | with_avg_pool=False, 69 | temporal_feature_size=1, 70 | spatial_feature_size=1, 71 | dropout_ratio=0.5, 72 | in_channels=2048, 73 | num_classes=400)) 74 | train_cfg = None 75 | test_cfg = None 76 | # dataset settings 77 | dataset_type = 'RawFramesDataset' 78 | data_root = '' 79 | data_root_val = '' 80 | img_norm_cfg = dict( 81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 82 | data = dict( 83 | videos_per_gpu=8, 84 | workers_per_gpu=8, 85 | train=dict( 86 | type=dataset_type, 87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt', 88 | img_prefix=data_root, 89 | img_norm_cfg=img_norm_cfg, 90 | input_format="NCTHW", 91 | num_segments=1, 92 | new_length=8, 93 | new_step=8, 94 | random_shift=True, 95 | modality='RGB', 96 | image_tmpl='img_{:05d}.jpg', 97 | img_scale=256, 98 | resize_keep_ratio=True, 99 | input_size=224, 100 | flip_ratio=0.5, 101 | oversample=None, 102 | resize_crop=True, 103 | color_jitter=True, 104 | color_space_aug=True, 105 | max_distort=0, 106 | test_mode=False, 107 | ), 108 | val=dict( 109 | type=dataset_type, 110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 111 | img_prefix=data_root_val, 112 | img_norm_cfg=img_norm_cfg, 113 | input_format="NCTHW", 114 | num_segments=1, 115 | new_length=8, 116 | new_step=8, 117 | random_shift=True, 118 | modality='RGB', 119 | image_tmpl='img_{:05d}.jpg', 120 | img_scale=256, 121 | input_size=224, 122 | flip_ratio=0, 123 | resize_keep_ratio=True, 124 | oversample=None, 125 | test_mode=False, 126 | ), 127 | test=dict( 128 | type=dataset_type, 129 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 130 | img_prefix=data_root_val, 131 | img_norm_cfg=img_norm_cfg, 132 | input_format="NCTHW", 133 | num_segments=10, 134 | new_length=8, 135 | new_step=8, 136 | random_shift=True, 137 | modality='RGB', 138 | image_tmpl='img_{:05d}.jpg', 139 | img_scale=256, 140 | input_size=256, 141 | flip_ratio=0, 142 | resize_keep_ratio=True, 143 | oversample='three_crop', 144 | test_mode=True, 145 | )) 146 | # optimizer 147 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 148 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 149 | # learning policy 150 | lr_config = dict( 151 | policy='step', 152 | step=[75, 125]) 153 | 154 | 
checkpoint_config = dict(interval=1) 155 | # workflow = [('train', 5), ('val', 1)] 156 | workflow = [('train', 1)] 157 | # yapf:disable 158 | log_config = dict( 159 | interval=20, 160 | hooks=[ 161 | dict(type='TextLoggerHook'), 162 | # dict(type='TensorboardLoggerHook') 163 | ]) 164 | # yapf:enable 165 | # runtime settings 166 | total_epochs = 150 167 | dist_params = dict(backend='nccl') 168 | log_level = 'INFO' 169 | load_from = None 170 | resume_from = None 171 | -------------------------------------------------------------------------------- /config_files/kinetics400/tpn/r50f16s4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet50', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=[2, 3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | necks=dict( 22 | type='TPN', 23 | in_channels=[1024, 2048], 24 | out_channels=1024, 25 | spatial_modulation_config=dict( 26 | inplanes=[1024, 2048], 27 | planes=2048, 28 | ), 29 | temporal_modulation_config=dict( 30 | scales=(16, 16), 31 | param=dict( 32 | inplanes=-1, 33 | planes=-1, 34 | downsample_scale=-1, 35 | )), 36 | upsampling_config=dict( 37 | scale=(1, 1, 1), 38 | ), 39 | downsampling_config=dict( 40 | scales=(1, 1, 1), 41 | param=dict( 42 | inplanes=-1, 43 | planes=-1, 44 | downsample_scale=-1, 45 | )), 46 | level_fusion_config=dict( 47 | in_channels=[1024, 1024], 48 | mid_channels=[1024, 1024], 49 | out_channels=2048, 50 | ds_scales=[(1, 1, 1), (1, 1, 1)], 51 | ), 52 | aux_head_config=dict( 53 | inplanes=-1, 54 | planes=400, 55 | loss_weight=0.5 56 | ), 57 | ), 58 | spatial_temporal_module=dict( 59 | type='SimpleSpatialTemporalModule', 60 | spatial_type='avg', 61 | temporal_size=1, 62 | spatial_size=7), 63 | segmental_consensus=dict( 64 | type='SimpleConsensus', 65 | consensus_type='avg'), 66 | cls_head=dict( 67 | type='ClsHead', 68 | with_avg_pool=False, 69 | temporal_feature_size=1, 70 | spatial_feature_size=1, 71 | dropout_ratio=0.5, 72 | in_channels=2048, 73 | num_classes=400)) 74 | train_cfg = None 75 | test_cfg = None 76 | # dataset settings 77 | dataset_type = 'RawFramesDataset' 78 | data_root = '' 79 | data_root_val = '' 80 | img_norm_cfg = dict( 81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 82 | data = dict( 83 | videos_per_gpu=8, 84 | workers_per_gpu=8, 85 | train=dict( 86 | type=dataset_type, 87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt', 88 | img_prefix=data_root, 89 | img_norm_cfg=img_norm_cfg, 90 | input_format="NCTHW", 91 | num_segments=1, 92 | new_length=16, 93 | new_step=4, 94 | random_shift=True, 95 | modality='RGB', 96 | image_tmpl='img_{:05d}.jpg', 97 | img_scale=256, 98 | resize_keep_ratio=True, 99 | input_size=224, 100 | flip_ratio=0.5, 101 | oversample=None, 102 | resize_crop=True, 103 | color_jitter=True, 104 | color_space_aug=True, 105 | max_distort=0, 106 | test_mode=False, 107 | ), 108 | val=dict( 109 | type=dataset_type, 110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 111 | img_prefix=data_root_val, 112 | img_norm_cfg=img_norm_cfg, 113 | input_format="NCTHW", 114 | num_segments=1, 115 | new_length=16, 116 | new_step=4, 117 | 
random_shift=True, 118 | modality='RGB', 119 | image_tmpl='img_{:05d}.jpg', 120 | img_scale=256, 121 | input_size=224, 122 | flip_ratio=0, 123 | resize_keep_ratio=True, 124 | oversample=None, 125 | test_mode=False, 126 | ), 127 | test=dict( 128 | type=dataset_type, 129 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 130 | img_prefix=data_root_val, 131 | img_norm_cfg=img_norm_cfg, 132 | input_format="NCTHW", 133 | num_segments=10, 134 | new_length=16, 135 | new_step=4, 136 | random_shift=True, 137 | modality='RGB', 138 | image_tmpl='img_{:05d}.jpg', 139 | img_scale=256, 140 | input_size=256, 141 | flip_ratio=0, 142 | resize_keep_ratio=True, 143 | oversample='three_crop', 144 | test_mode=True, 145 | )) 146 | # optimizer 147 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 148 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 149 | # learning policy 150 | lr_config = dict( 151 | policy='step', 152 | step=[75, 125]) 153 | 154 | checkpoint_config = dict(interval=1) 155 | # workflow = [('train', 5), ('val', 1)] 156 | workflow = [('train', 1)] 157 | # yapf:disable 158 | log_config = dict( 159 | interval=20, 160 | hooks=[ 161 | dict(type='TextLoggerHook'), 162 | # dict(type='TensorboardLoggerHook') 163 | ]) 164 | # yapf:enable 165 | # runtime settings 166 | total_epochs = 150 167 | dist_params = dict(backend='nccl') 168 | log_level = 'INFO' 169 | load_from = None 170 | resume_from = None 171 | -------------------------------------------------------------------------------- /config_files/kinetics400/tpn/r101f32s2.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet101', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=[2, 3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | necks=dict( 22 | type='TPN', 23 | in_channels=[1024, 2048], 24 | out_channels=1024, 25 | spatial_modulation_config=dict( 26 | inplanes=[1024, 2048], 27 | planes=2048, 28 | ), 29 | temporal_modulation_config=dict( 30 | scales=(16, 32), 31 | param=dict( 32 | inplanes=-1, 33 | planes=-1, 34 | downsample_scale=-1, 35 | )), 36 | upsampling_config=dict( 37 | scale=(1, 1, 1), 38 | ), 39 | downsampling_config=dict( 40 | scales=(2, 1, 1), 41 | param=dict( 42 | inplanes=-1, 43 | planes=-1, 44 | downsample_scale=-1, 45 | )), 46 | level_fusion_config=dict( 47 | in_channels=[1024, 1024], 48 | mid_channels=[1024, 1024], 49 | out_channels=2048, 50 | ds_scales=[(2, 1, 1), (1, 1, 1)], 51 | ), 52 | aux_head_config=dict( 53 | inplanes=-1, 54 | planes=400, 55 | loss_weight=0.5 56 | ), 57 | ), 58 | spatial_temporal_module=dict( 59 | type='SimpleSpatialTemporalModule', 60 | spatial_type='avg', 61 | temporal_size=1, 62 | spatial_size=7), 63 | segmental_consensus=dict( 64 | type='SimpleConsensus', 65 | consensus_type='avg'), 66 | cls_head=dict( 67 | type='ClsHead', 68 | with_avg_pool=False, 69 | temporal_feature_size=1, 70 | spatial_feature_size=1, 71 | dropout_ratio=0.5, 72 | in_channels=2048, 73 | num_classes=400)) 74 | train_cfg = None 75 | test_cfg = None 76 | # dataset settings 77 | dataset_type = 'RawFramesDataset' 78 | data_root = '' 79 | data_root_val = '' 80 | img_norm_cfg = dict( 
81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 82 | data = dict( 83 | videos_per_gpu=8, 84 | workers_per_gpu=8, 85 | train=dict( 86 | type=dataset_type, 87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt', 88 | img_prefix=data_root, 89 | img_norm_cfg=img_norm_cfg, 90 | input_format="NCTHW", 91 | num_segments=1, 92 | new_length=32, 93 | new_step=2, 94 | random_shift=True, 95 | modality='RGB', 96 | image_tmpl='img_{:05d}.jpg', 97 | img_scale=256, 98 | resize_keep_ratio=True, 99 | input_size=224, 100 | flip_ratio=0.5, 101 | oversample=None, 102 | resize_crop=True, 103 | color_jitter=True, 104 | color_space_aug=True, 105 | max_distort=0, 106 | test_mode=False, 107 | ), 108 | val=dict( 109 | type=dataset_type, 110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 111 | img_prefix=data_root_val, 112 | img_norm_cfg=img_norm_cfg, 113 | input_format="NCTHW", 114 | num_segments=1, 115 | new_length=32, 116 | new_step=2, 117 | random_shift=True, 118 | modality='RGB', 119 | image_tmpl='img_{:05d}.jpg', 120 | img_scale=256, 121 | input_size=224, 122 | flip_ratio=0, 123 | resize_keep_ratio=True, 124 | oversample=None, 125 | test_mode=False, 126 | ), 127 | test=dict( 128 | type=dataset_type, 129 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 130 | img_prefix=data_root_val, 131 | img_norm_cfg=img_norm_cfg, 132 | input_format="NCTHW", 133 | num_segments=10, 134 | new_length=32, 135 | new_step=2, 136 | random_shift=True, 137 | modality='RGB', 138 | image_tmpl='img_{:05d}.jpg', 139 | img_scale=256, 140 | input_size=256, 141 | flip_ratio=0, 142 | resize_keep_ratio=True, 143 | oversample='three_crop', 144 | test_mode=True, 145 | )) 146 | # optimizer 147 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 148 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 149 | # learning policy 150 | lr_config = dict( 151 | policy='step', 152 | step=[75, 125]) 153 | 154 | checkpoint_config = dict(interval=1) 155 | # workflow = [('train', 5), ('val', 1)] 156 | workflow = [('train', 1)] 157 | # yapf:disable 158 | log_config = dict( 159 | interval=20, 160 | hooks=[ 161 | dict(type='TextLoggerHook'), 162 | # dict(type='TensorboardLoggerHook') 163 | ]) 164 | # yapf:enable 165 | # runtime settings 166 | total_epochs = 150 167 | dist_params = dict(backend='nccl') 168 | log_level = 'INFO' 169 | load_from = None 170 | resume_from = None 171 | -------------------------------------------------------------------------------- /config_files/kinetics400/tpn/r50f32s2.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet50', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=[2, 3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | necks=dict( 22 | type='TPN', 23 | in_channels=[1024, 2048], 24 | out_channels=1024, 25 | spatial_modulation_config=dict( 26 | inplanes=[1024, 2048], 27 | planes=2048, 28 | ), 29 | temporal_modulation_config=dict( 30 | scales=(32, 32), 31 | param=dict( 32 | inplanes=-1, 33 | planes=-1, 34 | downsample_scale=-1, 35 | )), 36 | upsampling_config=dict( 37 | scale=(1, 1, 1), 38 | ), 
39 | downsampling_config=dict( 40 | scales=(1, 1, 1), 41 | param=dict( 42 | inplanes=-1, 43 | planes=-1, 44 | downsample_scale=-1, 45 | )), 46 | level_fusion_config=dict( 47 | in_channels=[1024, 1024], 48 | mid_channels=[1024, 1024], 49 | out_channels=2048, 50 | ds_scales=[(1, 1, 1), (1, 1, 1)], 51 | ), 52 | aux_head_config=dict( 53 | inplanes=-1, 54 | planes=400, 55 | loss_weight=0.5 56 | ), 57 | ), 58 | spatial_temporal_module=dict( 59 | type='SimpleSpatialTemporalModule', 60 | spatial_type='avg', 61 | temporal_size=1, 62 | spatial_size=7), 63 | segmental_consensus=dict( 64 | type='SimpleConsensus', 65 | consensus_type='avg'), 66 | cls_head=dict( 67 | type='ClsHead', 68 | with_avg_pool=False, 69 | temporal_feature_size=1, 70 | spatial_feature_size=1, 71 | dropout_ratio=0.5, 72 | in_channels=2048, 73 | num_classes=400)) 74 | train_cfg = None 75 | test_cfg = None 76 | # dataset settings 77 | dataset_type = 'RawFramesDataset' 78 | data_root = '' 79 | data_root_val = '' 80 | img_norm_cfg = dict( 81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 82 | data = dict( 83 | videos_per_gpu=8, 84 | workers_per_gpu=8, 85 | train=dict( 86 | type=dataset_type, 87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt', 88 | img_prefix=data_root, 89 | img_norm_cfg=img_norm_cfg, 90 | input_format="NCTHW", 91 | num_segments=1, 92 | new_length=32, 93 | new_step=2, 94 | random_shift=True, 95 | modality='RGB', 96 | image_tmpl='img_{:05d}.jpg', 97 | img_scale=256, 98 | resize_keep_ratio=True, 99 | input_size=224, 100 | flip_ratio=0.5, 101 | oversample=None, 102 | resize_crop=True, 103 | color_jitter=True, 104 | color_space_aug=True, 105 | max_distort=0, 106 | test_mode=False, 107 | ), 108 | val=dict( 109 | type=dataset_type, 110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 111 | img_prefix=data_root_val, 112 | img_norm_cfg=img_norm_cfg, 113 | input_format="NCTHW", 114 | num_segments=1, 115 | new_length=32, 116 | new_step=2, 117 | random_shift=True, 118 | modality='RGB', 119 | image_tmpl='img_{:05d}.jpg', 120 | img_scale=256, 121 | input_size=224, 122 | div_255=False, 123 | flip_ratio=0, 124 | resize_keep_ratio=True, 125 | oversample=None, 126 | test_mode=False, 127 | ), 128 | test=dict( 129 | type=dataset_type, 130 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 131 | img_prefix=data_root_val, 132 | img_norm_cfg=img_norm_cfg, 133 | input_format="NCTHW", 134 | num_segments=10, 135 | new_length=32, 136 | new_step=2, 137 | random_shift=True, 138 | modality='RGB', 139 | image_tmpl='img_{:05d}.jpg', 140 | img_scale=256, 141 | input_size=256, 142 | flip_ratio=0, 143 | resize_keep_ratio=True, 144 | oversample='three_crop', 145 | test_mode=True, 146 | )) 147 | # optimizer 148 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 149 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 150 | # learning policy 151 | lr_config = dict( 152 | policy='step', 153 | step=[75, 125]) 154 | 155 | checkpoint_config = dict(interval=1) 156 | # workflow = [('train', 5), ('val', 1)] 157 | workflow = [('train', 1)] 158 | # yapf:disable 159 | log_config = dict( 160 | interval=20, 161 | hooks=[ 162 | dict(type='TextLoggerHook'), 163 | # dict(type='TensorboardLoggerHook') 164 | ]) 165 | # yapf:enable 166 | # runtime settings 167 | total_epochs = 150 168 | dist_params = dict(backend='nccl') 169 | log_level = 'INFO' 170 | load_from = None 171 | resume_from = None 172 | 
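For reference, a minimal sketch (not part of the repository sources) of how config files such as the one above are consumed; it mirrors the pattern used in test_video.py later in this dump, where the config is parsed with mmcv and the recognizer is built from its model, train_cfg and test_cfg fields. The config path below is only an example.

    import mmcv
    from mmaction.models import build_recognizer

    # Parse one of the config files above and build the corresponding recognizer.
    cfg = mmcv.Config.fromfile('config_files/kinetics400/tpn/r50f32s2.py')
    model = build_recognizer(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)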
-------------------------------------------------------------------------------- /mmaction/models/recognizers/TSN3D.py: -------------------------------------------------------------------------------- 1 | from .base import BaseRecognizer 2 | from .. import builder 3 | from ..registry import RECOGNIZERS 4 | 5 | import torch 6 | 7 | 8 | @RECOGNIZERS.register_module 9 | class TSN3D(BaseRecognizer): 10 | 11 | def __init__(self, 12 | backbone, 13 | necks=None, 14 | spatial_temporal_module=None, 15 | segmental_consensus=None, 16 | fcn_testing=False, 17 | flip=False, 18 | cls_head=None, 19 | train_cfg=None, 20 | test_cfg=None): 21 | 22 | super(TSN3D, self).__init__() 23 | self.backbone = builder.build_backbone(backbone) 24 | 25 | if necks is not None: 26 | self.necks = builder.build_neck(necks) 27 | else: 28 | self.necks = None 29 | 30 | if spatial_temporal_module is not None: 31 | self.spatial_temporal_module = builder.build_spatial_temporal_module( 32 | spatial_temporal_module) 33 | else: 34 | raise NotImplementedError 35 | 36 | if segmental_consensus is not None: 37 | self.segmental_consensus = builder.build_segmental_consensus( 38 | segmental_consensus) 39 | else: 40 | raise NotImplementedError 41 | 42 | if cls_head is not None: 43 | self.cls_head = builder.build_head(cls_head) 44 | else: 45 | raise NotImplementedError 46 | 47 | self.train_cfg = train_cfg 48 | self.test_cfg = test_cfg 49 | self.fcn_testing = fcn_testing 50 | self.flip = flip 51 | self.init_weights() 52 | 53 | @property 54 | def with_spatial_temporal_module(self): 55 | return hasattr(self, 'spatial_temporal_module') and self.spatial_temporal_module is not None 56 | 57 | @property 58 | def with_segmental_consensus(self): 59 | return hasattr(self, 'segmental_consensus') and self.segmental_consensus is not None 60 | 61 | @property 62 | def with_cls_head(self): 63 | return hasattr(self, 'cls_head') and self.cls_head is not None 64 | 65 | def init_weights(self): 66 | super(TSN3D, self).init_weights() 67 | self.backbone.init_weights() 68 | 69 | if self.with_spatial_temporal_module: 70 | self.spatial_temporal_module.init_weights() 71 | 72 | if self.with_segmental_consensus: 73 | self.segmental_consensus.init_weights() 74 | 75 | if self.with_cls_head: 76 | self.cls_head.init_weights() 77 | 78 | if self.necks is not None: 79 | self.necks.init_weights() 80 | 81 | def extract_feat(self, img_group): 82 | x = self.backbone(img_group) 83 | return x 84 | 85 | def forward_train(self, 86 | num_modalities, 87 | img_meta, 88 | gt_label, 89 | **kwargs): 90 | assert num_modalities == 1 91 | img_group = kwargs['img_group_0'] 92 | 93 | bs = img_group.shape[0] 94 | img_group = img_group.reshape((-1,) + img_group.shape[2:]) 95 | num_seg = img_group.shape[0] // bs 96 | 97 | x = self.extract_feat(img_group) 98 | 99 | if self.necks is not None: 100 | x, aux_losses = self.necks(x, gt_label.squeeze()) 101 | 102 | if self.with_spatial_temporal_module: 103 | x = self.spatial_temporal_module(x) 104 | if self.with_segmental_consensus: 105 | x = x.reshape((-1, num_seg) + x.shape[1:]) 106 | x = self.segmental_consensus(x) 107 | x = x.squeeze(1) 108 | losses = dict() 109 | if self.with_cls_head: 110 | cls_score = self.cls_head(x) 111 | gt_label = gt_label.squeeze() 112 | loss_cls = self.cls_head.loss(cls_score, gt_label) 113 | losses.update(loss_cls) 114 | if self.necks is not None: 115 | if aux_losses is not None: 116 | losses.update(aux_losses) 117 | 118 | return losses 119 | 120 | def forward_test(self, 121 | num_modalities, 122 | img_meta, 123 | **kwargs): 
124 | assert num_modalities == 1 125 | img_group = kwargs['img_group_0'] 126 | 127 | bs = img_group.shape[0] 128 | img_group = img_group.reshape((-1,) + img_group.shape[2:]) 129 | num_seg = img_group.shape[0] // bs 130 | 131 | if self.flip: 132 | img_group = self.extract_feat(torch.flip(img_group, [-1])) 133 | x = self.extract_feat(img_group) 134 | if self.necks is not None: 135 | x, _ = self.necks(x) 136 | if self.fcn_testing: 137 | if self.with_cls_head: 138 | x = self.cls_head(x) 139 | prob1 = torch.nn.functional.softmax(x.mean([2, 3, 4]), 1).mean(0, keepdim=True).detach().cpu().numpy() 140 | return prob1 141 | 142 | if self.with_spatial_temporal_module: 143 | x = self.spatial_temporal_module(x) 144 | if self.with_segmental_consensus: 145 | x = x.reshape((-1, num_seg) + x.shape[1:]) 146 | x = self.segmental_consensus(x) 147 | x = x.squeeze(1) 148 | if self.with_cls_head: 149 | x = self.cls_head(x) 150 | 151 | return x.cpu().numpy() 152 | -------------------------------------------------------------------------------- /mmaction/models/tenons/spatial_temporal_modules/non_local.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from mmcv.cnn import constant_init, kaiming_init 5 | from ...registry import SPATIAL_TEMPORAL_MODULES 6 | 7 | 8 | @SPATIAL_TEMPORAL_MODULES.register_module 9 | class NonLocalModule(nn.Module): 10 | def __init__(self, in_channels=1024, nonlocal_type="gaussian", dim=3, embed=True, embed_dim=None, sub_sample=True, 11 | use_bn=True): 12 | super(NonLocalModule, self).__init__() 13 | 14 | assert nonlocal_type in ['gaussian', 'dot', 'concat'] 15 | assert dim == 2 or dim == 3 16 | self.nonlocal_type = nonlocal_type 17 | self.embed = embed 18 | self.embed_dim = embed_dim if embed_dim is not None else in_channels // 2 19 | self.sub_sample = sub_sample 20 | self.use_bn = use_bn 21 | 22 | if self.embed: 23 | if dim == 2: 24 | self.theta = nn.Conv2d(in_channels, self.embed_dim, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)) 25 | self.phi = nn.Conv2d(in_channels, self.embed_dim, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)) 26 | self.g = nn.Conv2d(in_channels, self.embed_dim, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)) 27 | elif dim == 3: 28 | self.theta = nn.Conv3d(in_channels, self.embed_dim, kernel_size=(1, 1, 1), stride=(1, 1, 1), 29 | padding=(0, 0, 0)) 30 | self.phi = nn.Conv3d(in_channels, self.embed_dim, kernel_size=(1, 1, 1), stride=(1, 1, 1), 31 | padding=(0, 0, 0)) 32 | self.g = nn.Conv3d(in_channels, self.embed_dim, kernel_size=(1, 1, 1), stride=(1, 1, 1), 33 | padding=(0, 0, 0)) 34 | 35 | if self.nonlocal_type == 'gaussian': 36 | self.softmax = nn.Softmax(dim=2) 37 | elif self.nonlocal_type == 'concat': 38 | if dim == 2: 39 | self.concat_proj = nn.Sequential( 40 | nn.Conv2d(self.embed_dim * 2, 1, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)), 41 | nn.ReLU()) 42 | elif dim == 3: 43 | self.concat_proj = nn.Sequential( 44 | nn.Conv3d(self.embed_dim * 2, 1, kernel_size=(1, 1, 1), stride=(1, 1, 1), padding=(0, 0, 0)), 45 | nn.ReLU()) 46 | 47 | if sub_sample: 48 | if dim == 2: 49 | self.max_pool = nn.MaxPool2d(kernel_size=(2, 2)) 50 | elif dim == 3: 51 | self.max_pool = nn.MaxPool3d(kernel_size=(1, 2, 2)) 52 | self.g = nn.Sequential(self.max_pool, self.g) 53 | self.phi = nn.Sequential(self.max_pool, self.phi) 54 | 55 | if dim == 2: 56 | self.W = nn.Conv2d(self.embed_dim, in_channels, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)) 57 
| elif dim == 3: 58 | self.W = nn.Conv3d(self.embed_dim, in_channels, kernel_size=(1, 1, 1), stride=(1, 1, 1), padding=(0, 0, 0)) 59 | 60 | if use_bn: 61 | if dim == 2: 62 | self.bn = nn.BatchNorm2d(in_channels, eps=1e-05, momentum=0.9, affine=True) 63 | elif dim == 3: 64 | self.bn = nn.BatchNorm3d(in_channels, eps=1e-05, momentum=0.9, affine=True) 65 | self.W = nn.Sequential(self.W, self.bn) 66 | 67 | def init_weights(self): 68 | for m in self.modules(): 69 | if isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv3d): 70 | kaiming_init(m) 71 | elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d): 72 | constant_init(m, 0) 73 | 74 | def forward(self, input): 75 | if self.embed: 76 | theta = self.theta(input) 77 | phi = self.phi(input) 78 | g = self.g(input) 79 | else: 80 | theta = input 81 | phi = input 82 | g = input 83 | 84 | if self.nonlocal_type in ['gaussian', 'dot']: 85 | # reshape [BxC'xTxHxW] to [BxC'x(T)HW] 86 | theta = theta.reshape(theta.shape[:2] + (-1,)) 87 | phi = phi.reshape(theta.shape[:2] + (-1,)) 88 | g = g.reshape(theta.shape[:2] + (-1,)) 89 | theta_phi = torch.matmul(theta.transpose(1, 2), phi) 90 | if self.nonlocal_type == 'gaussian': 91 | p = self.softmax(theta_phi) 92 | elif self.nonlocal_type == 'dot': 93 | N = theta_phi.size(-1) 94 | p = theta_phi / N 95 | elif self.nonlocal_type == 'concat': 96 | # reshape [BxC'xTxHxW] to [BxC'x(T)HWx1] 97 | theta = theta.reshape(theta.shape[:2] + (-1, 1)) 98 | # reshape [BxC'xTxHxW] to [BxC'x1x(T)HW] 99 | phi = phi.reshape(theta.shape[:2] + (1, -1)) 100 | theta_x = theta.repeat(1, 1, 1, phi.size(3)) 101 | phi_x = phi.repeat(1, 1, theta.size(2), 1) 102 | theta_phi = torch.cat([theta_x, phi_x], dim=1) 103 | theta_phi = self.concat_proj(theta_phi) 104 | theta_phi = theta_phi.squeeze() 105 | N = theta_phi.size(-1) 106 | p = theta_phi / N 107 | else: 108 | raise NotImplementedError 109 | 110 | # BxC'xddd , Bxdxddd => BxC'xd 111 | y = torch.matmul(g, p.transpose(1, 2)) 112 | y = y.reshape(y.shape[:2] + input.shape[2:]) 113 | z = self.W(y) + input 114 | 115 | return z 116 | -------------------------------------------------------------------------------- /MODELZOO.md: -------------------------------------------------------------------------------- 1 | # Model Zoo 2 | 3 | ## Pretrained Models 4 | All pretrained models can be downloaded from [Google Drive](https://drive.google.com/drive/folders/1UnqZ48doF0UTYjH6iZCXQW3HlDocbBxl). After downloading, put them into `ckpt/`. 5 | 6 | ## Main Results 7 | We report results of our method on Kinetics-400 and Something-Something V1 and V2. All numbers, for both the baselines and TPN, are obtained via fully-convolutional testing. 8 | 9 | ### Kinetics-400 10 | Since the number of available Kinetics-400 videos differs slightly over time (which may lead to a small performance drop), we report all results on our own copy of the dataset. Our data contains 240403 training videos and 19769 validation videos, rescaled to 240*320 resolution. Note that the trimmed duration of the [Non-Local](https://github.com/facebookresearch/video-nonlocal-net/blob/master/DATASET.md) data and the resolution of the [MMAction](https://github.com/open-mmlab/mmaction/blob/master/MODEL_ZOO.md) data differ from ours, but the improvements of TPN are consistent. To ensure reproducibility, we will find a proper way to release our validation set. All the following Kinetics-400 results also use flip augmentation at test time (~0.1% fluctuation). We sample F frames with a temporal stride of S frames (denoted FxS).
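As a concrete illustration of the FxS notation (a sketch, assuming that `new_length` and `new_step` in the configs play the roles of F and S, which matches the 8x8, 16x4 and 32x2 settings below):

    # F frames sampled S frames apart, e.g. the 8x8 models use new_length=8, new_step=8.
    F, S, start = 8, 8, 0  # `start` is an arbitrary example offset
    frame_indices = [start + i * S for i in range(F)]  # [0, 8, 16, ..., 56]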
11 | 12 | 13 | | Model | Frames | TPN | Top-1 | Weights | Config | 14 | | :---: | :------: | :--------: | :------: | :------: | :------ | 15 | |R50 | 8 x 8 | - | 74.9 | [link](https://drive.google.com/open?id=1uKHvZsY_heFHTBl6RXo02I7-W_aLBhFI) | config_files/kinetics400/baseline/r50f8s8.py | 16 | |R50 | 8 x 8 | Yes | 76.1 | [link](https://drive.google.com/open?id=1KoISwdKDlfzZdEsLItygcvPGkKNwWyR-) | config_files/kinetics400/tpn/r50f8s8.py | 17 | |R50 | 16 x 4 | - | 76.1 | [link](https://drive.google.com/open?id=1Qgck89mUVs9gyUzalbYJPfJPwQEPbyI9) | config_files/kinetics400/baseline/r50f16s4.py | 18 | |R50 | 16 x 4 | Yes | 77.3 | [link](https://drive.google.com/open?id=1TY39uBR-ckUw3aiabeFLNpR9uPSxt--H) | config_files/kinetics400/tpn/r50f16s4.py | 19 | |R50 | 32 x 2 | - | 75.7 | [link](https://drive.google.com/open?id=1oJ1sTzMeLPXHtnutJAAD8gWfm0b3NYpi) | config_files/kinetics400/baseline/r50f32s2.py | 20 | |R50 | 32 x 2 | Yes | 77.7 | [link](https://drive.google.com/open?id=1TjeqcTJ2tReDz4VnLR8ajSHySre9sZDd) | config_files/kinetics400/tpn/r50f32s2.py | 21 | |R101 | 8 x 8 | - | 76.0 | [link](https://drive.google.com/open?id=1dqLWiI3DFHAPIzGtEY_jfI66nthw2GEX) | config_files/kinetics400/baseline/r101f8s8.py | 22 | |R101 | 8 x 8 | Yes | 77.2 | [link](https://drive.google.com/open?id=1B4Vsld-JzQe4QmXeZHd0TolMPNyZypXI) | config_files/kinetics400/tpn/r101f8s8.py | 23 | |R101 | 16 x 4 | - | 77.0 | [link](https://drive.google.com/open?id=1tj2Y0OChKW7RoElXXmBeU63dph40kEyJ) | config_files/kinetics400/baseline/r101f16s4.py | 24 | |R101 | 16 x 4 | Yes | 78.1 | [link](https://drive.google.com/open?id=1mT4kuaYuAGA-Zjagc56vByMQdvx0bE-H) | config_files/kinetics400/tpn/r101f16s4.py | 25 | |R101 | 32 x 2 | - | 77.4 | [link](https://drive.google.com/open?id=1IAobiYS3PhXC1sA_MCdudGCdHRWcWc9J) | config_files/kinetics400/baseline/r101f32s2.py | 26 | |R101 | 32 x 2 | Yes | 78.9 | [link](https://drive.google.com/open?id=1OPudI7CzJzpdeI0YpwLgZB59VCzcoidp) | config_files/kinetics400/tpn/r101f32s2.py | 27 | 28 | We also train our TPN on [MMAction](https://github.com/open-mmlab/mmaction/blob/master/MODEL_ZOO.md) data; the performance increases thanks to the raw resolution and aspect ratio. 29 | 30 | | Model | Frames | TPN | Top-1 | Weights | Config | 31 | | :---: | :------: | :--------: | :------: | :------: | :------ | 32 | |R50 | 8 x 8 | Yes | 76.7 | [link](https://drive.google.com/open?id=1pCY4oiWK3hs6MwaPZ8QVPMb-qDCV56w5) | config_files/kinetics400/baseline/r50f8s8.py | 33 | |R101 | 8 x 8 | Yes | 78.2 | [link](https://drive.google.com/open?id=1DeVp7cf-dk-x6Um4NouLq5tFniTge0Bd) | config_files/kinetics400/baseline/r101f8s8.py | 34 | 35 | All models are trained on 32 GPUs for 150 epochs. More details can be found in `config_files`. 36 | 37 | ### Something-Something 38 | Something-Something is a more stable benchmark, and the full dataset can be downloaded from the official [website](https://20bn.com/datasets/something-something). We report our results on both V1 and V2. All numbers are obtained by following the standard protocol, i.e., 3 crops * 2 clips. [TSM](https://github.com/mit-han-lab/temporal-shift-module) serves as our backbone network. 39 | Unlike the original TSM [repo](https://github.com/mit-han-lab/temporal-shift-module), which uses Kinetics pretraining, our implementation is initialized from ImageNet pretraining and trained with a longer schedule. We use **the same** training hyper-parameters for both the baseline and TPN, so the improvements come from the TPN design rather than other training tricks.
We take the uniform sampling for training and validation. 40 | 41 | | Model | Dataset Version | Frames | TPN | Top-1 | Weights | Config | 42 | | :---: | :------: | :------: | :--------: | :------: | :------: | :------ | 43 | |TSM50 | V1 | 8 | - | 48.2 | [link](https://drive.google.com/open?id=1x7iwL2Op0qxaUluyQCPOVVEEH53cavhL) | config_files/sthv1/tsm_baseline.py | 44 | |TSM50 | V1 | 8 | Yes | 50.7 | [link](https://drive.google.com/open?id=1NVjsCYgNXKUKAn33XCxV2YEIaWXlEnLS) | config_files/sthv1/tsm_tpn.py | 45 | |TSM50 | V2 | 8 | - | 62.3 | [link](https://drive.google.com/open?id=1fU1b9WySld5knJ8E2bMXfuyRenoViSEX) | config_files/sthv2/tsm_baseline.py | 46 | |TSM50 | V2 | 8 | Yes | 64.7 | [link](https://drive.google.com/open?id=15HHKGIhksTf0dSmgxrTsoHzZxF6n7eRa) | config_files/sthv2/tsm_tpn.py | 47 | 48 | If you have any problem about how to reproduce our results, please contact Ceyuan Yang (yc019@ie.cuhk.edu.hk) or Yinghao Xu (xy119@ie.cuhk.edu.hk). 49 | 50 | -------------------------------------------------------------------------------- /mmaction/datasets/loader/sampler.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import math 4 | import torch 5 | import numpy as np 6 | 7 | from torch.distributed import get_world_size, get_rank 8 | from torch.utils.data import Sampler 9 | from torch.utils.data import DistributedSampler as _DistributedSampler 10 | 11 | 12 | class DistributedSampler(_DistributedSampler): 13 | 14 | def __init__(self, dataset, imgs_per_gpu, num_replicas=None, rank=None, shuffle=True): 15 | super().__init__(dataset, num_replicas=num_replicas, rank=rank) 16 | self.shuffle = shuffle 17 | self.samples_per_gpu = imgs_per_gpu 18 | 19 | self.num_samples = int( 20 | math.ceil(len(dataset) * 1.0 / self.samples_per_gpu / 21 | self.num_replicas)) * self.samples_per_gpu 22 | self.total_size = self.num_samples * self.num_replicas 23 | 24 | def __iter__(self): 25 | # deterministically shuffle based on epoch 26 | if self.shuffle: 27 | g = torch.Generator() 28 | g.manual_seed(self.epoch) 29 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 30 | else: 31 | indices = torch.arange(len(self.dataset)).tolist() 32 | 33 | # add extra samples to make it evenly divisible 34 | indices += indices[:(self.total_size - len(indices))] 35 | assert len(indices) == self.total_size 36 | 37 | # subsample 38 | indices = indices[self.rank:self.total_size:self.num_replicas] 39 | assert len(indices) == self.num_samples 40 | 41 | return iter(indices) 42 | 43 | 44 | class GroupSampler(Sampler): 45 | 46 | def __init__(self, dataset, samples_per_gpu=1): 47 | assert hasattr(dataset, 'flag') 48 | self.dataset = dataset 49 | self.samples_per_gpu = samples_per_gpu 50 | self.flag = dataset.flag.astype(np.int64) 51 | self.group_sizes = np.bincount(self.flag) 52 | self.num_samples = 0 53 | for i, size in enumerate(self.group_sizes): 54 | self.num_samples += int(np.ceil( 55 | size / self.samples_per_gpu)) * self.samples_per_gpu 56 | 57 | def __iter__(self): 58 | indices = [] 59 | for i, size in enumerate(self.group_sizes): 60 | if size == 0: 61 | continue 62 | indice = np.where(self.flag == i)[0] 63 | assert len(indice) == size 64 | np.random.shuffle(indice) 65 | num_extra = int(np.ceil(size / self.samples_per_gpu) 66 | ) * self.samples_per_gpu - len(indice) 67 | indice = np.concatenate([indice, indice[:num_extra]]) 68 | indices.append(indice) 69 | indices = np.concatenate(indices) 70 | indices = [ 71 | indices[i * 
self.samples_per_gpu:(i + 1) * self.samples_per_gpu] 72 | for i in np.random.permutation( 73 | range(len(indices) // self.samples_per_gpu)) 74 | ] 75 | indices = np.concatenate(indices) 76 | indices = torch.from_numpy(indices).long() 77 | assert len(indices) == self.num_samples 78 | return iter(indices) 79 | 80 | def __len__(self): 81 | return self.num_samples 82 | 83 | 84 | class DistributedGroupSampler(Sampler): 85 | """Sampler that restricts data loading to a subset of the dataset. 86 | It is especially useful in conjunction with 87 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 88 | process can pass a DistributedSampler instance as a DataLoader sampler, 89 | and load a subset of the original dataset that is exclusive to it. 90 | .. note:: 91 | Dataset is assumed to be of constant size. 92 | Arguments: 93 | dataset: Dataset used for sampling. 94 | num_replicas (optional): Number of processes participating in 95 | distributed training. 96 | rank (optional): Rank of the current process within num_replicas. 97 | """ 98 | 99 | def __init__(self, 100 | dataset, 101 | samples_per_gpu=1, 102 | num_replicas=None, 103 | rank=None): 104 | if num_replicas is None: 105 | num_replicas = get_world_size() 106 | if rank is None: 107 | rank = get_rank() 108 | self.dataset = dataset 109 | self.samples_per_gpu = samples_per_gpu 110 | self.num_replicas = num_replicas 111 | self.rank = rank 112 | self.epoch = 0 113 | 114 | assert hasattr(self.dataset, 'flag') 115 | self.flag = self.dataset.flag 116 | self.group_sizes = np.bincount(self.flag) 117 | 118 | self.num_samples = 0 119 | for i, j in enumerate(self.group_sizes): 120 | self.num_samples += int( 121 | math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu / 122 | self.num_replicas)) * self.samples_per_gpu 123 | self.total_size = self.num_samples * self.num_replicas 124 | 125 | def __iter__(self): 126 | # deterministically shuffle based on epoch 127 | g = torch.Generator() 128 | g.manual_seed(self.epoch) 129 | 130 | indices = [] 131 | for i, size in enumerate(self.group_sizes): 132 | if size > 0: 133 | indice = np.where(self.flag == i)[0] 134 | assert len(indice) == size 135 | indice = indice[list(torch.randperm(int(size), 136 | generator=g))].tolist() 137 | extra = int( 138 | math.ceil( 139 | size * 1.0 / self.samples_per_gpu / self.num_replicas) 140 | ) * self.samples_per_gpu * self.num_replicas - len(indice) 141 | indice += indice[:extra] 142 | indices += indice 143 | 144 | assert len(indices) == self.total_size 145 | 146 | indices = [ 147 | indices[j] for i in list( 148 | torch.randperm( 149 | len(indices) // self.samples_per_gpu, generator=g)) 150 | for j in range(i * self.samples_per_gpu, (i + 1) * 151 | self.samples_per_gpu) 152 | ] 153 | 154 | # subsample 155 | offset = self.num_samples * self.rank 156 | indices = indices[offset:offset + self.num_samples] 157 | assert len(indices) == self.num_samples 158 | 159 | return iter(indices) 160 | 161 | def __len__(self): 162 | return self.num_samples 163 | 164 | def set_epoch(self, epoch): 165 | self.epoch = epoch 166 | -------------------------------------------------------------------------------- /test_video.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import cv2 4 | import argparse 5 | import functools 6 | import subprocess 7 | import warnings 8 | from scipy.special import softmax 9 | import moviepy.editor as mpy 10 | import numpy as np 11 | import torch 12 | 13 | import mmcv 14 | from mmcv.runner 
import load_checkpoint 15 | from mmcv.parallel import collate, scatter 16 | 17 | from mmaction.models import build_recognizer 18 | from mmaction.datasets.transforms import GroupImageTransform 19 | 20 | 21 | def init_recognizer(config, checkpoint=None, label_file=None, device='cuda:0'): 22 | if isinstance(config, str): 23 | config = mmcv.Config.fromfile(config) 24 | elif not isinstance(config, mmcv.Config): 25 | raise TypeError('config must be a filename or Config object, ' 26 | 'but got {}'.format(type(config))) 27 | config.model.backbone.pretrained = None 28 | config.model.spatial_temporal_module.spatial_size = 8 29 | model = build_recognizer( 30 | config.model, train_cfg=None, test_cfg=config.test_cfg) 31 | if checkpoint is not None: 32 | checkpoint = load_checkpoint(model, checkpoint) 33 | if label_file is not None: 34 | classes = [line.rstrip() for line in open(label_file, 'r').readlines()] 35 | model.CLASSES = classes 36 | else: 37 | if 'CLASSES' in checkpoint['meta']: 38 | model.CLASSES = checkpoint['meta']['CLASSES'] 39 | else: 40 | warnings.warn('Class names are not saved in the checkpoint\'s ' 41 | 'meta data, use something-something-v2 classes by default.') 42 | model.CLASSES = get_classes('something=something-v2') 43 | model.cfg = config # save the config in the model for convenience 44 | model.to(device) 45 | model.eval() 46 | return model 47 | 48 | 49 | def inference_recognizer(model, frames): 50 | cfg = model.cfg 51 | device = next(model.parameters()).device # model device 52 | # build the data pipeline 53 | test_transform = GroupImageTransform( 54 | crop_size=cfg.data.test.input_size, 55 | oversample=None, 56 | resize_crop=False, 57 | **dict(mean=[123.675, 116.28, 103.53], 58 | std=[58.395, 57.12, 57.375], to_rgb=True)) 59 | # prepare data 60 | frames, *l = test_transform( 61 | frames, (cfg.data.test.img_scale, cfg.data.test.img_scale), 62 | crop_history=None, 63 | flip=False, 64 | keep_ratio=False, 65 | div_255=False, 66 | is_flow=False) 67 | data = dict(img_group_0=frames, 68 | num_modalities=1, 69 | img_meta={}) 70 | data = scatter(collate([data], samples_per_gpu=1), [device])[0] 71 | # forward the model 72 | with torch.no_grad(): 73 | result = model(return_loss=False, rescale=True, **data) 74 | return result 75 | 76 | 77 | def extract_frames(video_file, num_frames=8): 78 | try: 79 | os.makedirs(os.path.join(os.getcwd(), 'frames')) 80 | except OSError: 81 | pass 82 | fps = subprocess.check_output(['ffprobe', '-v', 'error', 83 | '-select_streams', 84 | 'v', '-of', 'default=noprint_wrappers=1:nokey=1', 85 | '-show_entries', 86 | ' stream=r_frame_rate', 87 | video_file]).decode('utf-8').strip().split('/')[0] 88 | fps = int(fps) 89 | 90 | output = subprocess.Popen(['ffmpeg', '-i', video_file, 91 | '-loglevel', 'panic', 92 | 'frames/%d.jpg']).communicate() 93 | frame_paths = [os.path.join('frames', frame) 94 | for frame in sorted(os.listdir('frames'), key=lambda x: int(x.split('.')[0]))] 95 | 96 | seg_frames, raw_frames = load_frames(frame_paths) 97 | subprocess.call(['rm', '-rf', 'frames']) 98 | 99 | return seg_frames, raw_frames, fps 100 | 101 | 102 | def load_frames(frame_paths, num_frames=8): 103 | frames = [mmcv.imread(frame) for frame in frame_paths] 104 | if len(frames) >= num_frames: 105 | return frames[::int(np.floor(len(frames) / float(num_frames)))][:num_frames].copy(), frames.copy() 106 | else: 107 | raise ValueError('Video must have at least {} frames'.format(num_frames)) 108 | 109 | 110 | def render_frames(frames, prediction): 111 | rendered_frames = [] 112 | for 
frame in frames: 113 | img = np.array(frame[:, :, ::-1]) 114 | height, width, _ = img.shape 115 | cv2.putText(img=img, text=prediction, org=(1, int(height / 8)), fontFace=cv2.FONT_HERSHEY_TRIPLEX, 116 | fontScale=0.6, color=(255, 255, 255), lineType=cv2.LINE_8, bottomLeftOrigin=False) 117 | rendered_frames.append(img) 118 | return rendered_frames 119 | 120 | 121 | # options 122 | parser = argparse.ArgumentParser(description="test TPN on a single video") 123 | parser.add_argument('config', type=str, default=None, help='model init config') 124 | parser.add_argument('checkpoint', type=str, default=None) 125 | parser.add_argument('--label_file', type=str, default='demo/category.txt') 126 | parser.add_argument('--video_file', type=str, default='demo/demo.mp4') 127 | parser.add_argument('--frame_folder', type=str, default=None) 128 | parser.add_argument('--rendered_output', type=str, default='demo/demo_pred.mp4') 129 | args = parser.parse_args() 130 | 131 | # Obtain video frames 132 | if args.frame_folder is not None: 133 | print('Loading frames in {}'.format(args.frame_folder)) 134 | import glob 135 | 136 | # Here, make sure after sorting the frame paths have the correct temporal order 137 | frame_paths = sorted(glob.glob(os.path.join(args.frame_folder, '*.jpg'))) 138 | seg_frames, raw_frames = load_frames(frame_paths) 139 | fps = 4 140 | else: 141 | print('Extracting frames using ffmpeg...') 142 | seg_frames, raw_frames, fps = extract_frames(args.video_file, 8) 143 | 144 | model = init_recognizer(args.config, checkpoint=args.checkpoint, label_file=args.label_file) 145 | results = inference_recognizer(model, seg_frames) 146 | prob = softmax(results.squeeze()) 147 | idx = np.argsort(-prob) 148 | # Output the prediction. 149 | video_name = args.frame_folder if args.frame_folder is not None else args.video_file 150 | print('RESULT ON ' + video_name) 151 | for i in range(0, 5): 152 | print('{:.3f} -> {}'.format(prob[idx[i]], model.CLASSES[idx[i]])) 153 | 154 | # Render output frames with prediction text. 
155 | if args.rendered_output is not None: 156 | prediction = model.CLASSES[idx[0]] 157 | rendered_frames = render_frames(raw_frames, prediction) 158 | clip = mpy.ImageSequenceClip(rendered_frames, fps=fps) 159 | clip.write_videofile(args.rendered_output) 160 | -------------------------------------------------------------------------------- /mmaction/models/tenons/segmental_consensuses/stpp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ...registry import SEGMENTAL_CONSENSUSES 5 | import numpy as np 6 | 7 | 8 | def parse_stage_config(stage_cfg): 9 | if isinstance(stage_cfg, int): 10 | return (stage_cfg,), stage_cfg 11 | elif isinstance(stage_cfg, tuple) or isinstance(stage_cfg, list): 12 | return stage_cfg, sum(stage_cfg) 13 | else: 14 | raise ValueError("Incorrect STPP config {}".format(stage_cfg)) 15 | 16 | 17 | @SEGMENTAL_CONSENSUSES.register_module 18 | class StructuredTemporalPyramidPooling(nn.Module): 19 | def __init__(self, standalong_classifier=False, stpp_cfg=(1, (1, 2), 1), num_seg=(2, 5, 2)): 20 | super(StructuredTemporalPyramidPooling, self).__init__() 21 | 22 | self.sc = standalong_classifier 23 | 24 | starting_parts, starting_mult = parse_stage_config(stpp_cfg[0]) 25 | course_parts, course_mult = parse_stage_config(stpp_cfg[1]) 26 | ending_parts, ending_mult = parse_stage_config(stpp_cfg[2]) 27 | 28 | self.feat_multiplier = starting_mult + course_mult + ending_mult 29 | self.parts = (starting_parts, course_parts, ending_parts) 30 | self.norm_num = (starting_mult, course_mult, ending_mult) 31 | 32 | self.num_seg = num_seg 33 | 34 | def init_weights(self): 35 | pass 36 | 37 | def forward(self, input, scaling): 38 | x1 = self.num_seg[0] 39 | x2 = x1 + self.num_seg[1] 40 | n_seg = x2 + self.num_seg[2] 41 | 42 | feat_dim = input.size(1) 43 | src = input.view(-1, n_seg, feat_dim) 44 | num_sample = src.size(0) 45 | 46 | scaling = scaling.view(-1, 2) 47 | 48 | def get_stage_stpp(stage_feat, stage_parts, norm_num, scaling): 49 | stage_stpp = [] 50 | stage_len = stage_feat.size(1) 51 | for n_part in stage_parts: 52 | ticks = torch.arange(0, stage_len + 1e-5, stage_len / n_part) 53 | for i in range(n_part): 54 | part_feat = stage_feat[:, int(ticks[i]):int(ticks[i + 1]), :].mean(dim=1) / norm_num 55 | if scaling is not None: 56 | part_feat = part_feat * scaling.view(num_sample, 1) 57 | stage_stpp.append(part_feat) 58 | return stage_stpp 59 | 60 | feature_parts = [] 61 | feature_parts.extend(get_stage_stpp(src[:, :x1, :], self.parts[0], self.norm_num[0], scaling[:, 0])) 62 | feature_parts.extend(get_stage_stpp(src[:, x1:x2, :], self.parts[1], self.norm_num[1], None)) 63 | feature_parts.extend(get_stage_stpp(src[:, x2:, :], self.parts[2], self.norm_num[2], scaling[:, 1])) 64 | stpp_feat = torch.cat(feature_parts, dim=1) 65 | if not self.sc: 66 | return stpp_feat, stpp_feat 67 | else: 68 | course_feat = src[:, x1:x2, :].mean(dim=1) 69 | return course_feat, stpp_feat 70 | 71 | 72 | @SEGMENTAL_CONSENSUSES.register_module 73 | class STPPReorganized(nn.Module): 74 | def __init__(self, feat_dim, act_score_len, 75 | comp_score_len, reg_score_len, 76 | standalong_classifier=False, 77 | with_regression=True, 78 | stpp_cfg=(1, (1, 2), 1)): 79 | super(STPPReorganized, self).__init__() 80 | 81 | self.sc = standalong_classifier 82 | self.feat_dim = feat_dim 83 | self.act_score_len = act_score_len 84 | self.comp_score_len = comp_score_len 85 | self.reg_score_len = reg_score_len 
86 | self.with_regression = with_regression 87 | 88 | starting_parts, starting_mult = parse_stage_config(stpp_cfg[0]) 89 | course_parts, course_mult = parse_stage_config(stpp_cfg[1]) 90 | ending_parts, ending_mult = parse_stage_config(stpp_cfg[2]) 91 | 92 | self.feat_multiplier = starting_mult + course_mult + ending_mult 93 | self.stpp_cfg = (starting_parts, course_parts, ending_parts) 94 | 95 | self.act_slice = slice(0, self.act_score_len if self.sc else (self.act_score_len * self.feat_multiplier)) 96 | self.comp_slice = slice(self.act_slice.stop, self.act_slice.stop + self.comp_score_len * self.feat_multiplier) 97 | self.reg_slice = slice(self.comp_slice.stop, self.comp_slice.stop + self.reg_score_len * self.feat_multiplier) 98 | 99 | def init_weights(self): 100 | pass 101 | 102 | def forward(self, input, proposal_ticks, scaling): 103 | assert input.size(1) == self.feat_dim 104 | n_ticks = proposal_ticks.size(0) 105 | 106 | out_act_scores = torch.zeros((n_ticks, self.act_score_len)).type_as(input) 107 | raw_act_scores = input[:, self.act_slice] 108 | 109 | out_comp_scores = torch.zeros((n_ticks, self.comp_score_len)).type_as(input) 110 | raw_comp_scores = input[:, self.comp_slice] 111 | 112 | if self.with_regression: 113 | out_reg_scores = torch.zeros((n_ticks, self.reg_score_len)).type_as(input) 114 | raw_reg_scores = input[:, self.reg_slice] 115 | else: 116 | out_reg_scores = None 117 | raw_reg_scores = None 118 | 119 | def pspool(out_scores, index, raw_scores, ticks, scaling, score_len, stpp_cfg): 120 | offset = 0 121 | for stage_idx, stage_cfg in enumerate(stpp_cfg): 122 | if stage_idx == 0: 123 | s = scaling[0] 124 | elif stage_idx == len(stpp_cfg) - 1: 125 | s = scaling[1] 126 | else: 127 | s = 1.0 128 | 129 | stage_cnt = sum(stage_cfg) 130 | left = ticks[stage_idx] 131 | right = max(ticks[stage_idx] + 1, ticks[stage_idx + 1]) 132 | 133 | if right <= 0 or left >= raw_scores.size(0): 134 | offset += stage_cnt 135 | continue 136 | for n_part in stage_cfg: 137 | part_ticks = np.arange(left, right + 1e-5, (right - left) / n_part) 138 | for i in range(n_part): 139 | pl = int(part_ticks[i]) 140 | pr = int(part_ticks[i + 1]) 141 | if pr - pl >= 1: 142 | out_scores[index, :] += raw_scores[pl:pr, 143 | offset * score_len: (offset + 1) * score_len].mean(dim=0) * s 144 | offset += 1 145 | 146 | for i in range(n_ticks): 147 | ticks = proposal_ticks[i].cpu().numpy() 148 | if self.sc: 149 | out_act_scores[i, :] = raw_act_scores[ticks[1]: max(ticks[1] + 1, ticks[2]), :].mean(dim=0) 150 | else: 151 | pspool(out_act_scores, i, raw_act_scores, ticks, scaling[i], self.act_score_len, self.stpp_cfg) 152 | 153 | pspool(out_comp_scores, i, raw_comp_scores, ticks, scaling[i], self.comp_score_len, self.stpp_cfg) 154 | 155 | if self.with_regression: 156 | pspool(out_reg_scores, i, raw_reg_scores, ticks, scaling[i], self.reg_score_len, self.stpp_cfg) 157 | 158 | return out_act_scores, out_comp_scores, out_reg_scores 159 | -------------------------------------------------------------------------------- /mmaction/models/recognizers/TSN2D.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from .base import BaseRecognizer 3 | from .. 
import builder 4 | from ..registry import RECOGNIZERS 5 | import torch 6 | import numpy as np 7 | 8 | 9 | @RECOGNIZERS.register_module 10 | class TSN2D(BaseRecognizer): 11 | 12 | def __init__(self, 13 | backbone, 14 | necks=None, 15 | modality='RGB', 16 | in_channels=3, 17 | spatial_temporal_module=None, 18 | segmental_consensus=None, 19 | fcn_testing=False, 20 | flip=False, 21 | cls_head=None, 22 | train_cfg=None, 23 | test_cfg=None): 24 | 25 | super(TSN2D, self).__init__() 26 | self.backbone = builder.build_backbone(backbone) 27 | self.modality = modality 28 | self.in_channels = in_channels 29 | if necks is not None: 30 | self.necks = builder.build_neck(necks) 31 | else: 32 | self.necks = None 33 | 34 | if spatial_temporal_module is not None: 35 | self.spatial_temporal_module = builder.build_spatial_temporal_module( 36 | spatial_temporal_module) 37 | else: 38 | raise NotImplementedError 39 | 40 | if segmental_consensus is not None: 41 | self.segmental_consensus = builder.build_segmental_consensus( 42 | segmental_consensus) 43 | else: 44 | raise NotImplementedError 45 | 46 | if cls_head is not None: 47 | self.cls_head = builder.build_head(cls_head) 48 | else: 49 | raise NotImplementedError 50 | 51 | self.train_cfg = train_cfg 52 | self.test_cfg = test_cfg 53 | self.fcn_testing = fcn_testing 54 | self.flip = flip 55 | assert modality in ['RGB', 'Flow', 'RGBDiff'] 56 | 57 | self.init_weights() 58 | 59 | @property 60 | def with_spatial_temporal_module(self): 61 | return hasattr(self, 'spatial_temporal_module') and self.spatial_temporal_module is not None 62 | 63 | @property 64 | def with_segmental_consensus(self): 65 | return hasattr(self, 'segmental_consensus') and self.segmental_consensus is not None 66 | 67 | @property 68 | def with_cls_head(self): 69 | return hasattr(self, 'cls_head') and self.cls_head is not None 70 | 71 | def init_weights(self): 72 | super(TSN2D, self).init_weights() 73 | self.backbone.init_weights() 74 | 75 | if self.with_spatial_temporal_module: 76 | self.spatial_temporal_module.init_weights() 77 | 78 | if self.with_segmental_consensus: 79 | self.segmental_consensus.init_weights() 80 | 81 | if self.with_cls_head: 82 | self.cls_head.init_weights() 83 | 84 | if self.necks is not None: 85 | self.necks.init_weights() 86 | 87 | def extract_feat(self, img_group): 88 | x = self.backbone(img_group) 89 | return x 90 | 91 | def forward_train(self, 92 | num_modalities, 93 | img_meta, 94 | gt_label, 95 | **kwargs): 96 | assert num_modalities == 1 97 | img_group = kwargs['img_group_0'] 98 | 99 | bs = img_group.shape[0] 100 | img_group = img_group.reshape( 101 | (-1, self.in_channels) + img_group.shape[3:]) 102 | num_seg = img_group.shape[0] // bs 103 | 104 | x = self.extract_feat(img_group) 105 | if self.necks is not None: 106 | x = [each.reshape((-1, num_seg) + each.shape[1:]).transpose(1, 2) for each in x] 107 | x, aux_losses = self.necks(x, gt_label.squeeze()) 108 | x = x.squeeze(2) 109 | num_seg = 1 110 | 111 | if self.with_spatial_temporal_module: 112 | x = self.spatial_temporal_module(x) 113 | x = x.reshape((-1, num_seg) + x.shape[1:]) 114 | if self.with_segmental_consensus: 115 | x = self.segmental_consensus(x) 116 | x = x.squeeze(1) 117 | losses = dict() 118 | if self.with_cls_head: 119 | cls_score = self.cls_head(x) 120 | gt_label = gt_label.squeeze() 121 | loss_cls = self.cls_head.loss(cls_score, gt_label) 122 | losses.update(loss_cls) 123 | if self.necks is not None: 124 | if aux_losses is not None: 125 | losses.update(aux_losses) 126 | return losses 127 | 128 | def 
forward_test(self, 129 | num_modalities, 130 | img_meta, 131 | **kwargs): 132 | if not self.fcn_testing: 133 | # 1crop * 1clip 134 | assert num_modalities == 1 135 | img_group = kwargs['img_group_0'] 136 | 137 | bs = img_group.shape[0] 138 | img_group = img_group.reshape( 139 | (-1, self.in_channels) + img_group.shape[3:]) 140 | num_seg = img_group.shape[0] // bs 141 | 142 | x = self.extract_feat(img_group) 143 | 144 | if self.necks is not None: 145 | x = [each.reshape((-1, num_seg) + each.shape[1:]).transpose(1, 2) for each in x] 146 | x, _ = self.necks(x) 147 | x = x.squeeze(2) 148 | num_seg = 1 149 | 150 | if self.with_spatial_temporal_module: 151 | x = self.spatial_temporal_module(x) 152 | x = x.reshape((-1, num_seg) + x.shape[1:]) 153 | if self.with_segmental_consensus: 154 | x = self.segmental_consensus(x) 155 | x = x.squeeze(1) 156 | if self.with_cls_head: 157 | x = self.cls_head(x) 158 | 159 | return x.cpu().numpy() 160 | else: 161 | # fcn testing 162 | assert num_modalities == 1 163 | img_group = kwargs['img_group_0'] 164 | 165 | bs = img_group.shape[0] 166 | img_group = img_group.reshape( 167 | (-1, self.in_channels) + img_group.shape[3:]) 168 | # standard protocol i.e. 3 crops * 2 clips 169 | num_seg = self.backbone.nsegments * 2 170 | # 3 crops to cover full resolution 171 | num_crops = 3 172 | img_group = img_group.reshape((num_crops, num_seg) + img_group.shape[1:]) 173 | 174 | x1 = img_group[:, ::2, :, :, :] 175 | x2 = img_group[:, 1::2, :, :, :] 176 | img_group = torch.cat([x1, x2], 0) 177 | num_seg = num_seg // 2 178 | num_clips = img_group.shape[0] 179 | img_group = img_group.view(num_clips * num_seg, img_group.shape[2], img_group.shape[3], img_group.shape[4]) 180 | 181 | if self.flip: 182 | img_group = self.extract_feat(torch.flip(img_group, [-1])) 183 | x = self.extract_feat(img_group) 184 | if self.necks is not None: 185 | x = [each.reshape((-1, num_seg) + each.shape[1:]).transpose(1, 2) for each in x] 186 | x, _ = self.necks(x) 187 | else: 188 | x = x.reshape((-1, num_seg) + x.shape[1:]).transpose(1, 2) 189 | x = self.cls_head(x) 190 | 191 | prob = torch.nn.functional.softmax(x.mean([2, 3, 4]), 1).mean(0, keepdim=True).detach().cpu().numpy() 192 | return prob 193 | -------------------------------------------------------------------------------- /mmaction/losses/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def weighted_nll_loss(pred, label, weight, avg_factor=None): 6 | if avg_factor is None: 7 | avg_factor = max(torch.sum(weight > 0).float().item(), 1.) 8 | raw = F.nll_loss(pred, label, reduction='none') 9 | return torch.sum(raw * weight)[None] / avg_factor 10 | 11 | 12 | def weighted_cross_entropy(pred, label, weight, avg_factor=None, reduce=True): 13 | if avg_factor is None: 14 | avg_factor = max(torch.sum(weight > 0).float().item(), 1.) 15 | raw = F.cross_entropy(pred, label, reduction='none') 16 | if reduce: 17 | return torch.sum(raw * weight)[None] / avg_factor 18 | else: 19 | return raw * weight / avg_factor 20 | 21 | 22 | def weighted_binary_cross_entropy(pred, label, weight, avg_factor=None): 23 | if pred.dim() != label.dim(): 24 | label, weight = _expand_binary_labels(label, weight, pred.size(-1)) 25 | if avg_factor is None: 26 | avg_factor = max(torch.sum(weight > 0).float().item(), 1.) 
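    # The default normaliser counts how many entries carry a positive weight and clamps
    # that count to at least 1, so the division below can never be by zero; dividing the
    # summed element-wise BCE by it turns the 'sum' reduction into a weighted mean.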
27 | return F.binary_cross_entropy_with_logits( 28 | pred, label.float(), weight.float(), 29 | reduction='sum')[None] / avg_factor 30 | 31 | 32 | def smooth_l1_loss(pred, target, beta=1.0, reduction='mean'): 33 | assert beta > 0 34 | assert pred.size() == target.size() and target.numel() > 0 35 | diff = torch.abs(pred - target) 36 | loss = torch.where(diff < beta, 0.5 * diff * diff / beta, 37 | diff - 0.5 * beta) 38 | reduction_enum = F._Reduction.get_enum(reduction) 39 | # none: 0, mean: 1, sum: 2 40 | if reduction_enum == 0: 41 | return loss 42 | elif reduction_enum == 1: 43 | return loss.sum() / pred.numel() 44 | elif reduction_enum == 2: 45 | return loss.sum() 46 | 47 | 48 | def weighted_smoothl1(pred, target, weight, beta=1.0, avg_factor=None): 49 | if avg_factor is None: 50 | avg_factor = torch.sum(weight > 0).float().item() / 4 + 1e-6 51 | loss = smooth_l1_loss(pred, target, beta, reduction='none') 52 | return torch.sum(loss * weight)[None] / avg_factor 53 | 54 | 55 | def accuracy(pred, target, topk=1): 56 | if isinstance(topk, int): 57 | topk = (topk,) 58 | return_single = True 59 | else: 60 | return_single = False 61 | 62 | maxk = max(topk) 63 | _, pred_label = pred.topk(maxk, 1, True, True) 64 | pred_label = pred_label.t() 65 | correct = pred_label.eq(target.view(1, -1).expand_as(pred_label)) 66 | 67 | res = [] 68 | for k in topk: 69 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 70 | res.append(correct_k.mul_(100.0 / pred.size(0))) 71 | return res[0] if return_single else res 72 | 73 | 74 | def _expand_binary_labels(labels, label_weights, label_channels): 75 | bin_labels = labels.new_full((labels.size(0), label_channels), 0) 76 | inds = torch.nonzero(labels >= 1).squeeze() 77 | if inds.numel() > 0: 78 | bin_labels[inds, labels[inds] - 1] = 1 79 | bin_label_weights = label_weights.view(-1, 1).expand( 80 | label_weights.size(0), label_channels) 81 | return bin_labels, bin_label_weights 82 | 83 | 84 | def weighted_multilabel_binary_cross_entropy( 85 | pred, label, weight, avg_factor=None): 86 | label, weight = _expand_multilabel_binary_labels( 87 | label, weight, pred.size(-1)) 88 | if avg_factor is None: 89 | avg_factor = max(torch.sum(weight > 0).float().item(), 1.) 
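    # Mirrors the normalisation used in weighted_binary_cross_entropy above: the summed BCE
    # over the binary targets produced by _expand_multilabel_binary_labels is divided by the
    # number of positively weighted entries, clamped to at least 1.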
90 | return F.binary_cross_entropy_with_logits( 91 | pred, label.float(), weight.float(), 92 | reduction='sum')[None] / avg_factor 93 | 94 | 95 | def _expand_multilabel_binary_labels(labels, label_weights, label_channels): 96 | bin_labels = labels.new_full((labels.size(0), label_channels), 0) 97 | inds = torch.nonzero(labels >= 1) 98 | if inds.numel() > 0: 99 | for ind in inds: 100 | # note that labels starts from 1 101 | bin_labels[ind[0], labels[ind[0], ind[1]] - 1] = 1 102 | # bin_labels[ind[0], 0] = 1 103 | bin_label_weights = label_weights 104 | return bin_labels, bin_label_weights 105 | 106 | 107 | def multilabel_accuracy(pred, target, topk=1, thr=0.5): 108 | if topk is None: 109 | topk = () 110 | elif isinstance(topk, int): 111 | topk = (topk,) 112 | 113 | pred = pred.sigmoid() 114 | pred_bin_labels = pred.new_full((pred.size(0),), 0, dtype=torch.long) 115 | pred_vec_labels = pred.new_full(pred.size(), 0, dtype=torch.long) 116 | for i in range(pred.size(0)): 117 | inds = torch.nonzero(pred[i, 1:] > thr).squeeze() + 1 118 | if inds.numel() > 0: 119 | pred_vec_labels[i, inds] = 1 120 | # pred_bin_labels[i] = 1 121 | if pred[i, 0] > thr: 122 | pred_bin_labels[i] = 1 123 | target_bin_labels = target.new_full( 124 | (target.size(0),), 0, dtype=torch.long) 125 | target_vec_labels = target.new_full(target.size(), 0, dtype=torch.long) 126 | for i in range(target.size(0)): 127 | inds = torch.nonzero(target[i, :] >= 1).squeeze() 128 | if inds.numel() > 0: 129 | target_vec_labels[i, target[i, inds]] = 1 130 | target_bin_labels[i] = 1 131 | # overall accuracy 132 | correct = pred_bin_labels.eq(target_bin_labels) 133 | acc = correct.float().sum(0, keepdim=True).mul_(100.0 / correct.size(0)) 134 | 135 | # def overlap(tensor1, tensor2): 136 | # indices = tensor1.new_zeros(tensor1).astype(torch.uint8) 137 | # for elem in tensor2: 138 | # indices = indices | (tensor1 == elem) 139 | # return tensor1[indices] 140 | 141 | # recall@thr 142 | recall_thr, prec_thr = recall_prec(pred_vec_labels, target_vec_labels) 143 | 144 | # recall@k 145 | recalls = [] 146 | precs = [] 147 | for k in topk: 148 | _, pred_label = pred.topk(k, 1, True, True) 149 | pred_vec_labels = pred.new_full(pred.size(), 0, dtype=torch.long) 150 | for i in range(pred.size(0)): 151 | pred_vec_labels[i, pred_label[i]] = 1 152 | recall_k, prec_k = recall_prec(pred_vec_labels, target_vec_labels) 153 | recalls.append(recall_k) 154 | precs.append(prec_k) 155 | 156 | return acc, recall_thr, prec_thr, recalls, precs 157 | 158 | 159 | def recall_prec(pred_vec, target_vec): 160 | """ 161 | Args: 162 | pred_vec: (n, C+1), each element is either 0 or 1 163 | target_vec: (n, C+1), each element is either 0 or 1 164 | 165 | Returns: 166 | recall 167 | prec 168 | """ 169 | recall = pred_vec.new_full((pred_vec.size(0),), 0).float() 170 | prec = pred_vec.new_full((pred_vec.size(0),), 0).float() 171 | num_pos = 0 172 | for i in range(target_vec.size(0)): 173 | if target_vec[i, :].float().sum(0) == 0: 174 | continue 175 | correct_labels = pred_vec[i, :] & target_vec[i, :] 176 | recall[i] = correct_labels.float().sum(0, keepdim=True) / \ 177 | target_vec[i, :].float().sum(0, keepdim=True) 178 | prec[i] = correct_labels.float().sum(0, keepdim=True) / \ 179 | (pred_vec[i, :].float().sum(0, keepdim=True) + 1e-6) 180 | num_pos += 1 181 | recall = recall.float().sum(0, keepdim=True).mul_(100. / num_pos) 182 | prec = prec.float().sum(0, keepdim=True).mul_(100. 
/ num_pos) 183 | return recall, prec 184 | -------------------------------------------------------------------------------- /mmaction/apis/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import re 3 | from collections import OrderedDict 4 | 5 | import torch 6 | from mmcv.runner import Runner, DistSamplerSeedHook, obj_from_dict 7 | from mmcv.parallel import MMDataParallel, MMDistributedDataParallel 8 | 9 | from mmaction.core import (DistOptimizerHook, DistEvalTopKAccuracyHook, 10 | ) 11 | from mmaction.datasets import build_dataloader 12 | from .env import get_root_logger 13 | 14 | 15 | def parse_losses(losses): 16 | log_vars = OrderedDict() 17 | for loss_name, loss_value in losses.items(): 18 | if isinstance(loss_value, torch.Tensor): 19 | log_vars[loss_name] = loss_value.mean() 20 | elif isinstance(loss_value, list): 21 | log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) 22 | else: 23 | raise TypeError( 24 | '{} is not a tensor or list of tensors'.format(loss_name)) 25 | 26 | loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key) 27 | 28 | log_vars['loss'] = loss 29 | for name in log_vars: 30 | log_vars[name] = log_vars[name].item() 31 | 32 | return loss, log_vars 33 | 34 | 35 | def batch_processor(model, data, train_mode): 36 | losses = model(**data) 37 | loss, log_vars = parse_losses(losses) 38 | 39 | outputs = dict( 40 | loss=loss, log_vars=log_vars, 41 | num_samples=len(data['img_group_0'].data)) 42 | 43 | return outputs 44 | 45 | 46 | def train_network(model, 47 | dataset, 48 | cfg, 49 | distributed=False, 50 | validate=False, 51 | logger=None): 52 | if logger is None: 53 | logger = get_root_logger(cfg.log_level) 54 | 55 | # start training 56 | if distributed: 57 | _dist_train(model, dataset, cfg, validate=validate) 58 | else: 59 | _non_dist_train(model, dataset, cfg, validate=validate) 60 | 61 | 62 | def build_optimizer(model, optimizer_cfg): 63 | """Build optimizer from configs. 64 | Args: 65 | model (:obj:`nn.Module`): The model with parameters to be optimized. 66 | optimizer_cfg (dict): The config dict of the optimizer. 67 | Positional fields are: 68 | - type: class name of the optimizer. 69 | - lr: base learning rate. 70 | Optional fields are: 71 | - any arguments of the corresponding optimizer type, e.g., 72 | weight_decay, momentum, etc. 73 | - paramwise_options: a dict with 3 accepted fileds 74 | (bias_lr_mult, bias_decay_mult, norm_decay_mult). 75 | `bias_lr_mult` and `bias_decay_mult` will be multiplied to 76 | the lr and weight decay respectively for all bias parameters 77 | (except for the normalization layers), and 78 | `norm_decay_mult` will be multiplied to the weight decay 79 | for all weight and bias parameters of normalization layers. 80 | Returns: 81 | torch.optim.Optimizer: The initialized optimizer. 
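    Example (illustrative only; any optimizer class from torch.optim works the same way):
        >>> optimizer_cfg = dict(
        ...     type='SGD', lr=0.01, momentum=0.9, weight_decay=1e-4,
        ...     paramwise_options=dict(bias_lr_mult=2., bias_decay_mult=0.))
        >>> optimizer = build_optimizer(model, optimizer_cfg)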
82 | """ 83 | if hasattr(model, 'module'): 84 | model = model.module 85 | 86 | optimizer_cfg = optimizer_cfg.copy() 87 | paramwise_options = optimizer_cfg.pop('paramwise_options', None) 88 | # if no paramwise option is specified, just use the global setting 89 | if paramwise_options is None: 90 | return obj_from_dict(optimizer_cfg, torch.optim, 91 | dict(params=model.parameters())) 92 | else: 93 | assert isinstance(paramwise_options, dict) 94 | # get base lr and weight decay 95 | base_lr = optimizer_cfg['lr'] 96 | base_wd = optimizer_cfg.get('weight_decay', None) 97 | # weight_decay must be explicitly specified if mult is specified 98 | if ('bias_decay_mult' in paramwise_options 99 | or 'norm_decay_mult' in paramwise_options): 100 | assert base_wd is not None 101 | # get param-wise options 102 | bias_lr_mult = paramwise_options.get('bias_lr_mult', 1.) 103 | bias_decay_mult = paramwise_options.get('bias_decay_mult', 1.) 104 | norm_decay_mult = paramwise_options.get('norm_decay_mult', 1.) 105 | # set param-wise lr and weight decay 106 | params = [] 107 | for name, param in model.named_parameters(): 108 | param_group = {'params': [param]} 109 | if not param.requires_grad: 110 | # FP16 training needs to copy gradient/weight between master 111 | # weight copy and model weight, it is convenient to keep all 112 | # parameters here to align with model.parameters() 113 | params.append(param_group) 114 | continue 115 | 116 | # for norm layers, overwrite the weight decay of weight and bias 117 | # TODO: obtain the norm layer prefixes dynamically 118 | if re.search(r'(bn|gn)(\d+)?.(weight|bias)', name): 119 | if base_wd is not None: 120 | param_group['weight_decay'] = base_wd * norm_decay_mult 121 | # for other layers, overwrite both lr and weight decay of bias 122 | elif name.endswith('.bias'): 123 | param_group['lr'] = base_lr * bias_lr_mult 124 | if base_wd is not None: 125 | param_group['weight_decay'] = base_wd * bias_decay_mult 126 | # otherwise use the global settings 127 | 128 | params.append(param_group) 129 | 130 | optimizer_cls = getattr(torch.optim, optimizer_cfg.pop('type')) 131 | return optimizer_cls(params, **optimizer_cfg) 132 | 133 | 134 | def _dist_train(model, dataset, cfg, validate=False): 135 | # prepare data loaders 136 | data_loaders = [ 137 | build_dataloader( 138 | dataset, 139 | cfg.data.videos_per_gpu, 140 | cfg.data.workers_per_gpu, 141 | dist=True) 142 | ] 143 | # put model on gpus 144 | model = MMDistributedDataParallel(model.cuda()) 145 | # build runner 146 | # build runner 147 | optimizer = build_optimizer(model, cfg.optimizer) 148 | 149 | runner = Runner(model, batch_processor, optimizer, cfg.work_dir, 150 | cfg.log_level) 151 | # register hooks 152 | optimizer_config = DistOptimizerHook(**cfg.optimizer_config) 153 | runner.register_training_hooks(cfg.lr_config, optimizer_config, 154 | cfg.checkpoint_config, cfg.log_config) 155 | runner.register_hook(DistSamplerSeedHook()) 156 | # register eval hooks 157 | if validate: 158 | if cfg.data.val.type in ['RawFramesDataset']: 159 | runner.register_hook( 160 | DistEvalTopKAccuracyHook(cfg.data.val, k=(1, 5))) 161 | 162 | if cfg.resume_from: 163 | runner.resume(cfg.resume_from) 164 | elif cfg.load_from: 165 | runner.load_checkpoint(cfg.load_from) 166 | runner.run(data_loaders, cfg.workflow, cfg.total_epochs) 167 | 168 | 169 | def _non_dist_train(model, dataset, cfg, validate=False): 170 | # prepare data loaders 171 | data_loaders = [ 172 | build_dataloader( 173 | dataset, 174 | cfg.data.videos_per_gpu, 175 | 
cfg.data.workers_per_gpu, 176 | cfg.gpus, 177 | dist=False) 178 | ] 179 | # put model on gpus 180 | model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() 181 | # build runner 182 | runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir, 183 | cfg.log_level) 184 | runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config, 185 | cfg.checkpoint_config, cfg.log_config) 186 | 187 | if cfg.resume_from: 188 | runner.resume(cfg.resume_from) 189 | elif cfg.load_from: 190 | runner.load_checkpoint(cfg.load_from) 191 | runner.run(data_loaders, cfg.workflow, cfg.total_epochs) 192 | -------------------------------------------------------------------------------- /demo/category.txt: -------------------------------------------------------------------------------- 1 | Approaching something with your camera 2 | Attaching something to something 3 | Bending something so that it deforms 4 | Bending something until it breaks 5 | Burying something in something 6 | Closing something 7 | Covering something with something 8 | Digging something out of something 9 | Dropping something behind something 10 | Dropping something in front of something 11 | Dropping something into something 12 | Dropping something next to something 13 | Dropping something onto something 14 | Failing to put something into something because something does not fit 15 | Folding something 16 | Hitting something with something 17 | Holding something 18 | Holding something behind something 19 | Holding something in front of something 20 | Holding something next to something 21 | Holding something over something 22 | Laying something on the table on its side, not upright 23 | Letting something roll along a flat surface 24 | Letting something roll down a slanted surface 25 | Letting something roll up a slanted surface, so it rolls back down 26 | Lifting a surface with something on it but not enough for it to slide down 27 | Lifting a surface with something on it until it starts sliding down 28 | Lifting something up completely without letting it drop down 29 | Lifting something up completely, then letting it drop down 30 | Lifting something with something on it 31 | Lifting up one end of something without letting it drop down 32 | Lifting up one end of something, then letting it drop down 33 | Moving away from something with your camera 34 | Moving part of something 35 | Moving something across a surface until it falls down 36 | Moving something across a surface without it falling down 37 | Moving something and something away from each other 38 | Moving something and something closer to each other 39 | Moving something and something so they collide with each other 40 | Moving something and something so they pass each other 41 | Moving something away from something 42 | Moving something away from the camera 43 | Moving something closer to something 44 | Moving something down 45 | Moving something towards the camera 46 | Moving something up 47 | Opening something 48 | Picking something up 49 | Piling something up 50 | Plugging something into something 51 | Plugging something into something but pulling it right out as you remove your hand 52 | Poking a hole into some substance 53 | Poking a hole into something soft 54 | Poking a stack of something so the stack collapses 55 | Poking a stack of something without the stack collapsing 56 | Poking something so it slightly moves 57 | Poking something so lightly that it doesn't or almost doesn't move 58 | Poking something so that it falls over 59 | Poking something so that it 
spins around 60 | Pouring something into something 61 | Pouring something into something until it overflows 62 | Pouring something onto something 63 | Pouring something out of something 64 | Pretending or failing to wipe something off of something 65 | Pretending or trying and failing to twist something 66 | Pretending to be tearing something that is not tearable 67 | Pretending to close something without actually closing it 68 | Pretending to open something without actually opening it 69 | Pretending to pick something up 70 | Pretending to poke something 71 | Pretending to pour something out of something, but something is empty 72 | Pretending to put something behind something 73 | Pretending to put something into something 74 | Pretending to put something next to something 75 | Pretending to put something on a surface 76 | Pretending to put something onto something 77 | Pretending to put something underneath something 78 | Pretending to scoop something up with something 79 | Pretending to spread air onto something 80 | Pretending to sprinkle air onto something 81 | Pretending to squeeze something 82 | Pretending to take something from somewhere 83 | Pretending to take something out of something 84 | Pretending to throw something 85 | Pretending to turn something upside down 86 | Pulling something from behind of something 87 | Pulling something from left to right 88 | Pulling something from right to left 89 | Pulling something onto something 90 | Pulling something out of something 91 | Pulling two ends of something but nothing happens 92 | Pulling two ends of something so that it gets stretched 93 | Pulling two ends of something so that it separates into two pieces 94 | Pushing something from left to right 95 | Pushing something from right to left 96 | Pushing something off of something 97 | Pushing something onto something 98 | Pushing something so it spins 99 | Pushing something so that it almost falls off but doesn't 100 | Pushing something so that it falls off the table 101 | Pushing something so that it slightly moves 102 | Pushing something with something 103 | Putting number of something onto something 104 | Putting something and something on the table 105 | Putting something behind something 106 | Putting something in front of something 107 | Putting something into something 108 | Putting something next to something 109 | Putting something on a flat surface without letting it roll 110 | Putting something on a surface 111 | Putting something on the edge of something so it is not supported and falls down 112 | Putting something onto a slanted surface but it doesn't glide down 113 | Putting something onto something 114 | Putting something onto something else that cannot support it so it falls down 115 | Putting something similar to other things that are already on the table 116 | Putting something that can't roll onto a slanted surface, so it slides down 117 | Putting something that can't roll onto a slanted surface, so it stays where it is 118 | Putting something that cannot actually stand upright upright on the table, so it falls on its side 119 | Putting something underneath something 120 | Putting something upright on the table 121 | Putting something, something and something on the table 122 | Removing something, revealing something behind 123 | Rolling something on a flat surface 124 | Scooping something up with something 125 | Showing a photo of something to the camera 126 | Showing something behind something 127 | Showing something next to something 128 | Showing something on 
top of something 129 | Showing something to the camera 130 | Showing that something is empty 131 | Showing that something is inside something 132 | Something being deflected from something 133 | Something colliding with something and both are being deflected 134 | Something colliding with something and both come to a halt 135 | Something falling like a feather or paper 136 | Something falling like a rock 137 | Spilling something behind something 138 | Spilling something next to something 139 | Spilling something onto something 140 | Spinning something so it continues spinning 141 | Spinning something that quickly stops spinning 142 | Spreading something onto something 143 | Sprinkling something onto something 144 | Squeezing something 145 | Stacking number of something 146 | Stuffing something into something 147 | Taking one of many similar things on the table 148 | Taking something from somewhere 149 | Taking something out of something 150 | Tearing something into two pieces 151 | Tearing something just a little bit 152 | Throwing something 153 | Throwing something against something 154 | Throwing something in the air and catching it 155 | Throwing something in the air and letting it fall 156 | Throwing something onto a surface 157 | Tilting something with something on it slightly so it doesn't fall down 158 | Tilting something with something on it until it falls off 159 | Tipping something over 160 | Tipping something with something in it over, so something in it falls out 161 | Touching (without moving) part of something 162 | Trying but failing to attach something to something because it doesn't stick 163 | Trying to bend something unbendable so nothing happens 164 | Trying to pour something into something, but missing so it spills next to it 165 | Turning something upside down 166 | Turning the camera downwards while filming something 167 | Turning the camera left while filming something 168 | Turning the camera right while filming something 169 | Turning the camera upwards while filming something 170 | Twisting (wringing) something wet until water comes out 171 | Twisting something 172 | Uncovering something 173 | Unfolding something 174 | Wiping something off of something -------------------------------------------------------------------------------- /tools/test_recognizer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import mmcv 5 | import tempfile 6 | import os.path as osp 7 | import torch.distributed as dist 8 | import shutil 9 | from mmcv.runner import load_checkpoint, parallel_test, obj_from_dict, get_dist_info 10 | from mmcv.parallel import scatter, collate, MMDataParallel, MMDistributedDataParallel 11 | from mmaction.apis import init_dist 12 | from mmaction import datasets 13 | from mmaction.datasets import build_dataloader 14 | from mmaction.models import build_recognizer, recognizers 15 | from mmaction.core.evaluation.accuracy import (softmax, top_k_accuracy, non_mean_class_accuracy, 16 | mean_class_accuracy) 17 | 18 | 19 | def single_test(model, data_loader): 20 | model.eval() 21 | results = [] 22 | dataset = data_loader.dataset 23 | prog_bar = mmcv.ProgressBar(len(dataset)) 24 | for i, data in enumerate(data_loader): 25 | with torch.no_grad(): 26 | data['get_logit'] = True 27 | result = model(return_loss=False, **data) 28 | results.append(result) 29 | 30 | batch_size = data['img_group_0'].data[0].size(0) 31 | for _ in range(batch_size): 32 | prog_bar.update() 33 | return results 34 | 35 | 
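# Illustrative single-GPU usage of single_test (a sketch only; it mirrors the
# non-distributed branch of main() further below, where cfg, dataset and args
# have already been built):
#
#     model = build_recognizer(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
#     load_checkpoint(model, args.checkpoint, strict=False, map_location='cpu')
#     model = MMDataParallel(model, device_ids=[0])
#     data_loader = build_dataloader(dataset, imgs_per_gpu=1,
#                                    workers_per_gpu=cfg.data.workers_per_gpu,
#                                    num_gpus=1, dist=False, shuffle=False)
#     outputs = single_test(model, data_loader)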
36 | def _data_func(data, device_id): 37 | data = scatter(collate([data], samples_per_gpu=1), [device_id])[0] 38 | return dict(return_loss=False, rescale=True, **data) 39 | 40 | 41 | def multi_gpu_test(model, data_loader, tmpdir=None): 42 | model.eval() 43 | results = [] 44 | dataset = data_loader.dataset 45 | rank, world_size = get_dist_info() 46 | if rank == 0: 47 | prog_bar = mmcv.ProgressBar(len(dataset)) 48 | for i, data in enumerate(data_loader): 49 | with torch.no_grad(): 50 | # data['get_logit'] = True 51 | result = model(return_loss=False, rescale=True, **data) 52 | results.append(result) 53 | 54 | if rank == 0: 55 | batch_size = data['img_group_0'].data[0].size(0) 56 | for _ in range(batch_size * world_size): 57 | prog_bar.update() 58 | 59 | # collect results from all ranks 60 | results = collect_results(results, len(dataset), tmpdir) 61 | 62 | return results 63 | 64 | 65 | def collect_results(result_part, size, tmpdir=None): 66 | rank, world_size = get_dist_info() 67 | # create a tmp dir if it is not specified 68 | if tmpdir is None: 69 | MAX_LEN = 512 70 | # 32 is whitespace 71 | dir_tensor = torch.full( 72 | (MAX_LEN,), 32, dtype=torch.uint8, device='cuda') 73 | if rank == 0: 74 | tmpdir = tempfile.mkdtemp() 75 | print('temp_dir', tmpdir) 76 | tmpdir = torch.tensor( 77 | bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') 78 | dir_tensor[:len(tmpdir)] = tmpdir 79 | dist.broadcast(dir_tensor, 0) 80 | tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() 81 | else: 82 | mmcv.mkdir_or_exist(tmpdir) 83 | # dump the part result to the dir 84 | mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank))) 85 | dist.barrier() 86 | # collect all parts 87 | if rank != 0: 88 | return None 89 | else: 90 | # load results of all parts from tmp dir 91 | part_list = [] 92 | for i in range(world_size): 93 | part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i)) 94 | part_list.append(mmcv.load(part_file)) 95 | # sort the results 96 | ordered_results = [] 97 | for res in zip(*part_list): 98 | ordered_results.extend(list(res)) 99 | # the dataloader may pad some samples 100 | ordered_results = ordered_results[:size] 101 | # remove tmp dir 102 | shutil.rmtree(tmpdir) 103 | return ordered_results 104 | 105 | 106 | def parse_args(): 107 | parser = argparse.ArgumentParser(description='Test an action recognizer') 108 | parser.add_argument('config', help='test config file path') 109 | parser.add_argument('checkpoint', help='checkpoinls' 110 | 't file') 111 | parser.add_argument( 112 | '--gpus', default=8, type=int, help='GPU number used for testing') 113 | parser.add_argument( 114 | '--proc_per_gpu', 115 | default=1, 116 | type=int, 117 | help='Number of processes per GPU') 118 | parser.add_argument('--out', help='output result file') 119 | parser.add_argument('--log', help='output log file') 120 | parser.add_argument('--fcn_testing', action='store_true', default=False, 121 | help='whether to use fcn testing') 122 | parser.add_argument('--flip', action='store_true', default=False, 123 | help='whether to flip videos') 124 | parser.add_argument('--tmpdir', help='tmp dir for writing some results') 125 | parser.add_argument( 126 | '--launcher', 127 | choices=['none', 'pytorch', 'slurm', 'mpi'], 128 | default='none', 129 | help='job launcher') 130 | parser.add_argument('--local_rank', type=int, default=0) 131 | parser.add_argument( 132 | '--ignore_cache', action='store_true', help='whether to ignore cache') 133 | args = parser.parse_args() 134 | print('args==>>', args) 135 | return 
args 136 | 137 | 138 | def main(): 139 | args = parse_args() 140 | 141 | assert args.out, ('Please specify the output path for results') 142 | 143 | if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): 144 | raise ValueError('The output file must be a pkl file.') 145 | 146 | cfg = mmcv.Config.fromfile(args.config) 147 | # set cudnn_benchmark 148 | if cfg.get('cudnn_benchmark', False): 149 | torch.backends.cudnn.benchmark = True 150 | cfg.data.test.test_mode = True 151 | 152 | if args.launcher == 'none': 153 | distributed = False 154 | else: 155 | distributed = True 156 | init_dist(args.launcher, **cfg.dist_params) 157 | 158 | if cfg.model.get('necks', None) is not None: 159 | cfg.model.necks.aux_head_config = None 160 | 161 | if cfg.data.test.oversample == 'three_crop': 162 | cfg.model.spatial_temporal_module.spatial_size = 8 163 | if args.fcn_testing: 164 | cfg.model['cls_head'].update({'fcn_testing': True}) 165 | cfg.model.update({'fcn_testing': True}) 166 | if args.flip: 167 | cfg.model.update({'flip': True}) 168 | 169 | dataset = obj_from_dict(cfg.data.test, datasets, dict(test_mode=True)) 170 | 171 | if args.ignore_cache and args.out is not None: 172 | if not distributed: 173 | if args.gpus == 1: 174 | model = build_recognizer( 175 | cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) 176 | load_checkpoint(model, args.checkpoint, strict=False, map_location='cpu') 177 | model = MMDataParallel(model, device_ids=[0]) 178 | 179 | data_loader = build_dataloader( 180 | dataset, 181 | imgs_per_gpu=1, 182 | workers_per_gpu=cfg.data.workers_per_gpu, 183 | num_gpus=1, 184 | dist=False, 185 | shuffle=False) 186 | outputs = single_test(model, data_loader) 187 | else: 188 | model_args = cfg.model.copy() 189 | model_args.update(train_cfg=None, test_cfg=cfg.test_cfg) 190 | model_type = getattr(recognizers, model_args.pop('type')) 191 | 192 | outputs = parallel_test( 193 | model_type, 194 | model_args, 195 | args.checkpoint, 196 | dataset, 197 | _data_func, 198 | range(args.gpus), 199 | workers_per_gpu=args.proc_per_gpu) 200 | else: 201 | data_loader = build_dataloader( 202 | dataset, 203 | imgs_per_gpu=1, 204 | workers_per_gpu=cfg.data.workers_per_gpu, 205 | dist=distributed, 206 | shuffle=False) 207 | model = build_recognizer( 208 | cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) 209 | load_checkpoint(model, args.checkpoint, strict=False, map_location='cpu') 210 | model = MMDistributedDataParallel(model.cuda()) 211 | outputs = multi_gpu_test(model, data_loader, args.tmpdir) 212 | else: 213 | try: 214 | if distributed: 215 | rank, _ = get_dist_info() 216 | if rank == 0: 217 | outputs = mmcv.load(args.out) 218 | else: 219 | outputs = mmcv.load(args.out) 220 | except: 221 | raise FileNotFoundError 222 | 223 | rank, _ = get_dist_info() 224 | if args.out: 225 | if rank == 0: 226 | print('writing results to {}'.format(args.out)) 227 | mmcv.dump(outputs, args.out) 228 | gt_labels = [] 229 | for i in range(len(dataset)): 230 | ann = dataset.get_ann_info(i) 231 | gt_labels.append(ann['label']) 232 | 233 | results = [] 234 | for res in outputs: 235 | res_list = [res[i] for i in range(res.shape[0])] 236 | results += res_list 237 | results = results[:len(gt_labels)] 238 | print('results_length', len(results)) 239 | top1, top5 = top_k_accuracy(results, gt_labels, k=(1, 5)) 240 | mean_acc = mean_class_accuracy(results, gt_labels) 241 | non_mean_acc = non_mean_class_accuracy(results, gt_labels) 242 | if args.log: 243 | f = open(args.log, 'w') 244 | f.write(f'Testing ckpt from 
{args.checkpoint}\n') 245 | f.write(f'Testing config from {args.config}\n') 246 | f.write("Mean Class Accuracy = {:.04f}\n".format(mean_acc * 100)) 247 | f.write("Top-1 Accuracy = {:.04f}\n".format(top1 * 100)) 248 | f.write("Top-5 Accuracy = {:.04f}\n".format(top5 * 100)) 249 | f.close() 250 | else: 251 | print("Mean Class Accuracy = {:.02f}".format(mean_acc * 100)) 252 | print("Top-1 Accuracy = {:.02f}".format(top1 * 100)) 253 | print("Top-5 Accuracy = {:.02f}".format(top5 * 100)) 254 | print("Non mean Class Accuracy", non_mean_acc) 255 | print('saving non_mean acc') 256 | 257 | 258 | if __name__ == '__main__': 259 | main() 260 | --------------------------------------------------------------------------------
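# Illustrative invocations of tools/test_recognizer.py (paths are placeholders; the flags are
# the ones defined in parse_args, and --ignore_cache is needed so results are recomputed
# rather than loaded back from an existing --out file):
#
#   # single node, non-distributed testing spread over 8 GPUs via mmcv's parallel_test
#   python tools/test_recognizer.py <config.py> <checkpoint.pth> \
#       --gpus 8 --ignore_cache --out results.pkl --log results.log
#
#   # one possible distributed launch, with fully-convolutional testing and horizontal flip
#   python -m torch.distributed.launch --nproc_per_node=8 tools/test_recognizer.py \
#       <config.py> <checkpoint.pth> --launcher pytorch --fcn_testing --flip \
#       --ignore_cache --out results.pkl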