├── mmaction ├── core │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ └── dist_utils.py │ └── evaluation │ │ ├── __init__.py │ │ ├── accuracy.py │ │ └── eval_hooks.py ├── models │ ├── tenons │ │ ├── necks │ │ │ └── __init__.py │ │ ├── cls_heads │ │ │ ├── __init__.py │ │ │ └── cls_head.py │ │ ├── segmental_consensuses │ │ │ ├── TODO.md │ │ │ ├── __init__.py │ │ │ ├── simple_consensus.py │ │ │ └── stpp.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── nonlocal_block.py │ │ │ ├── norm.py │ │ │ └── conv_module.py │ │ ├── backbones │ │ │ └── __init__.py │ │ └── spatial_temporal_modules │ │ │ ├── __init__.py │ │ │ ├── simple_spatial_module.py │ │ │ ├── simple_spatial_temporal_module.py │ │ │ ├── avgfusion.py │ │ │ └── non_local.py │ ├── recognizers │ │ ├── __init__.py │ │ ├── base.py │ │ ├── TSN3D.py │ │ └── TSN2D.py │ ├── __init__.py │ ├── registry.py │ └── builder.py ├── __init__.py ├── apis │ ├── __init__.py │ ├── env.py │ └── train.py ├── datasets │ ├── loader │ │ ├── __init__.py │ │ ├── build_loader.py │ │ └── sampler.py │ └── __init__.py ├── losses │ ├── __init__.py │ └── losses.py ├── README.md └── utils │ └── misc.py ├── demo ├── demo_pred.gif └── category.txt ├── docs ├── figures │ ├── empirical.png │ ├── exp_result.png │ └── framework.png ├── assets │ ├── font.css │ └── style.css └── index.html ├── .style.yapf ├── tools ├── dist_train_recognizer.sh ├── dist_test_recognizer.sh ├── extract_backbone_weights.py ├── train_recognizer.py ├── README.md └── test_recognizer.py ├── INSTALL.md ├── data └── README.md ├── .gitignore ├── README.md ├── setup.py ├── config_files ├── sthv1 │ ├── tsm_baseline.py │ └── tsm_tpn.py ├── sthv2 │ ├── tsm_baseline.py │ └── tsm_tpn.py └── kinetics400 │ ├── baseline │ ├── r101f16s4.py │ ├── r101f8s8.py │ ├── r50f8s8.py │ ├── r101f32s2.py │ ├── r50f16s4.py │ └── r50f32s2.py │ └── tpn │ ├── r50f8s8.py │ ├── r101f16s4.py │ ├── r101f8s8.py │ ├── r50f16s4.py │ ├── r101f32s2.py │ └── r50f32s2.py ├── MODELZOO.md └── test_video.py /mmaction/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluation import * 2 | from .utils import * 3 | -------------------------------------------------------------------------------- /demo/demo_pred.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decisionforce/TPN/HEAD/demo/demo_pred.gif -------------------------------------------------------------------------------- /mmaction/models/tenons/necks/__init__.py: -------------------------------------------------------------------------------- 1 | from .tpn import TPN 2 | 3 | __all__ = ['TPN'] 4 | -------------------------------------------------------------------------------- /docs/figures/empirical.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decisionforce/TPN/HEAD/docs/figures/empirical.png -------------------------------------------------------------------------------- /docs/figures/exp_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decisionforce/TPN/HEAD/docs/figures/exp_result.png -------------------------------------------------------------------------------- /docs/figures/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/decisionforce/TPN/HEAD/docs/figures/framework.png 
-------------------------------------------------------------------------------- /mmaction/__init__.py: -------------------------------------------------------------------------------- 1 | from .version import __version__, short_version 2 | 3 | __all__ = ['__version__', 'short_version'] 4 | -------------------------------------------------------------------------------- /mmaction/models/tenons/cls_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .cls_head import ClsHead 2 | 3 | __all__ = [ 4 | 'ClsHead', 5 | ] 6 | -------------------------------------------------------------------------------- /mmaction/models/tenons/segmental_consensuses/TODO.md: -------------------------------------------------------------------------------- 1 | ### TODO 2 | 3 | [x] SimpleConsensus 4 | 5 | [ ] STPP 6 | 7 | [ ] TRN 8 | -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | BASED_ON_STYLE = pep8 3 | BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true 4 | SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true 5 | -------------------------------------------------------------------------------- /mmaction/core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .dist_utils import allreduce_grads, DistOptimizerHook 2 | 3 | __all__ = [ 4 | 'allreduce_grads', 'DistOptimizerHook', 5 | ] 6 | -------------------------------------------------------------------------------- /mmaction/models/tenons/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .conv_module import ConvModule 2 | from .norm import build_norm_layer 3 | 4 | __all__ = [ 5 | 'ConvModule', 'build_norm_layer', 6 | ] 7 | -------------------------------------------------------------------------------- /mmaction/models/tenons/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_slow import ResNet_SlowFast 2 | from .resnet import ResNet 3 | 4 | __all__ = [ 5 | 'ResNet_SlowFast', 6 | 'ResNet' 7 | ] 8 | -------------------------------------------------------------------------------- /mmaction/models/recognizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseRecognizer 2 | from .TSN2D import TSN2D 3 | from .TSN3D import TSN3D 4 | 5 | __all__ = [ 6 | 'BaseRecognizer', 'TSN2D', 'TSN3D', 7 | ] 8 | -------------------------------------------------------------------------------- /mmaction/core/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .eval_hooks import (DistEvalHook, DistEvalTopKAccuracyHook, 2 | ) 3 | 4 | __all__ = [ 5 | 'DistEvalHook', 'DistEvalTopKAccuracyHook', 6 | ] 7 | -------------------------------------------------------------------------------- /tools/dist_train_recognizer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PYTHON=${PYTHON:-"python"} 4 | 5 | $PYTHON -m torch.distributed.launch --nproc_per_node=$2 $(dirname "$0")/train_recognizer.py $1 --launcher pytorch ${@:3} 6 | -------------------------------------------------------------------------------- /mmaction/apis/__init__.py: -------------------------------------------------------------------------------- 1 | from .env import 
init_dist, get_root_logger, set_random_seed 2 | from .train import train_network 3 | 4 | __all__ = [ 5 | 'init_dist', 'get_root_logger', 'set_random_seed', 6 | 'train_network', 7 | ] 8 | -------------------------------------------------------------------------------- /mmaction/datasets/loader/__init__.py: -------------------------------------------------------------------------------- 1 | from .build_loader import build_dataloader 2 | from .sampler import GroupSampler, DistributedGroupSampler 3 | 4 | __all__ = [ 5 | 'GroupSampler', 'DistributedGroupSampler', 'build_dataloader' 6 | ] 7 | -------------------------------------------------------------------------------- /tools/dist_test_recognizer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PYTHON=${PYTHON:-"python"} 4 | 5 | CONFIG=$1 6 | CHECKPOINT=$2 7 | GPUS=$3 8 | 9 | $PYTHON -m torch.distributed.launch --nproc_per_node=$GPUS \ 10 | $(dirname "$0")/test_recognizer.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} 11 | -------------------------------------------------------------------------------- /mmaction/models/tenons/utils/nonlocal_block.py: -------------------------------------------------------------------------------- 1 | from ..spatial_temporal_modules.non_local import NonLocalModule 2 | 3 | 4 | def build_nonlocal_block(cfg): 5 | """ Build nonlocal block 6 | 7 | Args: 8 | """ 9 | assert isinstance(cfg, dict) 10 | cfg_ = cfg.copy() 11 | return NonLocalModule(**cfg_) 12 | -------------------------------------------------------------------------------- /mmaction/models/tenons/segmental_consensuses/__init__.py: -------------------------------------------------------------------------------- 1 | from .simple_consensus import SimpleConsensus 2 | from .stpp import parse_stage_config 3 | from .stpp import StructuredTemporalPyramidPooling 4 | 5 | __all__ = [ 6 | 'SimpleConsensus', 7 | 'StructuredTemporalPyramidPooling', 8 | 'parse_stage_config' 9 | ] 10 | -------------------------------------------------------------------------------- /mmaction/models/tenons/spatial_temporal_modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .simple_spatial_module import SimpleSpatialModule 2 | from .simple_spatial_temporal_module import SimpleSpatialTemporalModule 3 | from .avgfusion import AvgFusion 4 | 5 | __all__ = [ 6 | 'SimpleSpatialModule', 7 | 'SimpleSpatialTemporalModule', 8 | 'AvgFusion' 9 | ] 10 | -------------------------------------------------------------------------------- /tools/extract_backbone_weights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import collections 4 | 5 | model = torch.load(sys.argv[1]) 6 | 7 | weight = model['state_dict'] 8 | 9 | out = collections.OrderedDict() 10 | for k, v in weight.items(): 11 | name = k.replace('backbone.', '').replace('cls_head.', '') 12 | out[name] = v.cpu() 13 | print(name) 14 | 15 | torch.save(out, sys.argv[2]) 16 | -------------------------------------------------------------------------------- /mmaction/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .rawframes_dataset import RawFramesDataset 2 | from .utils import get_untrimmed_dataset, get_trimmed_dataset 3 | from .loader import GroupSampler, DistributedGroupSampler, build_dataloader 4 | 5 | __all__ = [ 6 | 'RawFramesDataset', 7 | 'get_trimmed_dataset', 
'get_untrimmed_dataset',
 8 |     'GroupSampler', 'DistributedGroupSampler', 'build_dataloader'
 9 | ]
10 | 
--------------------------------------------------------------------------------
/INSTALL.md:
--------------------------------------------------------------------------------
 1 | # Installation
 2 | 
 3 | ```shell
 4 | git clone https://github.com/decisionforce/TPN.git
 5 | ```
 6 | 
 7 | ## Requirements
 8 | 
 9 | - Linux
10 | - Python 3.5+
11 | - PyTorch 1.0+
12 | - CUDA 9.0+
13 | - NCCL 2+
14 | - GCC 4.9+
15 | - mmcv 0.2.10
16 | 
17 | ## Install MMAction
18 | (a) Install Cython
19 | ```shell
20 | pip install cython
21 | ```
22 | (b) Install mmaction
23 | ```shell
24 | python setup.py develop
25 | ```
26 | 
27 | 
--------------------------------------------------------------------------------
/mmaction/losses/__init__.py:
--------------------------------------------------------------------------------
 1 | from .losses import (
 2 |     weighted_nll_loss, weighted_cross_entropy, weighted_binary_cross_entropy,
 3 |     weighted_smoothl1, accuracy,
 4 |     weighted_multilabel_binary_cross_entropy,
 5 |     multilabel_accuracy)
 6 | 
 7 | __all__ = [
 8 |     'weighted_nll_loss', 'weighted_cross_entropy',
 9 |     'weighted_binary_cross_entropy',
10 |     'weighted_smoothl1', 'accuracy',
11 |     'weighted_multilabel_binary_cross_entropy',
12 |     'multilabel_accuracy',
13 | 
14 | ]
15 | 
--------------------------------------------------------------------------------
/mmaction/README.md:
--------------------------------------------------------------------------------
 1 | # mmaction
 2 | 
 3 | This code is based on [MMAction](https://github.com/open-mmlab/mmaction), which supports modular design and high efficiency. Our TPN will be merged into the latest MMAction in the future.
 4 | 
 5 | Here we briefly introduce the structure of this codebase:
 6 | 
 7 | - `apis`: contains the launcher of the whole codebase and the initializer of the distributed training environment.
 8 | - `core`: contains multiple hooks for evaluation, e.g. calculating the Top-1/Top-5 accuracy.
 9 | - `datasets`: contains `rawframes_dataset` and the transforms used for training.
10 | - `losses`: contains several kinds of cross-entropy loss.
11 | - `models`: contains the recognizers and various submodules of the network, e.g. *backbone*, *neck*, and *head*, under `models/tenons`.
12 | 
13 | Such modular design helps us conduct experiments with different modules quickly and easily; a minimal sketch of how the pieces fit together is given below.
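The sketch below is not part of the codebase; it only illustrates how a new module is registered and how a recognizer is built from a config, following the pattern of `tools/train_recognizer.py` and `models/builder.py`. `MyClsHead` and the chosen config path are purely illustrative assumptions.

```python
# A minimal sketch, assuming the mmaction package of this repo is installed.
# MyClsHead and the config path are illustrative, not part of the codebase.
import torch.nn as nn
from mmcv import Config

from mmaction.models import HEADS, build_recognizer


@HEADS.register_module
class MyClsHead(nn.Module):
    """Hypothetical head; once registered, type='MyClsHead' becomes usable in configs."""

    def __init__(self, in_channels=2048, num_classes=174):
        super(MyClsHead, self).__init__()
        self.fc_cls = nn.Linear(in_channels, num_classes)

    def init_weights(self):
        nn.init.normal_(self.fc_cls.weight, 0, 0.01)
        nn.init.constant_(self.fc_cls.bias, 0)

    def forward(self, x):
        return self.fc_cls(x.view(x.size(0), -1))


# Each sub-dict of cfg.model is looked up in its registry by the 'type' key and
# instantiated with the remaining keys as keyword arguments (see models/builder.py).
cfg = Config.fromfile('config_files/sthv2/tsm_tpn.py')
model = build_recognizer(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
```

Backbones, necks, spatial-temporal modules, and consensus modules are registered the same way through the corresponding registries in `models/registry.py` and built via `models/builder.py`.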
14 | -------------------------------------------------------------------------------- /mmaction/models/tenons/spatial_temporal_modules/simple_spatial_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ...registry import SPATIAL_TEMPORAL_MODULES 5 | 6 | 7 | @SPATIAL_TEMPORAL_MODULES.register_module 8 | class SimpleSpatialModule(nn.Module): 9 | def __init__(self, spatial_type='avg', spatial_size=7): 10 | super(SimpleSpatialModule, self).__init__() 11 | 12 | assert spatial_type in ['avg'] 13 | self.spatial_type = spatial_type 14 | 15 | self.spatial_size = spatial_size if not isinstance(spatial_size, int) else (spatial_size, spatial_size) 16 | 17 | if self.spatial_type == 'avg': 18 | self.op = nn.AvgPool2d(self.spatial_size, stride=1, padding=0) 19 | 20 | def init_weights(self): 21 | pass 22 | 23 | def forward(self, input): 24 | return self.op(input) 25 | -------------------------------------------------------------------------------- /mmaction/models/tenons/spatial_temporal_modules/simple_spatial_temporal_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ...registry import SPATIAL_TEMPORAL_MODULES 5 | 6 | 7 | @SPATIAL_TEMPORAL_MODULES.register_module 8 | class SimpleSpatialTemporalModule(nn.Module): 9 | def __init__(self, spatial_type='avg', spatial_size=7, temporal_size=1): 10 | super(SimpleSpatialTemporalModule, self).__init__() 11 | 12 | assert spatial_type in ['avg'] 13 | self.spatial_type = spatial_type 14 | 15 | self.spatial_size = spatial_size if not isinstance(spatial_size, int) else (spatial_size, spatial_size) 16 | self.temporal_size = temporal_size 17 | self.pool_size = (self.temporal_size,) + self.spatial_size 18 | 19 | if self.spatial_type == 'avg': 20 | self.op = nn.AvgPool3d(self.pool_size, stride=1, padding=0) 21 | 22 | def init_weights(self): 23 | pass 24 | 25 | def forward(self, input): 26 | return self.op(input) 27 | -------------------------------------------------------------------------------- /mmaction/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tenons.backbones import * 2 | from .tenons.spatial_temporal_modules import * 3 | from .tenons.segmental_consensuses import * 4 | from .tenons.cls_heads import * 5 | from .recognizers import * 6 | from .tenons.necks import * 7 | 8 | from .registry import (BACKBONES, SPATIAL_TEMPORAL_MODULES, SEGMENTAL_CONSENSUSES, HEADS, 9 | RECOGNIZERS, LOCALIZERS, DETECTORS, ARCHITECTURES, 10 | NECKS, ROI_EXTRACTORS) 11 | from .builder import (build_backbone, build_spatial_temporal_module, build_segmental_consensus, 12 | build_head, build_recognizer, build_detector, 13 | build_localizer, build_architecture, 14 | build_neck, build_roi_extractor) 15 | 16 | __all__ = [ 17 | 'BACKBONES', 'SPATIAL_TEMPORAL_MODULES', 'SEGMENTAL_CONSENSUSES', 'HEADS', 18 | 'RECOGNIZERS', 'LOCALIZERS', 'DETECTORS', 'ARCHITECTURES', 19 | 'NECKS', 'ROI_EXTRACTORS', 20 | 'build_backbone', 'build_spatial_temporal_module', 'build_segmental_consensus', 21 | 'build_head', 'build_recognizer', 'build_detector', 22 | 'build_localizer', 'build_architecture', 23 | 'build_neck', 'build_roi_extractor' 24 | ] 25 | -------------------------------------------------------------------------------- /mmaction/models/recognizers/base.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | from abc import ABCMeta, abstractmethod 3 | 4 | import torch.nn as nn 5 | 6 | 7 | class BaseRecognizer(nn.Module): 8 | """Base class for recognizers""" 9 | 10 | __metaclass__ = ABCMeta 11 | 12 | def __init__(self): 13 | super(BaseRecognizer, self).__init__() 14 | 15 | @property 16 | def with_tenon_list(self): 17 | return hasattr(self, 'tenon_list') and self.tenon_list is not None 18 | 19 | @property 20 | def with_cls(self): 21 | return hasattr(self, 'cls_head') and self.cls_head is not None 22 | 23 | @abstractmethod 24 | def forward_train(self, num_modalities, **kwargs): 25 | pass 26 | 27 | @abstractmethod 28 | def forward_test(self, num_modalities, **kwargs): 29 | pass 30 | 31 | def init_weights(self, pretrained=None): 32 | if pretrained is not None: 33 | logger = logging.getLogger() 34 | logger.info("load model from: {}".format(pretrained)) 35 | 36 | def forward(self, num_modalities, img_meta, return_loss=True, **kwargs): 37 | num_modalities = int(num_modalities[0]) 38 | if return_loss: 39 | return self.forward_train(num_modalities, img_meta, **kwargs) 40 | else: 41 | return self.forward_test(num_modalities, img_meta, **kwargs) 42 | -------------------------------------------------------------------------------- /mmaction/core/evaluation/accuracy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import confusion_matrix 3 | 4 | 5 | def softmax(x, dim=1): 6 | """Compute softmax values for each sets of scores in x.""" 7 | e_x = np.exp(x - np.max(x, axis=dim, keepdims=True)) 8 | return e_x / e_x.sum(axis=dim, keepdims=True) 9 | 10 | 11 | def mean_class_accuracy(scores, labels): 12 | pred = np.argmax(scores, axis=1) 13 | cf = confusion_matrix(labels, pred).astype(float) 14 | 15 | cls_cnt = cf.sum(axis=1) 16 | cls_hit = np.diag(cf) 17 | 18 | return np.mean(cls_hit / cls_cnt) 19 | 20 | 21 | def non_mean_class_accuracy(scores, labels): 22 | pred = np.argmax(scores, axis=1) 23 | cf = confusion_matrix(labels, pred).astype(float) 24 | 25 | cls_cnt = cf.sum(axis=1) 26 | cls_hit = np.diag(cf) 27 | 28 | return cls_hit / cls_cnt 29 | 30 | 31 | def top_k_acc(score, lb_set, k=3): 32 | idx = np.argsort(score)[-k:] 33 | return len(lb_set.intersection(idx)), len(lb_set) 34 | 35 | 36 | def top_k_hit(score, lb_set, k=3): 37 | idx = np.argsort(score)[-k:] 38 | return len(lb_set.intersection(idx)) > 0, 1 39 | 40 | 41 | def top_k_accuracy(scores, labels, k=(1,)): 42 | res = [] 43 | for kk in k: 44 | hits = [] 45 | for x, y in zip(scores, labels): 46 | y = [y] if isinstance(y, int) else y 47 | hits.append(top_k_hit(x, set(y), k=kk)[0]) 48 | res.append(np.mean(hits)) 49 | return res 50 | -------------------------------------------------------------------------------- /mmaction/models/tenons/segmental_consensuses/simple_consensus.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ...registry import SEGMENTAL_CONSENSUSES 5 | 6 | 7 | class _SimpleConsensus(torch.autograd.Function): 8 | """Simplest segmental consensus module""" 9 | 10 | def __init__(self, 11 | consensus_type='avg', 12 | dim=1): 13 | super(_SimpleConsensus, self).__init__() 14 | 15 | assert consensus_type in ['avg'] 16 | self.consensus_type = consensus_type 17 | self.dim = dim 18 | self.shape = None 19 | 20 | def forward(self, x): 21 | self.shape = 
x.size() 22 | if self.consensus_type == 'avg': 23 | output = x.mean(dim=self.dim, keepdim=True) 24 | else: 25 | output = None 26 | return output 27 | 28 | def backward(self, grad_output): 29 | if self.consensus_type == 'avg': 30 | grad_in = grad_output.expand(self.shape) / float(self.shape[self.dim]) 31 | else: 32 | grad_in = None 33 | return grad_in 34 | 35 | 36 | @SEGMENTAL_CONSENSUSES.register_module 37 | class SimpleConsensus(nn.Module): 38 | def __init__(self, consensus_type, dim=1): 39 | super(SimpleConsensus, self).__init__() 40 | 41 | assert consensus_type in ['avg'] 42 | self.consensus_type = consensus_type 43 | self.dim = dim 44 | 45 | def init_weights(self): 46 | pass 47 | 48 | def forward(self, input): 49 | return _SimpleConsensus(self.consensus_type, self.dim)(input) 50 | -------------------------------------------------------------------------------- /mmaction/models/registry.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class Registry(object): 5 | 6 | def __init__(self, name): 7 | self._name = name 8 | self._module_dict = dict() 9 | 10 | @property 11 | def name(self): 12 | return self._name 13 | 14 | @property 15 | def module_dict(self): 16 | return self._module_dict 17 | 18 | def _register_module(self, module_class): 19 | """Register a module 20 | 21 | Args: 22 | module (:obj:`nn.Module`): Module to be registered. 23 | """ 24 | if not issubclass(module_class, nn.Module): 25 | raise TypeError( 26 | 'module must be a child of nn.Module, but got {}'.format( 27 | module_class)) 28 | module_name = module_class.__name__ 29 | if module_name in self._module_dict: 30 | raise KeyError('{} is already registered in {}'.format( 31 | module_name, self.name)) 32 | self._module_dict[module_name] = module_class 33 | 34 | def register_module(self, cls): 35 | self._register_module(cls) 36 | return cls 37 | 38 | 39 | BACKBONES = Registry('backbone') 40 | FLOWNETS = Registry('flownet') 41 | SPATIAL_TEMPORAL_MODULES = Registry('spatial_temporal_module') 42 | SEGMENTAL_CONSENSUSES = Registry('segmental_consensus') 43 | HEADS = Registry('head') 44 | RECOGNIZERS = Registry('recognizer') 45 | LOCALIZERS = Registry('localizer') 46 | DETECTORS = Registry('detector') 47 | ARCHITECTURES = Registry('architecture') 48 | NECKS = Registry('neck') 49 | ROI_EXTRACTORS = Registry('roi_extractor') 50 | -------------------------------------------------------------------------------- /mmaction/utils/misc.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import numpy as np 3 | import mmcv 4 | 5 | 6 | def rsetattr(obj, attr, val): 7 | ''' 8 | See: 9 | https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects 10 | ''' 11 | pre, _, post = attr.rpartition('.') 12 | return setattr(rgetattr(obj, pre) if pre else obj, post, val) 13 | 14 | 15 | def rgetattr(obj, attr, *args): 16 | def _getattr(obj, attr): 17 | return getattr(obj, attr, *args) 18 | 19 | return functools.reduce(_getattr, [obj] + attr.split('.')) 20 | 21 | 22 | def rhasattr(obj, attr, *args): 23 | def _hasattr(obj, attr): 24 | if hasattr(obj, attr): 25 | return getattr(obj, attr) 26 | else: 27 | return None 28 | 29 | return functools.reduce(_hasattr, [obj] + attr.split('.')) is not None 30 | 31 | 32 | def tensor2video_snaps(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True): 33 | num_videos = tensor.size(0) 34 | num_frames = tensor.size(2) 35 | mean = np.array(mean, dtype=np.float32) 36 | std = 
np.array(std, dtype=np.float32)
37 |     video_snaps = []
38 |     for vid_id in range(num_videos):
39 |         img = tensor[vid_id, :, num_frames //
40 |                      2, ...].cpu().numpy().transpose(1, 2, 0)
41 |         img = mmcv.imdenormalize(
42 |             img, mean, std, to_bgr=to_rgb).astype(np.uint8)
43 |         video_snaps.append(np.ascontiguousarray(img))
44 |     return video_snaps
45 | 
46 | 
47 | def multi_apply(func, *args, **kwargs):
48 |     pfunc = functools.partial(func, **kwargs) if kwargs else func
49 |     map_results = map(pfunc, *args)
50 |     return tuple(map(list, zip(*map_results)))
51 | 
--------------------------------------------------------------------------------
/docs/assets/font.css:
--------------------------------------------------------------------------------
 1 | /* Homepage Font */
 2 | 
 3 | /* latin-ext */
 4 | @font-face {
 5 |   font-family: 'Lato';
 6 |   font-style: normal;
 7 |   font-weight: 400;
 8 |   src: local('Lato Regular'), local('Lato-Regular'), url(https://fonts.gstatic.com/s/lato/v16/S6uyw4BMUTPHjxAwXjeu.woff2) format('woff2');
 9 |   unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF;
10 | }
11 | 
12 | /* latin */
13 | @font-face {
14 |   font-family: 'Lato';
15 |   font-style: normal;
16 |   font-weight: 400;
17 |   src: local('Lato Regular'), local('Lato-Regular'), url(https://fonts.gstatic.com/s/lato/v16/S6uyw4BMUTPHjx4wXg.woff2) format('woff2');
18 |   unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD;
19 | }
20 | 
21 | /* latin-ext */
22 | @font-face {
23 |   font-family: 'Lato';
24 |   font-style: normal;
25 |   font-weight: 700;
26 |   src: local('Lato Bold'), local('Lato-Bold'), url(https://fonts.gstatic.com/s/lato/v16/S6u9w4BMUTPHh6UVSwaPGR_p.woff2) format('woff2');
27 |   unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF;
28 | }
29 | 
30 | /* latin */
31 | @font-face {
32 |   font-family: 'Lato';
33 |   font-style: normal;
34 |   font-weight: 700;
35 |   src: local('Lato Bold'), local('Lato-Bold'), url(https://fonts.gstatic.com/s/lato/v16/S6u9w4BMUTPHh6UVSwiPGQ.woff2) format('woff2');
36 |   unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD;
37 | }
38 | 
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
 1 | ## Data Preparation
 2 | 
 3 | ### Notes on Video Data format
 4 | Since the original VideoDataloader of MMAction requires [decord](https://github.com/zhreshold/decord) for efficient video loading, which is non-trivial to compile, this repo only supports the **raw frame** format of videos. Therefore, you have to extract frames from the raw videos first. We will look into other libraries and support a VideoLoader soon.
 5 | 
 6 | ### Supported datasets
 7 | The `rawframe_dataset` loads data in a general manner: you prepare a `.txt` annotation file in which each line contains the directory path of the extracted frames, the total number of frames of that video, and the ground-truth label. After that, specify the `data_root` and `image_tmpl` in the config files, e.g. as in the abridged config snippet below.
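For instance, the dataset settings of a config such as `config_files/sthv1/tsm_baseline.py` (shown in full later in this listing) tie the annotation file, the frame root, and the frame-name template together roughly as follows; this is an abridged excerpt, and the `data_root` value is only a placeholder for wherever your extracted frames live:

```python
# Abridged dataset section of a config (cf. config_files/sthv1/tsm_baseline.py).
# The data_root path is a placeholder; point it at your own frame directory.
dataset_type = 'RawFramesDataset'
data_root = 'data/sthv1/rawframes/'   # assumed location of the extracted frames

data = dict(
    videos_per_gpu=8,
    workers_per_gpu=8,
    train=dict(
        type=dataset_type,
        ann_file='data/sthv1/train_videofolder.txt',  # the .txt annotation list
        img_prefix=data_root,
        image_tmpl='{:05d}.jpg',      # frame files named like 00001.jpg, 00002.jpg, ...
        num_segments=8,
        modality='RGB'))
```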
The annotation file itself looks like the sample below:
 8 | 
 9 | ```bash
10 | shot_put/c5-PBp04AQI 299 298
11 | marching/5OEnoefcO1Y 299 192
12 | dancing_ballet/pR1jxLvjcgU 249 84
13 | motorcycling/0dC3o90WYHs 299 199
14 | hoverboarding/RVkof6bxvg0 278 157
15 | playing_piano/H3JzOkvTrJk 297 241
16 | ```
17 | Such a general loader should also make it easy to experiment with other datasets, e.g. UCF101 or a custom dataset.
18 | 
19 | ### Prepare annotations
20 | 
21 | - [Kinetics400](https://deepmind.com/research/open-source/open-source-datasets/kinetics/) contains ~240k training videos and ~19k validation videos. See the [guide](https://github.com/open-mmlab/mmaction/tree/master/data_tools/kinetics400/PREPARING_KINETICS400.md) of the original MMAction to generate annotations.
22 | - [Something-Something](https://github.com/TwentyBN) has two versions, both of which you have to apply for on their [website](https://20bn.com/datasets/something-something). See the [guide](https://github.com/mit-han-lab/temporal-shift-module/tree/master/tools) of TSM to generate annotations.
23 | 
24 | We thank the original [MMAction](https://github.com/open-mmlab/mmaction) and [TSM](https://github.com/mit-han-lab/temporal-shift-module) repos for kindly providing the preprocessing scripts.
25 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # cython generated cpp 107 | mmaction/version.py 108 | .vscode 109 | .idea 110 | -------------------------------------------------------------------------------- /mmaction/models/tenons/utils/norm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | norm_cfg = { 4 | # format: layer_type: (abbreviation, module) 5 | 'BN': ('bn', nn.BatchNorm2d), 6 | 'SyncBN': ('bn', None), 7 | 'GN': ('gn', nn.GroupNorm), 8 | # and potentially 'SN' 9 | } 10 | 11 | 12 | def build_norm_layer(cfg, num_features, postfix=''): 13 | """ Build normalization layer 14 | Args: 15 | cfg (dict): cfg should contain: 16 | type (str): identify norm layer type. 17 | layer args: args needed to instantiate a norm layer. 18 | frozen (bool): [optional] whether stop gradient updates 19 | of norm layer, it is helpful to set frozen mode 20 | in backbone's norms. 21 | num_features (int): number of channels from input 22 | postfix (int, str): appended into norm abbreation to 23 | create named layer. 
24 | Returns: 25 | name (str): abbreation + postfix 26 | layer (nn.Module): created norm layer 27 | """ 28 | assert isinstance(cfg, dict) and 'type' in cfg 29 | cfg_ = cfg.copy() 30 | 31 | layer_type = cfg_.pop('type') 32 | if layer_type not in norm_cfg: 33 | raise KeyError('Unrecognized norm type {}'.format(layer_type)) 34 | else: 35 | abbr, norm_layer = norm_cfg[layer_type] 36 | if norm_layer is None: 37 | raise NotImplementedError 38 | 39 | assert isinstance(postfix, (int, str)) 40 | name = abbr + str(postfix) 41 | 42 | frozen = cfg_.pop('frozen', False) 43 | cfg_.setdefault('eps', 1e-5) 44 | if layer_type != 'GN': 45 | layer = norm_layer(num_features, **cfg_) 46 | else: 47 | assert 'num_groups' in cfg_ 48 | layer = norm_layer(num_channels=num_features, **cfg_) 49 | 50 | if frozen: 51 | for param in layer.parameters(): 52 | param.requires_grad = False 53 | 54 | return name, layer 55 | -------------------------------------------------------------------------------- /mmaction/models/tenons/spatial_temporal_modules/avgfusion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ...registry import SPATIAL_TEMPORAL_MODULES 5 | 6 | 7 | @SPATIAL_TEMPORAL_MODULES.register_module 8 | class AvgFusion(nn.Module): 9 | def __init__(self, fusion_type='concat'): 10 | super(AvgFusion, self).__init__() 11 | assert fusion_type in ['add', 'avg', 'concat', 'concatadd', 'concatavg'] 12 | self.fusion_type = fusion_type 13 | 14 | def init_weights(self): 15 | pass 16 | 17 | def forward(self, input): 18 | assert (isinstance(input, tuple)) 19 | after_avgpool = [F.adaptive_avg_pool3d(each, 1) for each in input] 20 | 21 | if self.fusion_type == 'add': 22 | out = torch.sum(torch.cat(after_avgpool, -1), -1, keepdim=True) 23 | 24 | elif self.fusion_type == 'avg': 25 | out = torch.mean(torch.cat(after_avgpool, -1), -1, keepdim=True) 26 | 27 | elif self.fusion_type == 'concat': 28 | out = torch.cat(after_avgpool, 1) 29 | 30 | elif self.fusion_type == 'concatadd': 31 | out_first = torch.cat(after_avgpool[:-1], 1) 32 | out = torch.sum(torch.cat([out_first, after_avgpool[-1]], -1), -1, keepdim=True) 33 | elif self.fusion_type == 'concatavg': 34 | out_first = torch.cat(after_avgpool[:-1], 1) 35 | out = torch.mean(torch.cat([out_first, after_avgpool[-1]], -1), -1, keepdim=True) 36 | else: 37 | raise ValueError 38 | 39 | return out 40 | 41 | 42 | def main(): 43 | res2 = torch.FloatTensor(8, 512, 8, 56, 56).cuda() 44 | res3 = torch.FloatTensor(8, 512, 8, 28, 28).cuda() 45 | res4 = torch.FloatTensor(8, 512, 8, 14, 14).cuda() 46 | res5 = torch.FloatTensor(8, 512, 8, 7, 7).cuda() 47 | feature = tuple([res2, res3, res4, res5]) 48 | model = AvgFusion(fusion_type='add').cuda() 49 | out = model(feature) 50 | print(out.shape) 51 | 52 | 53 | if __name__ == '__main__': 54 | main() 55 | -------------------------------------------------------------------------------- /mmaction/core/utils/dist_utils.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch.distributed as dist 4 | from torch._utils import (_flatten_dense_tensors, _unflatten_dense_tensors, 5 | _take_tensors) 6 | from mmcv.runner import OptimizerHook 7 | 8 | 9 | def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1): 10 | if bucket_size_mb > 0: 11 | bucket_size_bytes = bucket_size_mb * 1024 * 1024 12 | buckets = _take_tensors(tensors, bucket_size_bytes) 13 | 
else: 14 | buckets = OrderedDict() 15 | for tensor in tensors: 16 | tp = tensor.type() 17 | if tp not in buckets: 18 | buckets[tp] = [] 19 | buckets[tp].append(tensor) 20 | buckets = buckets.values() 21 | 22 | for bucket in buckets: 23 | flat_tensors = _flatten_dense_tensors(bucket) 24 | dist.all_reduce(flat_tensors) 25 | flat_tensors.div_(world_size) 26 | for tensor, synced in zip( 27 | bucket, _unflatten_dense_tensors(flat_tensors, bucket)): 28 | tensor.copy_(synced) 29 | 30 | 31 | def allreduce_grads(model, coalesce=True, bucket_size_mb=-1): 32 | grads = [ 33 | param.grad.data for param in model.parameters() 34 | if param.requires_grad and param.grad is not None 35 | ] 36 | world_size = dist.get_world_size() 37 | if coalesce: 38 | _allreduce_coalesced(grads, world_size, bucket_size_mb) 39 | else: 40 | for tensor in grads: 41 | dist.all_reduce(tensor.div_(world_size)) 42 | 43 | 44 | class DistOptimizerHook(OptimizerHook): 45 | 46 | def __init__(self, grad_clip=None, coalesce=True, bucket_size_mb=-1): 47 | self.grad_clip = grad_clip 48 | self.coalesce = coalesce 49 | self.bucket_size_mb = bucket_size_mb 50 | 51 | def after_train_iter(self, runner): 52 | runner.optimizer.zero_grad() 53 | runner.outputs['loss'].backward() 54 | allreduce_grads(runner.model, self.coalesce, self.bucket_size_mb) 55 | if self.grad_clip is not None: 56 | self.clip_grads(runner.model.parameters()) 57 | runner.optimizer.step() 58 | -------------------------------------------------------------------------------- /mmaction/apis/env.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | 5 | import numpy as np 6 | import torch 7 | import torch.distributed as dist 8 | import torch.multiprocessing as mp 9 | from mmcv.runner import get_dist_info 10 | import subprocess 11 | 12 | 13 | def init_dist(launcher, backend='nccl', **kwargs): 14 | if mp.get_start_method(allow_none=True) is None: 15 | mp.set_start_method('spawn') 16 | if launcher == 'pytorch': 17 | _init_dist_pytorch(backend, **kwargs) 18 | elif launcher == 'mpi': 19 | _init_dist_mpi(backend, **kwargs) 20 | elif launcher == 'slurm': 21 | _init_dist_slurm(backend, **kwargs) 22 | else: 23 | raise ValueError('Invalid launcher type: {}'.format(launcher)) 24 | 25 | 26 | def _init_dist_pytorch(backend, **kwargs): 27 | # TODO: use local_rank instead of rank % num_gpus 28 | rank = int(os.environ['RANK']) 29 | num_gpus = torch.cuda.device_count() 30 | torch.cuda.set_device(rank % num_gpus) 31 | dist.init_process_group(backend=backend, **kwargs) 32 | 33 | 34 | def _init_dist_mpi(backend, **kwargs): 35 | raise NotImplementedError 36 | 37 | 38 | def _init_dist_slurm(backend, port=12345, **kwargs): 39 | proc_id = int(os.environ['SLURM_PROCID']) 40 | ntasks = int(os.environ['SLURM_NTASKS']) 41 | node_list = os.environ['SLURM_NODELIST'] 42 | num_gpus = torch.cuda.device_count() 43 | torch.cuda.set_device(proc_id % num_gpus) 44 | addr = subprocess.getoutput( 45 | 'scontrol show hostname {} | head -n1'.format(node_list)) 46 | os.environ['MASTER_PORT'] = str(port) 47 | os.environ['MASTER_ADDR'] = addr 48 | os.environ['WORLD_SIZE'] = str(ntasks) 49 | os.environ['RANK'] = str(proc_id) 50 | dist.init_process_group(backend=backend) 51 | # raise NotImplementedError 52 | 53 | 54 | def set_random_seed(seed): 55 | random.seed(seed) 56 | np.random.seed(seed) 57 | torch.manual_seed(seed) 58 | torch.cuda.manual_seed_all(seed) 59 | 60 | 61 | def get_root_logger(log_level=logging.INFO): 62 | logger = 
logging.getLogger()
63 |     if not logger.hasHandlers():
64 |         logging.basicConfig(
65 |             format='%(asctime)s - %(levelname)s - %(message)s',
66 |             level=log_level)
67 |     rank, _ = get_dist_info()
68 |     if rank != 0:
69 |         logger.setLevel('ERROR')
70 |     return logger
71 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Temporal Pyramid Network for Action Recognition
 2 | 
 3 | ![image](./docs/figures/framework.png)
 4 | [[Paper](https://arxiv.org/pdf/2004.03548.pdf)]
 5 | [[Project Page](https://decisionforce.github.io/TPN/)]
 6 | 
 7 | 
 8 | ## License
 9 | The project is released under the [Apache 2.0 license](./LICENSE).
10 | 
11 | ## Model Zoo
12 | Results and reference models are available in the [model zoo](./MODELZOO.md).
13 | 
14 | ## Installation and Data Preparation
15 | Please refer to [INSTALL](INSTALL.md) for installation and [DATA](./data/README.md) for data preparation.
16 | 
17 | ## Get Started
18 | Please refer to [GETTING_STARTED](./tools/README.md) for detailed usage.
19 | 
20 | ## Quick Demo
21 | We provide `test_video.py` to run inference on a single video.
22 | Download the checkpoints, put them under `ckpt/`, and run:
23 | ```
24 | python ./test_video.py ${CONFIG_FILE} ${CHECKPOINT_FILE} --video_file ${VIDEO_NAME} --label_file ${LABEL_FILE} --rendered_output ${RENDERED_NAME}
25 | ```
26 | Arguments:
27 | - `--video_file`: Path of the demo video, default is `./demo/demo.mp4`
28 | - `--label_file`: The label file for the pretrained model, default is `demo/category.txt`
29 | - `--rendered_output`: The output file name. If specified, the script will render the output video with the predicted label name, default is `demo/demo_pred.webm`.
30 | 
31 | For example, we can run prediction on the demo video (download it [here](https://drive.google.com/open?id=14VYS8hGA5i1J70qBqrUqLiDxJq_FgXiW) and put it under `demo/`) by running:
32 | ```
33 | python ./test_video.py config_files/sthv2/tsm_tpn.py ckpt/sthv2_tpn.pth
34 | ```
35 | The rendered output video:
36 | 
37 | ![image](./demo/demo_pred.gif)
38 | 
39 | ## Acknowledgement
40 | We really appreciate the developers of [MMAction](https://github.com/open-mmlab/mmaction) for such a wonderful codebase. We also thank Yue Zhao for the insightful discussion.
41 | 
42 | ## Contact
43 | This repo is currently maintained by Ceyuan Yang ([@limbo0000](https://github.com/limbo0000)) and Yinghao Xu ([@justimyhxu](https://github.com/justimyhxu)).
44 | 45 | ## Bibtex 46 | ``` 47 | @inproceedings{yang2020tpn, 48 | title={Temporal Pyramid Network for Action Recognition}, 49 | author={Yang, Ceyuan and Xu, Yinghao and Shi, Jianping and Dai, Bo and Zhou, Bolei}, 50 | booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, 51 | year={2020}, 52 | } 53 | ``` 54 | -------------------------------------------------------------------------------- /mmaction/models/builder.py: -------------------------------------------------------------------------------- 1 | import mmcv 2 | from torch import nn 3 | 4 | from .registry import (BACKBONES, FLOWNETS, SPATIAL_TEMPORAL_MODULES, 5 | SEGMENTAL_CONSENSUSES, HEADS, 6 | RECOGNIZERS, DETECTORS, LOCALIZERS, ARCHITECTURES, 7 | NECKS, ROI_EXTRACTORS) 8 | 9 | 10 | def _build_module(cfg, registry, default_args): 11 | assert isinstance(cfg, dict) and 'type' in cfg 12 | assert isinstance(default_args, dict) or default_args is None 13 | args = cfg.copy() 14 | obj_type = args.pop('type') 15 | if mmcv.is_str(obj_type): 16 | if obj_type not in registry.module_dict: 17 | raise KeyError('{} is not in the {} registry'.format( 18 | obj_type, registry.name)) 19 | obj_type = registry.module_dict[obj_type] 20 | elif not isinstance(obj_type, type): 21 | raise TypeError('type must be a str or valid type, but got {}'.format( 22 | type(obj_type))) 23 | if default_args is not None: 24 | for name, value in default_args.items(): 25 | args.setdefault(name, value) 26 | return obj_type(**args) 27 | 28 | 29 | def build(cfg, registry, default_args=None): 30 | if isinstance(cfg, list): 31 | modules = [_build_module(cfg_, registry, default_args) for cfg_ in cfg] 32 | return nn.Sequential(*modules) 33 | else: 34 | return _build_module(cfg, registry, default_args) 35 | 36 | 37 | def build_backbone(cfg): 38 | return build(cfg, BACKBONES) 39 | 40 | 41 | def build_flownet(cfg): 42 | return build(cfg, FLOWNETS) 43 | 44 | 45 | def build_spatial_temporal_module(cfg): 46 | return build(cfg, SPATIAL_TEMPORAL_MODULES) 47 | 48 | 49 | def build_segmental_consensus(cfg): 50 | return build(cfg, SEGMENTAL_CONSENSUSES) 51 | 52 | 53 | def build_head(cfg): 54 | return build(cfg, HEADS) 55 | 56 | 57 | def build_recognizer(cfg, train_cfg=None, test_cfg=None): 58 | return build(cfg, RECOGNIZERS, 59 | dict(train_cfg=train_cfg, test_cfg=test_cfg)) 60 | 61 | 62 | def build_localizer(cfg, train_cfg=None, test_cfg=None): 63 | return build(cfg, LOCALIZERS, dict(train_cfg=train_cfg, test_cfg=test_cfg)) 64 | 65 | 66 | def build_detector(cfg, train_cfg=None, test_cfg=None): 67 | return build(cfg, DETECTORS, dict(train_cfg=train_cfg, test_cfg=test_cfg)) 68 | 69 | 70 | def build_architecture(cfg, train_cfg=None, test_cfg=None): 71 | return build(cfg, ARCHITECTURES, 72 | dict(train_cfg=train_cfg, test_cfg=test_cfg)) 73 | 74 | 75 | def build_neck(cfg): 76 | return build(cfg, NECKS) 77 | 78 | 79 | def build_roi_extractor(cfg): 80 | return build(cfg, ROI_EXTRACTORS) 81 | -------------------------------------------------------------------------------- /tools/train_recognizer.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import argparse 4 | from mmcv import Config 5 | 6 | from mmaction import __version__ 7 | from mmaction.datasets import get_trimmed_dataset 8 | from mmaction.apis import (train_network, init_dist, get_root_logger, 9 | set_random_seed) 10 | from mmaction.models import build_recognizer 11 | import torch 12 | 13 | 14 | def parse_args(): 15 | 
parser = argparse.ArgumentParser(description='Train an action recognizer') 16 | parser.add_argument('config', help='train config file path') 17 | parser.add_argument('--work_dir', help='the dir to save logs and models') 18 | parser.add_argument( 19 | '--resume_from', help='the checkpoint file to resume from') 20 | parser.add_argument( 21 | '--validate', 22 | action='store_true', 23 | help='whether to evaluate the checkpoint during training') 24 | parser.add_argument( 25 | '--gpus', 26 | type=int, 27 | default=1, 28 | help='number of gpus to use ' 29 | '(only applicable to non-distributed training)') 30 | parser.add_argument('--seed', type=int, default=None, help='random seed') 31 | parser.add_argument( 32 | '--launcher', 33 | choices=['none', 'pytorch', 'slurm', 'mpi'], 34 | default='none', 35 | help='job launcher') 36 | parser.add_argument('--local_rank', type=int, default=0) 37 | args = parser.parse_args() 38 | 39 | return args 40 | 41 | 42 | def main(): 43 | args = parse_args() 44 | 45 | cfg = Config.fromfile(args.config) 46 | # set cudnn_benchmark 47 | if cfg.get('cudnn_benchmark', False): 48 | torch.backends.cudnn.benchmark = True 49 | # update configs according to CLI args 50 | if args.work_dir is not None: 51 | cfg.work_dir = args.work_dir 52 | if args.resume_from is not None: 53 | cfg.resume_from = args.resume_from 54 | cfg.gpus = args.gpus 55 | if cfg.checkpoint_config is not None: 56 | # save mmaction version in checkpoints as meta data 57 | cfg.checkpoint_config.meta = dict( 58 | mmact_version=__version__, config=cfg.text) 59 | 60 | # init distributed env first, since logger depends on the dist info. 61 | if args.launcher == 'none': 62 | distributed = False 63 | else: 64 | distributed = True 65 | init_dist(args.launcher, **cfg.dist_params) 66 | 67 | # init logger before other steps 68 | logger = get_root_logger(cfg.log_level) 69 | logger.info('Distributed training: {}'.format(distributed)) 70 | 71 | # set random seeds 72 | if args.seed is not None: 73 | logger.info('Set random seed to {}'.format(args.seed)) 74 | set_random_seed(args.seed) 75 | 76 | model = build_recognizer( 77 | cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) 78 | 79 | train_dataset = get_trimmed_dataset(cfg.data.train) 80 | train_network( 81 | model, 82 | train_dataset, 83 | cfg, 84 | distributed=distributed, 85 | validate=args.validate, 86 | logger=logger) 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /mmaction/models/tenons/utils/conv_module.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import torch.nn as nn 4 | from mmcv.cnn import kaiming_init, constant_init 5 | 6 | from .norm import build_norm_layer 7 | 8 | 9 | class ConvModule(nn.Module): 10 | 11 | def __init__(self, 12 | in_channels, 13 | out_channels, 14 | kernel_size, 15 | stride=1, 16 | padding=0, 17 | dilation=1, 18 | groups=1, 19 | bias=True, 20 | normalize=None, 21 | activation='relu', 22 | inplace=True, 23 | activate_last=True): 24 | super(ConvModule, self).__init__() 25 | self.with_norm = normalize is not None 26 | self.with_activatation = activation is not None 27 | self.with_bias = bias 28 | self.activation = activation 29 | self.activate_last = activate_last 30 | 31 | if self.with_norm and self.with_bias: 32 | warnings.warn('ConvModule has norm and bias at the same time') 33 | 34 | self.conv = nn.Conv2d( 35 | in_channels, 36 | out_channels, 37 | kernel_size, 38 | stride, 39 
| padding, 40 | dilation, 41 | groups, 42 | bias=bias) 43 | 44 | self.in_channels = self.conv.in_channels 45 | self.out_channels = self.conv.out_channels 46 | self.kernel_size = self.conv.kernel_size 47 | self.stride = self.conv.stride 48 | self.padding = self.conv.padding 49 | self.dilation = self.conv.dilation 50 | self.transposed = self.conv.transposed 51 | self.output_padding = self.conv.output_padding 52 | self.groups = self.conv.groups 53 | 54 | if self.with_norm: 55 | norm_channels = out_channels if self.activate_last else in_channels 56 | self.norm_name, norm = build_norm_layer(normalize, norm_channels) 57 | self.add_module(self.norm_name, norm) 58 | 59 | if self.with_activatation: 60 | assert activation in ['relu'], 'Only ReLU supported.' 61 | if self.activation == 'relu': 62 | self.activate = nn.ReLU(inplace=inplace) 63 | 64 | # Default using msra init 65 | self.init_weights() 66 | 67 | @property 68 | def norm(self): 69 | return getattr(self, self.norm_name) 70 | 71 | def init_weights(self): 72 | nonlinearity = 'relu' if self.activation is None else self.activation 73 | kaiming_init(self.conv, nonlinearity=nonlinearity) 74 | if self.with_norm: 75 | constant_init(self.norm, 1, bias=0) 76 | 77 | def forward(self, x, activate=True, norm=True): 78 | if self.activate_last: 79 | x = self.conv(x) 80 | if norm and self.with_norm: 81 | x = self.norm(x) 82 | if activate and self.with_activatation: 83 | x = self.activate(x) 84 | else: 85 | if norm and self.with_norm: 86 | x = self.norm(x) 87 | if activate and self.with_activatation: 88 | x = self.activate(x) 89 | x = self.conv(x) 90 | return x 91 | -------------------------------------------------------------------------------- /mmaction/models/tenons/cls_heads/cls_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ...registry import HEADS 5 | 6 | 7 | @HEADS.register_module 8 | class ClsHead(nn.Module): 9 | """Simplest classification head""" 10 | 11 | def __init__(self, 12 | with_avg_pool=True, 13 | temporal_feature_size=1, 14 | spatial_feature_size=7, 15 | dropout_ratio=0.8, 16 | in_channels=2048, 17 | num_classes=101, 18 | fcn_testing=False, 19 | init_std=0.01): 20 | 21 | super(ClsHead, self).__init__() 22 | 23 | self.with_avg_pool = with_avg_pool 24 | self.dropout_ratio = dropout_ratio 25 | self.in_channels = in_channels 26 | self.dropout_ratio = dropout_ratio 27 | self.temporal_feature_size = temporal_feature_size 28 | self.spatial_feature_size = spatial_feature_size 29 | self.init_std = init_std 30 | self.fcn_testing = fcn_testing 31 | 32 | if self.dropout_ratio != 0: 33 | self.dropout = nn.Dropout(p=self.dropout_ratio) 34 | else: 35 | self.dropout = None 36 | # self.with_avg_pool = fcn_testing 37 | if self.with_avg_pool: 38 | self.avg_pool = nn.AvgPool3d((temporal_feature_size, spatial_feature_size, spatial_feature_size), (1, 1, 1), 39 | (0, 0, 0)) 40 | if self.fcn_testing: 41 | self.new_cls = None 42 | self.in_channels = in_channels 43 | self.num_classes = num_classes 44 | self.fc_cls = nn.Linear(in_channels, num_classes) 45 | 46 | def init_weights(self): 47 | nn.init.normal_(self.fc_cls.weight, 0, self.init_std) 48 | nn.init.constant_(self.fc_cls.bias, 0) 49 | 50 | def forward(self, x): 51 | if not self.fcn_testing: 52 | if x.ndimension() == 4: 53 | x = x.unsqueeze(2) 54 | assert x.shape[1] == self.in_channels 55 | assert x.shape[2] == self.temporal_feature_size 56 | assert x.shape[3] == 
self.spatial_feature_size 57 | assert x.shape[4] == self.spatial_feature_size 58 | if self.with_avg_pool: 59 | x = self.avg_pool(x) 60 | if self.dropout is not None: 61 | x = self.dropout(x) 62 | x = x.view(x.size(0), -1) 63 | cls_score = self.fc_cls(x) 64 | return cls_score 65 | else: 66 | if self.with_avg_pool: 67 | x = self.avg_pool(x) 68 | if self.new_cls is None: 69 | self.new_cls = nn.Conv3d(self.in_channels, self.num_classes, 1, 1, 0).cuda() 70 | self.new_cls.weight.copy_(self.fc_cls.weight.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)) 71 | self.new_cls.bias.copy_(self.fc_cls.bias) 72 | self.fc_cls = None 73 | class_map = self.new_cls(x) 74 | # return class_map.mean([2,3,4]) 75 | return class_map 76 | 77 | def loss(self, 78 | cls_score, 79 | labels): 80 | losses = dict() 81 | losses['loss_cls'] = F.cross_entropy(cls_score, labels) 82 | 83 | return losses 84 | -------------------------------------------------------------------------------- /mmaction/datasets/loader/build_loader.py: -------------------------------------------------------------------------------- 1 | # from functools import partial 2 | # 3 | # from mmcv.runner import get_dist_info 4 | # from mmcv.parallel import collate 5 | # from torch.utils.data import DataLoader 6 | # 7 | # from .sampler import GroupSampler, DistributedGroupSampler 8 | # 9 | # # https://github.com/pytorch/pytorch/issues/973 10 | # import resource 11 | # rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 12 | # resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) 13 | # 14 | # 15 | # def build_dataloader(dataset, 16 | # imgs_per_gpu, 17 | # workers_per_gpu, 18 | # num_gpus=1, 19 | # dist=True, 20 | # **kwargs): 21 | # if dist: 22 | # rank, world_size = get_dist_info() 23 | # sampler = DistributedGroupSampler(dataset, imgs_per_gpu, world_size, 24 | # rank) 25 | # batch_size = imgs_per_gpu 26 | # num_workers = workers_per_gpu 27 | # else: 28 | # if not kwargs.get('shuffle', True): 29 | # sampler = None 30 | # else: 31 | # sampler = GroupSampler(dataset, imgs_per_gpu) 32 | # batch_size = num_gpus * imgs_per_gpu 33 | # num_workers = num_gpus * workers_per_gpu 34 | # 35 | # data_loader = DataLoader( 36 | # dataset, 37 | # batch_size=batch_size, 38 | # sampler=sampler, 39 | # num_workers=num_workers, 40 | # collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu), 41 | # pin_memory=False, 42 | # **kwargs) 43 | # 44 | # return data_loader 45 | from functools import partial 46 | 47 | from mmcv.runner import get_dist_info 48 | from mmcv.parallel import collate 49 | from torch.utils.data import DataLoader 50 | 51 | from .sampler import GroupSampler, DistributedGroupSampler, DistributedSampler 52 | 53 | # https://github.com/pytorch/pytorch/issues/973 54 | import resource 55 | 56 | rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) 57 | resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) 58 | 59 | 60 | def build_dataloader(dataset, 61 | imgs_per_gpu, 62 | workers_per_gpu, 63 | num_gpus=1, 64 | dist=True, 65 | **kwargs): 66 | shuffle = kwargs.get('shuffle', True) 67 | if dist: 68 | rank, world_size = get_dist_info() 69 | if shuffle: 70 | sampler = DistributedGroupSampler(dataset, imgs_per_gpu, 71 | world_size, rank) 72 | else: 73 | sampler = DistributedSampler( 74 | dataset, imgs_per_gpu, world_size, rank, shuffle=False) 75 | batch_size = imgs_per_gpu 76 | num_workers = workers_per_gpu 77 | else: 78 | sampler = GroupSampler(dataset, imgs_per_gpu) if shuffle else None 79 | batch_size = num_gpus * imgs_per_gpu 80 | num_workers = num_gpus * 
workers_per_gpu 81 | 82 | data_loader = DataLoader( 83 | dataset, 84 | batch_size=batch_size, 85 | sampler=sampler, 86 | num_workers=num_workers, 87 | collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu), 88 | pin_memory=False, 89 | **kwargs) 90 | 91 | return data_loader 92 | -------------------------------------------------------------------------------- /mmaction/core/evaluation/eval_hooks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import logging 4 | import mmcv 5 | import time 6 | import torch 7 | import numpy as np 8 | import torch.distributed as dist 9 | from mmcv.runner import Hook, obj_from_dict 10 | from mmcv.parallel import scatter, collate 11 | from torch.utils.data import Dataset 12 | 13 | from mmaction import datasets 14 | from .accuracy import top_k_accuracy 15 | 16 | 17 | class DistEvalHook(Hook): 18 | def __init__(self, dataset, interval=1): 19 | if isinstance(dataset, Dataset): 20 | self.dataset = dataset 21 | elif isinstance(dataset, dict): 22 | self.dataset = obj_from_dict(dataset, datasets, 23 | {'test_mode': True}) 24 | else: 25 | raise TypeError( 26 | 'dataset must be a Dataset object or a dict, not {}'.format( 27 | type(dataset))) 28 | self.interval = interval 29 | 30 | def after_train_epoch(self, runner): 31 | if not self.every_n_epochs(runner, self.interval): 32 | return 33 | runner.model.eval() 34 | results = [None for _ in range(len(self.dataset))] 35 | if runner.rank == 0: 36 | prog_bar = mmcv.ProgressBar(len(self.dataset)) 37 | for idx in range(runner.rank, len(self.dataset), runner.world_size): 38 | data = self.dataset[idx] 39 | data_gpu = scatter( 40 | collate([data], samples_per_gpu=1), 41 | [torch.cuda.current_device()])[0] 42 | 43 | # compute output 44 | with torch.no_grad(): 45 | result = runner.model( 46 | return_loss=False, rescale=True, **data_gpu) 47 | results[idx] = result 48 | 49 | batch_size = runner.world_size 50 | if runner.rank == 0: 51 | for _ in range(batch_size): 52 | prog_bar.update() 53 | 54 | if runner.rank == 0: 55 | print('\n') 56 | dist.barrier() 57 | for i in range(1, runner.world_size): 58 | tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i)) 59 | tmp_results = mmcv.load(tmp_file) 60 | for idx in range(i, len(results), runner.world_size): 61 | results[idx] = tmp_results[idx] 62 | os.remove(tmp_file) 63 | self.evaluate(runner, results) 64 | else: 65 | tmp_file = osp.join(runner.work_dir, 66 | 'temp_{}.pkl'.format(runner.rank)) 67 | mmcv.dump(results, tmp_file) 68 | dist.barrier() 69 | dist.barrier() 70 | 71 | def evaluate(self): 72 | raise NotImplementedError 73 | 74 | 75 | class DistEvalTopKAccuracyHook(DistEvalHook): 76 | 77 | def __init__(self, 78 | dataset, 79 | k=(1,)): 80 | super(DistEvalTopKAccuracyHook, self).__init__(dataset) 81 | self.k = k 82 | 83 | def evaluate(self, runner, results): 84 | gt_labels = [] 85 | for i in range(len(self.dataset)): 86 | ann = self.dataset.get_ann_info(i) 87 | gt_labels.append(ann['label']) 88 | 89 | results = [res.squeeze() for res in results] 90 | top1, top5 = top_k_accuracy(results, gt_labels, k=self.k) 91 | runner.mode = 'val' 92 | runner.log_buffer.output['top1_acc'] = top1 93 | runner.log_buffer.output['top5_acc'] = top5 94 | runner.log_buffer.ready = True 95 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import time 4 | from 
setuptools import find_packages, setup 5 | 6 | 7 | def readme(): 8 | with open('README.md', encoding='utf-8') as f: 9 | content = f.read() 10 | return content 11 | 12 | 13 | MAJOR = 0 14 | MINOR = 1 15 | PATCH = 'rc0' 16 | SUFFIX = '' 17 | SHORT_VERSION = '{}.{}.{}{}'.format(MAJOR, MINOR, PATCH, SUFFIX) 18 | 19 | version_file = 'mmaction/version.py' 20 | 21 | 22 | def get_git_hash(): 23 | def _minimal_ext_cmd(cmd): 24 | # construct minimal environment 25 | env = {} 26 | for k in ['SYSTEMROOT', 'PATH', 'HOME']: 27 | v = os.environ.get(k) 28 | if v is not None: 29 | env[k] = v 30 | # LANGUAGE is used on win32 31 | env['LANGUAGE'] = 'C' 32 | env['LANG'] = 'C' 33 | env['LC_ALL'] = 'C' 34 | out = subprocess.Popen( 35 | cmd, stdout=subprocess.PIPE, env=env).communicate()[0] 36 | return out 37 | 38 | try: 39 | out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) 40 | sha = out.strip().decode('ascii') 41 | except OSError: 42 | sha = 'unknown' 43 | 44 | return sha 45 | 46 | 47 | def get_hash(): 48 | if os.path.exists('.git'): 49 | sha = get_git_hash()[:7] 50 | elif os.path.exists(version_file): 51 | try: 52 | from mmaction.version import __version__ 53 | sha = __version__.split('+')[-1] 54 | except ImportError: 55 | raise ImportError('Unable to get git version') 56 | else: 57 | sha = 'unknown' 58 | 59 | return sha 60 | 61 | 62 | def write_version_py(): 63 | content = """# GENERATED VERSION FILE 64 | # TIME: {} 65 | 66 | __version__ = '{}' 67 | short_version = '{}' 68 | """ 69 | sha = get_hash() 70 | VERSION = SHORT_VERSION + '+' + sha 71 | 72 | with open(version_file, 'w') as f: 73 | f.write(content.format(time.asctime(), VERSION, SHORT_VERSION)) 74 | 75 | 76 | def get_version(): 77 | with open(version_file, 'r') as f: 78 | exec(compile(f.read(), version_file, 'exec')) 79 | return locals()['__version__'] 80 | 81 | 82 | if __name__ == '__main__': 83 | write_version_py() 84 | setup( 85 | name='mmaction', 86 | version=get_version(), 87 | description='Open MMLab Action Toolbox', 88 | long_description=readme(), 89 | keywords='computer vision, action recognition', 90 | url='https://github.com/open-mmlab/mmaction', 91 | packages=find_packages(exclude=('configs', 'tools', 'demo')), 92 | package_data={'mmaction.ops': ['*/*.so']}, 93 | classifiers=[ 94 | 'Development Status :: 4 - Beta', 95 | 'License :: OSI Approved :: Apache Software License', 96 | 'Operating System :: OS Independent', 97 | 'Programming Language :: Python :: 2', 98 | 'Programming Language :: Python :: 2.7', 99 | 'Programming Language :: Python :: 3', 100 | 'Programming Language :: Python :: 3.4', 101 | 'Programming Language :: Python :: 3.5', 102 | 'Programming Language :: Python :: 3.6', 103 | ], 104 | license='Apache License 2.0', 105 | setup_requires=['pytest-runner'], 106 | tests_require=['pytest'], 107 | install_requires=[ 108 | 'mmcv', 'numpy', 'scipy', 'scikit-learn', 'terminaltables', 'lmdb', 'joblib' 109 | ], 110 | zip_safe=False) 111 | -------------------------------------------------------------------------------- /config_files/sthv1/tsm_baseline.py: -------------------------------------------------------------------------------- 1 | model = dict( 2 | type='TSN2D', 3 | backbone=dict( 4 | type='ResNet', 5 | pretrained='modelzoo://resnet50', 6 | nsegments=8, 7 | depth=50, 8 | out_indices=(3,), 9 | tsm=True, 10 | bn_eval=False, 11 | partial_bn=False), 12 | spatial_temporal_module=dict( 13 | type='SimpleSpatialModule', 14 | spatial_type='avg', 15 | spatial_size=7), 16 | segmental_consensus=dict( 17 | type='SimpleConsensus', 18 | 
consensus_type='avg'), 19 | cls_head=dict( 20 | type='ClsHead', 21 | with_avg_pool=False, 22 | temporal_feature_size=1, 23 | spatial_feature_size=1, 24 | dropout_ratio=0.5, 25 | in_channels=2048, 26 | num_classes=174)) 27 | train_cfg = None 28 | test_cfg = None 29 | # dataset settings 30 | dataset_type = 'RawFramesDataset' 31 | data_root = '' 32 | data_root_val = '' 33 | 34 | img_norm_cfg = dict( 35 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 36 | 37 | data = dict( 38 | videos_per_gpu=8, 39 | workers_per_gpu=8, 40 | train=dict( 41 | type=dataset_type, 42 | ann_file='data/sthv1/train_videofolder.txt', 43 | img_prefix=data_root, 44 | img_norm_cfg=img_norm_cfg, 45 | num_segments=8, 46 | new_length=1, 47 | new_step=1, 48 | random_shift=True, 49 | modality='RGB', 50 | image_tmpl='{:05d}.jpg', 51 | img_scale=256, 52 | input_size=224, 53 | flip_ratio=0.5, 54 | resize_keep_ratio=True, 55 | resize_crop=True, 56 | color_jitter=True, 57 | color_space_aug=True, 58 | oversample=None, 59 | max_distort=1, 60 | test_mode=False), 61 | val=dict( 62 | type=dataset_type, 63 | ann_file='data/sthv1/val_videofolder.txt', 64 | img_prefix=data_root_val, 65 | img_norm_cfg=img_norm_cfg, 66 | num_segments=8, 67 | new_length=1, 68 | new_step=1, 69 | random_shift=False, 70 | modality='RGB', 71 | image_tmpl='{:05d}.jpg', 72 | img_scale=256, 73 | input_size=224, 74 | flip_ratio=0, 75 | resize_keep_ratio=True, 76 | oversample=None, 77 | test_mode=False), 78 | test=dict( 79 | type=dataset_type, 80 | ann_file='data/sthv1/val_videofolder.txt', 81 | img_prefix=data_root_val, 82 | img_norm_cfg=img_norm_cfg, 83 | num_segments=16, 84 | new_length=1, 85 | new_step=1, 86 | random_shift=False, 87 | modality='RGB', 88 | image_tmpl='{:05d}.jpg', 89 | img_scale=256, 90 | input_size=256, 91 | flip_ratio=0, 92 | resize_keep_ratio=True, 93 | oversample="three_crop", 94 | test_mode=True)) 95 | # optimizer 96 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True) 97 | optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2)) 98 | # learning policy 99 | lr_config = dict( 100 | policy='step', 101 | step=[75, 125]) 102 | checkpoint_config = dict(interval=1) 103 | workflow = [('train', 1)] 104 | # yapf:disable 105 | log_config = dict( 106 | interval=20, 107 | hooks=[ 108 | dict(type='TextLoggerHook'), 109 | # dict(type='TensorboardLoggerHook') 110 | ]) 111 | # yapf:enable 112 | # runtime settings 113 | total_epochs = 150 114 | dist_params = dict(backend='nccl') 115 | log_level = 'INFO' 116 | load_from = None 117 | resume_from = None 118 | -------------------------------------------------------------------------------- /config_files/sthv2/tsm_baseline.py: -------------------------------------------------------------------------------- 1 | model = dict( 2 | type='TSN2D', 3 | backbone=dict( 4 | type='ResNet', 5 | pretrained='modelzoo://resnet50', 6 | nsegments=8, 7 | depth=50, 8 | out_indices=(3,), 9 | tsm=True, 10 | bn_eval=False, 11 | partial_bn=False), 12 | spatial_temporal_module=dict( 13 | type='SimpleSpatialModule', 14 | spatial_type='avg', 15 | spatial_size=7), 16 | segmental_consensus=dict( 17 | type='SimpleConsensus', 18 | consensus_type='avg'), 19 | cls_head=dict( 20 | type='ClsHead', 21 | with_avg_pool=False, 22 | temporal_feature_size=1, 23 | spatial_feature_size=1, 24 | dropout_ratio=0.5, 25 | in_channels=2048, 26 | num_classes=174)) 27 | train_cfg = None 28 | test_cfg = None 29 | # dataset settings 30 | dataset_type = 'RawFramesDataset' 31 | data_root = '' 32 
| data_root_val = '' 33 | 34 | img_norm_cfg = dict( 35 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 36 | 37 | data = dict( 38 | videos_per_gpu=8, 39 | workers_per_gpu=8, 40 | train=dict( 41 | type=dataset_type, 42 | ann_file='data/sthv2/train_videofolder.txt', 43 | img_prefix=data_root, 44 | img_norm_cfg=img_norm_cfg, 45 | num_segments=8, 46 | new_length=1, 47 | new_step=1, 48 | random_shift=True, 49 | modality='RGB', 50 | image_tmpl='img_{:05d}.jpg', 51 | img_scale=256, 52 | input_size=224, 53 | flip_ratio=0.5, 54 | resize_keep_ratio=True, 55 | resize_crop=True, 56 | color_jitter=True, 57 | color_space_aug=True, 58 | oversample=None, 59 | max_distort=1, 60 | test_mode=False), 61 | val=dict( 62 | type=dataset_type, 63 | ann_file='data/sthv2/val_videofolder.txt', 64 | img_prefix=data_root_val, 65 | img_norm_cfg=img_norm_cfg, 66 | num_segments=8, 67 | new_length=1, 68 | new_step=1, 69 | random_shift=False, 70 | modality='RGB', 71 | image_tmpl='img_{:05d}.jpg', 72 | img_scale=256, 73 | input_size=224, 74 | flip_ratio=0, 75 | resize_keep_ratio=True, 76 | oversample=None, 77 | test_mode=False), 78 | test=dict( 79 | type=dataset_type, 80 | ann_file='data/sthv2/val_videofolder.txt', 81 | img_prefix=data_root_val, 82 | img_norm_cfg=img_norm_cfg, 83 | num_segments=16, 84 | new_length=1, 85 | new_step=1, 86 | random_shift=False, 87 | modality='RGB', 88 | image_tmpl='img_{:05d}.jpg', 89 | img_scale=256, 90 | input_size=256, 91 | flip_ratio=0, 92 | resize_keep_ratio=True, 93 | oversample="three_crop", 94 | test_mode=True)) 95 | # optimizer 96 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True) 97 | optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2)) 98 | # learning policy 99 | lr_config = dict( 100 | policy='step', 101 | step=[75, 125]) 102 | checkpoint_config = dict(interval=1) 103 | workflow = [('train', 1)] 104 | # yapf:disable 105 | log_config = dict( 106 | interval=20, 107 | hooks=[ 108 | dict(type='TextLoggerHook'), 109 | # dict(type='TensorboardLoggerHook') 110 | ]) 111 | # yapf:enable 112 | # runtime settings 113 | total_epochs = 150 114 | dist_params = dict(backend='nccl') 115 | log_level = 'INFO' 116 | load_from = None 117 | resume_from = None 118 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | This directory provides basic tutorials for the usage of MMAction. 4 | 5 | After installation of codebase and preparation of data, you could use the given scripts for training/evaluating your models. 6 | 7 | ### Test a reference model 8 | Our codebase supports distributed and non-distributed evaluation mode for reference model. Actually, distributed testing is a little faster than non-distributed testing. 9 | ``` 10 | # non-distributed testing 11 | python tools/test_recognizer.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] {--gpus ${GPU_NUM}} --ignore_cache --fcn_testing 12 | 13 | # distributed testing 14 | ./tools/dist_test_recognizer.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --ignore_cache --fcn_testing 15 | ``` 16 | Optional arguments: 17 | - `--ignore_cache`: If specified, the results cache will be ignored. 18 | - `--fcn_testing`: If specified, spatially fully-convolutional testing is performed via 3 crops approximation. 
19 | - `--flip`: If specified, all frames are flipped before being fed into the model. 20 | 21 | **Important**: some of our models may require a machine with more than 24 GB of memory. 22 | 23 | Examples: 24 | Assume that you have already downloaded the checkpoints to the directory `ckpt/`. 25 | 26 | 1. Test the tpn_f8s8 model in non-distributed evaluation mode on 8 GPUs 27 | ``` 28 | python ./tools/test_recognizer.py config_files/kinetics400/tpn/r50f8s8.py ckpt/kinetics400_tpn_r50f8s8 --gpus 8 --out ckpt/kinetics400_tpn_r50f8s8.pkl --fcn_testing --ignore_cache 29 | ``` 30 | 2. Test the tpn_f8s8 model in distributed evaluation mode on 8 GPUs 31 | ```shell 32 | ./tools/dist_test_recognizer.sh config_files/kinetics400/tpn/r50f8s8.py ckpt/kinetics400_tpn_r50f8s8 8 --out ckpt/kinetics400_tpn_r50f8s8.pkl --fcn_testing --ignore_cache 33 | ``` 34 | 35 | ### Train a model 36 | 37 | Our codebase supports both distributed and non-distributed training. 38 | 39 | All outputs (log files and checkpoints) will be saved to the working directory, 40 | which is specified by `work_dir` in the config file. 41 | 42 | By default, the model is evaluated on the validation set after each epoch; you can change the evaluation interval by adding the `interval` argument to the training config. 43 | ```python 44 | evaluation = dict(interval=10)  # Evaluate the model every 10 epochs. 45 | ``` 46 | 47 | #### Train with a single GPU 48 | ``` 49 | python tools/train_recognizer.py ${CONFIG_FILE} 50 | ``` 51 | If you want to specify the working directory in the command, you can add the argument `--work_dir ${YOUR_WORK_DIR}`. 52 | 53 | #### Train with multiple GPUs 54 | ```shell 55 | ./tools/dist_train_recognizer.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments] 56 | ``` 57 | 58 | Optional arguments: 59 | - `--validate`: Perform evaluation after every epoch during training. 60 | - `--work_dir`: All outputs (log files and checkpoints) will be saved to the working directory. 61 | - `--resume_from`: Resume from a previous checkpoint file. 62 | 63 | Difference between `resume_from` and `load_from`: `resume_from` loads both the model weights and the optimizer state, and the epoch is also inherited from the specified checkpoint. It is typically used to resume training that was interrupted accidentally. `load_from` only loads the model weights, and training starts from epoch 0. It is typically used for finetuning. 64 | 65 | **Important**: The default learning rate in the config files is for 8 GPUs and 8 videos/GPU (batch size = 8*8 = 64). According to the Linear Scaling Rule, you should set the learning rate proportional to the batch size if you use a different number of GPUs or videos per GPU, e.g., lr=0.01 for 8 GPUs * 8 videos/GPU and lr=0.04 for 32 GPUs * 8 videos/GPU.
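For reference, here is a minimal sketch of how the Linear Scaling Rule above translates into a concrete learning rate. It is not part of the codebase; the helper `scaled_lr` is made up purely for illustration and only uses the numbers quoted above.
```python
# Minimal sketch of the Linear Scaling Rule described above (illustrative only, not part of the repo).
BASE_LR = 0.01            # default lr in the provided config files
BASE_BATCH_SIZE = 8 * 8   # 8 GPUs * 8 videos per GPU = 64

def scaled_lr(num_gpus, videos_per_gpu):
    """Scale the learning rate proportionally to the global batch size."""
    return BASE_LR * (num_gpus * videos_per_gpu) / BASE_BATCH_SIZE

print(scaled_lr(8, 8))    # 0.01 -> matches the default configs
print(scaled_lr(32, 8))   # 0.04 -> matches the 32-GPU example above
```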
66 | 67 | Here is the example of using 8 GPUs to train Kinetics400_r50_f8s8: 68 | ```shell 69 | ./tools/dist_train_recognizer.sh config_files/kinetics400/tpn/r50f8s8.py 8 --validate 70 | ``` 71 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r101f16s4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet101', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=16, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=16, 56 | new_step=4, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=16, 79 | new_step=4, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=16, 98 | new_step=4, 99 | random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 117 | checkpoint_config = dict(interval=1) 118 | workflow = [('train', 1)] 119 | # yapf:disable 120 | log_config = 
dict( 121 | interval=20, 122 | hooks=[ 123 | dict(type='TextLoggerHook'), 124 | # dict(type='TensorboardLoggerHook') 125 | ]) 126 | # yapf:enable 127 | # runtime settings 128 | total_epochs = 150 129 | dist_params = dict(backend='nccl') 130 | log_level = 'INFO' 131 | load_from = None 132 | resume_from = None 133 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r101f8s8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet101', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=False, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=8, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=8, 56 | new_step=8, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=8, 79 | new_step=8, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=8, 98 | new_step=8, 99 | random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 
117 | checkpoint_config = dict(interval=1) 118 | # workflow = [('train', 5), ('val', 1)] 119 | workflow = [('train', 1)] 120 | # yapf:disable 121 | log_config = dict( 122 | interval=20, 123 | hooks=[ 124 | dict(type='TextLoggerHook'), 125 | # dict(type='TensorboardLoggerHook') 126 | ]) 127 | # yapf:enable 128 | # runtime settings 129 | total_epochs = 150 130 | dist_params = dict(backend='nccl') 131 | log_level = 'INFO' 132 | load_from = None 133 | resume_from = None 134 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r50f8s8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet50', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=False, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=8, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=8, 56 | new_step=8, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=8, 79 | new_step=8, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=8, 98 | new_step=8, 99 | random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | 
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 117 | checkpoint_config = dict(interval=1) 118 | # workflow = [('train', 5), ('val', 1)] 119 | workflow = [('train', 1)] 120 | # yapf:disable 121 | log_config = dict( 122 | interval=20, 123 | hooks=[ 124 | dict(type='TextLoggerHook'), 125 | # dict(type='TensorboardLoggerHook') 126 | ]) 127 | # yapf:enable 128 | # runtime settings 129 | total_epochs = 150 130 | dist_params = dict(backend='nccl') 131 | log_level = 'INFO' 132 | load_from = None 133 | resume_from = None 134 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r101f32s2.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet101', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=32, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=32, 56 | new_step=2, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=32, 79 | new_step=2, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=32, 98 | new_step=2, 99 | random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | 
oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 117 | checkpoint_config = dict(interval=1) 118 | # workflow = [('train', 5), ('val', 1)] 119 | workflow = [('train', 1)] 120 | # yapf:disable 121 | log_config = dict( 122 | interval=20, 123 | hooks=[ 124 | dict(type='TextLoggerHook'), 125 | # dict(type='TensorboardLoggerHook') 126 | ]) 127 | # yapf:enable 128 | # runtime settings 129 | total_epochs = 150 130 | dist_params = dict(backend='nccl') 131 | log_level = 'INFO' 132 | load_from = None 133 | resume_from = None 134 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r50f16s4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet50', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=16, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=16, 56 | new_step=4, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=16, 79 | new_step=4, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=16, 98 | new_step=4, 99 | 
random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 117 | checkpoint_config = dict(interval=1) 118 | # workflow = [('train', 5), ('val', 1)] 119 | workflow = [('train', 1)] 120 | # yapf:disable 121 | log_config = dict( 122 | interval=20, 123 | hooks=[ 124 | dict(type='TextLoggerHook'), 125 | # dict(type='TensorboardLoggerHook') 126 | ]) 127 | # yapf:enable 128 | # runtime settings 129 | total_epochs = 150 130 | dist_params = dict(backend='nccl') 131 | log_level = 'INFO' 132 | load_from = None 133 | resume_from = None 134 | -------------------------------------------------------------------------------- /config_files/kinetics400/baseline/r50f32s2.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet50', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=[3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | spatial_temporal_module=dict( 22 | type='SimpleSpatialTemporalModule', 23 | spatial_type='avg', 24 | temporal_size=32, 25 | spatial_size=7), 26 | segmental_consensus=dict( 27 | type='SimpleConsensus', 28 | consensus_type='avg'), 29 | cls_head=dict( 30 | type='ClsHead', 31 | with_avg_pool=False, 32 | temporal_feature_size=1, 33 | spatial_feature_size=1, 34 | dropout_ratio=0.5, 35 | in_channels=2048, 36 | num_classes=400)) 37 | train_cfg = None 38 | test_cfg = None 39 | # dataset settings 40 | dataset_type = 'RawFramesDataset' 41 | data_root = '' 42 | data_root_val = '' 43 | img_norm_cfg = dict( 44 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 45 | data = dict( 46 | videos_per_gpu=8, 47 | workers_per_gpu=8, 48 | train=dict( 49 | type=dataset_type, 50 | ann_file='data/kinetics400/kinetics400_train_list_rawframes.txt', 51 | img_prefix=data_root, 52 | img_norm_cfg=img_norm_cfg, 53 | input_format="NCTHW", 54 | num_segments=1, 55 | new_length=32, 56 | new_step=2, 57 | random_shift=True, 58 | modality='RGB', 59 | image_tmpl='img_{:05d}.jpg', 60 | img_scale=256, 61 | resize_keep_ratio=True, 62 | input_size=224, 63 | flip_ratio=0.5, 64 | oversample=None, 65 | resize_crop=True, 66 | color_jitter=True, 67 | color_space_aug=True, 68 | max_distort=0, 69 | test_mode=False, 70 | ), 71 | val=dict( 72 | type=dataset_type, 73 | ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 74 | img_prefix=data_root_val, 75 | img_norm_cfg=img_norm_cfg, 76 | input_format="NCTHW", 77 | num_segments=1, 78 | new_length=32, 79 | new_step=2, 80 | random_shift=True, 81 | modality='RGB', 82 | image_tmpl='img_{:05d}.jpg', 83 | img_scale=256, 84 | input_size=224, 85 | flip_ratio=0, 86 | resize_keep_ratio=True, 87 | oversample=None, 88 | test_mode=False, 89 | ), 90 | test=dict( 91 | type=dataset_type, 92 | 
ann_file='data/kinetics400/kinetics400_val_list_rawframes.txt', 93 | img_prefix=data_root_val, 94 | img_norm_cfg=img_norm_cfg, 95 | input_format="NCTHW", 96 | num_segments=10, 97 | new_length=32, 98 | new_step=2, 99 | random_shift=True, 100 | modality='RGB', 101 | image_tmpl='img_{:05d}.jpg', 102 | img_scale=256, 103 | input_size=256, 104 | flip_ratio=0, 105 | resize_keep_ratio=True, 106 | oversample='three_crop', 107 | test_mode=True, 108 | )) 109 | # optimizer 110 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 111 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 112 | # learning policy 113 | lr_config = dict( 114 | policy='step', 115 | step=[75, 125]) 116 | 117 | checkpoint_config = dict(interval=1) 118 | # workflow = [('train', 5), ('val', 1)] 119 | workflow = [('train', 1)] 120 | # yapf:disable 121 | log_config = dict( 122 | interval=20, 123 | hooks=[ 124 | dict(type='TextLoggerHook'), 125 | # dict(type='TensorboardLoggerHook') 126 | ]) 127 | # yapf:enable 128 | # runtime settings 129 | total_epochs = 150 130 | dist_params = dict(backend='nccl') 131 | log_level = 'INFO' 132 | load_from = None 133 | resume_from = None 134 | -------------------------------------------------------------------------------- /docs/assets/style.css: -------------------------------------------------------------------------------- 1 | /* Homepage Style */ 2 | 3 | /* Body */ 4 | body { 5 | background: #e3e5e8; 6 | color: #ffffff; 7 | font-family: 'Lato', Verdana, Helvetica, sans-serif; 8 | font-weight: 300; 9 | font-size: 14pt; 10 | } 11 | 12 | /* Headings */ 13 | h1 { 14 | font-size: 30pt; 15 | } 16 | 17 | h2 { 18 | font-size: 22pt; 19 | } 20 | 21 | h3 { 22 | font-size: 14pt; 23 | } 24 | 25 | /* Hyperlinks */ 26 | a { 27 | text-decoration: none; 28 | } 29 | 30 | a:link { 31 | color: #1772d0; 32 | } 33 | 34 | a:visited { 35 | color: #1772d0; 36 | } 37 | 38 | a:active { 39 | color: red; 40 | } 41 | 42 | a:hover { 43 | color: #f09228; 44 | } 45 | 46 | pre { 47 | background: #fcfcfc; 48 | border: 0; 49 | font-size: 12pt; 50 | margin: 5pt auto; 51 | } 52 | 53 | /* Container */ 54 | .container { 55 | width: 768pt; 56 | min-height: 100pt; 57 | margin: 15pt auto; 58 | padding: 20pt; 59 | border: 1pt hidden #000; 60 | text-align: justify; 61 | color: #000000; 62 | background: #ffffff; 63 | } 64 | 65 | .container .title { 66 | text-align: center; 67 | font-size: 22pt; 68 | margin: 5pt auto; 69 | } 70 | 71 | .container .author { 72 | text-align: center; 73 | font-size: 16pt; 74 | margin: 20pt auto; 75 | } 76 | 77 | .container .institution { 78 | text-align: center; 79 | font-size: 16pt; 80 | margin: 20pt auto; 81 | } 82 | 83 | .container .link { 84 | text-align: center; 85 | font-size: 16pt; 86 | margin: 20pt auto; 87 | } 88 | 89 | .container .teaser { 90 | text-align: center; 91 | } 92 | 93 | .container .teaser img { 94 | text-align: center; 95 | margin: 20pt auto; 96 | width: 95%; 97 | } 98 | 99 | .container .body { 100 | text-align: justify; 101 | font-size: 14pt; 102 | margin: 10pt auto; 103 | } 104 | 105 | .container .bibtex { 106 | text-align: left; 107 | font-size: 22pt; 108 | margin: 5pt auto; 109 | } 110 | 111 | .container .ref { 112 | text-align: left; 113 | font-size: 18pt; 114 | font-weight: bold; 115 | margin: 15pt auto; 116 | } 117 | 118 | .container .citation { 119 | margin: 8pt auto; 120 | font-size: 14pt; 121 | clear: both; 122 | } 123 | 124 | .container .citation img { 125 | float: left; 126 | margin: 0 8pt 8pt 0; /*top right bottom left*/ 127 
| width: 120pt; 128 | } 129 | 130 | /* Homepage */ 131 | /* Followings can be removed for single project page. */ 132 | .homepage { 133 | width: 768pt; 134 | min-height: 100pt; 135 | margin: 15pt auto; 136 | padding: 20pt; 137 | border: 1pt hidden #000; 138 | text-align: justify; 139 | color: #000000; 140 | background: #ffffff; 141 | } 142 | 143 | .homepage .header { 144 | margin-top: 30pt; 145 | margin-bottom: 60pt; 146 | margin-right: 70pt; 147 | font-size: 28pt; 148 | text-align: center; 149 | } 150 | 151 | .homepage .header img { 152 | height: 80pt; 153 | float: left; 154 | object-fit: cover; 155 | margin-left: 20pt; 156 | } 157 | 158 | .homepage .section { 159 | text-align: left; 160 | font-size: 25pt; 161 | font-weight: bolder; 162 | margin: 50pt 20pt 20pt 20pt; /*top right bottom left*/ 163 | } 164 | 165 | .homepage .project { 166 | height: 130pt; 167 | outline: thin dotted #666666; 168 | margin: 10pt 20pt 10pt 20pt; /*top right bottom left*/ 169 | } 170 | 171 | .homepage .project .image { 172 | height: 120pt; 173 | width: 160pt; 174 | float: left; 175 | text-align: center; 176 | vertical-align: top; 177 | } 178 | 179 | .homepage .project .image img { 180 | height: 120pt; 181 | width: 160pt; 182 | object-fit: cover; 183 | border-radius: 6pt; 184 | box-shadow: 1pt 1pt 2pt #888888; 185 | -moz-box-shadow: 1pt 1pt 2pt #888888; 186 | -webkit-box-shadow: 1pt 1pt 2pt #888888; 187 | margin: 5pt; 188 | } 189 | 190 | .homepage .project .info { 191 | font-size: 16pt; 192 | text-align: left; 193 | margin: 10pt 20pt 0 180pt; /*top right bottom left*/ 194 | } 195 | 196 | .homepage .avatar { 197 | margin: -10pt 20pt 320pt 0pt; /*top right bottom left*/ 198 | } 199 | 200 | .homepage .avatar table { 201 | float: left; 202 | width: auto; 203 | height: auto; 204 | margin: 10pt auto; 205 | text-align: center; 206 | font-size: 16pt; 207 | border-collapse: separate; 208 | border-spacing: 20pt 10pt; 209 | } 210 | 211 | .homepage .avatar img { 212 | height: 100pt; 213 | width: 100pt; 214 | object-fit: cover; 215 | } 216 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | TPN 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 |
23 |
24 | Temporal Pyramid Network for Action Recognition 25 |
26 |
27 | Ceyuan Yang*,1,  28 | Yinghao Xu*,1,  29 | Jianping Shi2,  30 | Bo Dai1,  31 | Bolei Zhou1  32 |
33 |
34 | 1The Chinese University of Hong Kong, 35 | 2SenseTime Group Limited
36 |
37 | 41 |
42 | 43 |
44 |
45 | 46 | 47 | 48 | 49 |
50 |
Overview
51 |
52 | Visual tempo characterizes the dynamics and the temporal scale of an action, i.e., how fast 53 | an action goes. 54 | Modeling the visual tempos of different actions facilitates their recognition. 55 | In this work we propose a generic Temporal Pyramid Network (TPN) at the feature level, which can be flexibly 56 | integrated into 2D or 3D backbone networks in a plug-and-play manner. 57 | TPN also shows consistent improvements over other challenging baselines on several action recognition datasets. 58 | A further analysis reveals that TPN gains most of its improvements on action classes that have large 59 | variances in their visual tempos, validating the effectiveness of TPN. 60 |
61 |
62 | 63 | 64 | 65 | 66 |
67 |
Results
68 |
69 |
  • Quantitative Results
  • 70 |

    71 | Our TPN achieves 78.9%, 49.0% and 62.0% top-1 accuracy on the mainstream benchmarks of action 72 | recognition, i.e., Kinetics-400, Something-Something V1 and V2 respectively, which outperforms 73 | other state-of-the-art methods. More detailed comparisons and ablation studies are presented in our paper.

    75 |
  • Empirical Study
  • 76 |

    Per-class Performance Gain vs. Per-class Variance of Visual Tempos: 77 | Figure 4 indicates that the performance gain is clearly positively correlated with the variance of visual 78 | tempos. This study strongly verifies our motivation that TPN brings a significant improvement for 79 | actions with large variances of visual tempo.

    80 |

    Robustness of TPN to Visual Tempo Variation: 81 | Figure 5 suggests that TPN helps improve the robustness of I3D-50, resulting in a curve with milder 82 | fluctuations. More discussion is presented in our experimental section. 83 |

    84 |
    85 | 86 |
    87 | 88 |
    89 |
    90 | 91 | 92 | 93 | 94 |
    95 |
    Bibtex
    96 |
     97 | @inproceedings{yang2020tpn,
     98 |   title   = {Temporal Pyramid Network for Action Recognition},
     99 |   author  = {Yang, Ceyuan and Xu, Yinghao and Shi, Jianping and Dai, Bo and Zhou, Bolei},
    100 |   booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
    101 |   year    = {2020}
    102 | }
    103 | 
    104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /config_files/sthv1/tsm_tpn.py: -------------------------------------------------------------------------------- 1 | model = dict( 2 | type='TSN2D', 3 | backbone=dict( 4 | type='ResNet', 5 | pretrained='modelzoo://resnet50', 6 | depth=50, 7 | nsegments=8, 8 | out_indices=(2, 3), 9 | tsm=True, 10 | bn_eval=False, 11 | partial_bn=False), 12 | necks=dict( 13 | type='TPN', 14 | in_channels=[1024, 2048], 15 | out_channels=1024, 16 | spatial_modulation_config=dict( 17 | inplanes=[1024, 2048], 18 | planes=2048, 19 | ), 20 | temporal_modulation_config=dict( 21 | scales=(8, 8), 22 | param=dict( 23 | inplanes=-1, 24 | planes=-1, 25 | downsample_scale=-1, 26 | )), 27 | upsampling_config=dict( 28 | scale=(1, 1, 1), 29 | ), 30 | downsampling_config=dict( 31 | scales=(1, 1, 1), 32 | param=dict( 33 | inplanes=-1, 34 | planes=-1, 35 | downsample_scale=-1, 36 | )), 37 | level_fusion_config=dict( 38 | in_channels=[1024, 1024], 39 | mid_channels=[1024, 1024], 40 | out_channels=2048, 41 | ds_scales=[(1, 1, 1), (1, 1, 1)], 42 | ), 43 | aux_head_config=dict( 44 | inplanes=-1, 45 | planes=174, 46 | loss_weight=0.5 47 | ), 48 | ), 49 | spatial_temporal_module=dict( 50 | type='SimpleSpatialModule', 51 | spatial_type='avg', 52 | spatial_size=7), 53 | segmental_consensus=dict( 54 | type='SimpleConsensus', 55 | consensus_type='avg'), 56 | cls_head=dict( 57 | type='ClsHead', 58 | with_avg_pool=False, 59 | temporal_feature_size=1, 60 | spatial_feature_size=1, 61 | dropout_ratio=0.5, 62 | in_channels=2048, 63 | num_classes=174)) 64 | train_cfg = None 65 | test_cfg = None 66 | # dataset settings 67 | dataset_type = 'RawFramesDataset' 68 | data_root = '' 69 | data_root_val = '' 70 | 71 | img_norm_cfg = dict( 72 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 73 | 74 | data = dict( 75 | videos_per_gpu=8, 76 | workers_per_gpu=8, 77 | train=dict( 78 | type=dataset_type, 79 | ann_file='data/sthv1/train_videofolder.txt', 80 | img_prefix=data_root, 81 | img_norm_cfg=img_norm_cfg, 82 | num_segments=8, 83 | new_length=1, 84 | new_step=1, 85 | random_shift=True, 86 | modality='RGB', 87 | image_tmpl='{:05d}.jpg', 88 | img_scale=256, 89 | input_size=224, 90 | flip_ratio=0.5, 91 | resize_keep_ratio=True, 92 | resize_crop=True, 93 | color_jitter=True, 94 | color_space_aug=True, 95 | oversample=None, 96 | max_distort=1, 97 | test_mode=False), 98 | val=dict( 99 | type=dataset_type, 100 | ann_file='data/sthv1/val_videofolder.txt', 101 | img_prefix=data_root_val, 102 | img_norm_cfg=img_norm_cfg, 103 | num_segments=8, 104 | new_length=1, 105 | new_step=1, 106 | random_shift=False, 107 | modality='RGB', 108 | image_tmpl='{:05d}.jpg', 109 | img_scale=256, 110 | input_size=224, 111 | flip_ratio=0, 112 | resize_keep_ratio=True, 113 | oversample=None, 114 | test_mode=False), 115 | test=dict( 116 | type=dataset_type, 117 | ann_file='data/sthv1/val_videofolder.txt', 118 | img_prefix=data_root_val, 119 | img_norm_cfg=img_norm_cfg, 120 | num_segments=16, 121 | new_length=1, 122 | new_step=1, 123 | random_shift=False, 124 | modality='RGB', 125 | image_tmpl='{:05d}.jpg', 126 | img_scale=256, 127 | input_size=256, 128 | flip_ratio=0, 129 | resize_keep_ratio=True, 130 | oversample="three_crop", 131 | test_mode=True)) 132 | # optimizer 133 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True) 134 | optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2)) 135 | # learning 
policy 136 | lr_config = dict( 137 | policy='step', 138 | step=[75, 125]) 139 | checkpoint_config = dict(interval=1) 140 | workflow = [('train', 1)] 141 | # yapf:disable 142 | log_config = dict( 143 | interval=20, 144 | hooks=[ 145 | dict(type='TextLoggerHook'), 146 | # dict(type='TensorboardLoggerHook') 147 | ]) 148 | # yapf:enable 149 | # runtime settings 150 | total_epochs = 150 151 | dist_params = dict(backend='nccl') 152 | log_level = 'INFO' 153 | load_from = None 154 | resume_from = None 155 | -------------------------------------------------------------------------------- /config_files/sthv2/tsm_tpn.py: -------------------------------------------------------------------------------- 1 | model = dict( 2 | type='TSN2D', 3 | backbone=dict( 4 | type='ResNet', 5 | pretrained='modelzoo://resnet50', 6 | depth=50, 7 | nsegments=8, 8 | out_indices=(2, 3), 9 | tsm=True, 10 | bn_eval=False, 11 | partial_bn=False), 12 | necks=dict( 13 | type='TPN', 14 | in_channels=[1024, 2048], 15 | out_channels=1024, 16 | spatial_modulation_config=dict( 17 | inplanes=[1024, 2048], 18 | planes=2048, 19 | ), 20 | temporal_modulation_config=dict( 21 | scales=(8, 8), 22 | param=dict( 23 | inplanes=-1, 24 | planes=-1, 25 | downsample_scale=-1, 26 | )), 27 | upsampling_config=dict( 28 | scale=(1, 1, 1), 29 | ), 30 | downsampling_config=dict( 31 | scales=(1, 1, 1), 32 | param=dict( 33 | inplanes=-1, 34 | planes=-1, 35 | downsample_scale=-1, 36 | )), 37 | level_fusion_config=dict( 38 | in_channels=[1024, 1024], 39 | mid_channels=[1024, 1024], 40 | out_channels=2048, 41 | ds_scales=[(1, 1, 1), (1, 1, 1)], 42 | ), 43 | aux_head_config=dict( 44 | inplanes=-1, 45 | planes=174, 46 | loss_weight=0.5 47 | ), 48 | ), 49 | spatial_temporal_module=dict( 50 | type='SimpleSpatialModule', 51 | spatial_type='avg', 52 | spatial_size=7), 53 | segmental_consensus=dict( 54 | type='SimpleConsensus', 55 | consensus_type='avg'), 56 | cls_head=dict( 57 | type='ClsHead', 58 | with_avg_pool=False, 59 | temporal_feature_size=1, 60 | spatial_feature_size=1, 61 | dropout_ratio=0.5, 62 | in_channels=2048, 63 | num_classes=174)) 64 | train_cfg = None 65 | test_cfg = None 66 | # dataset settings 67 | dataset_type = 'RawFramesDataset' 68 | data_root = '' 69 | data_root_val = '' 70 | 71 | img_norm_cfg = dict( 72 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 73 | 74 | data = dict( 75 | videos_per_gpu=8, 76 | workers_per_gpu=8, 77 | train=dict( 78 | type=dataset_type, 79 | ann_file='data/sthv2/train_videofolder.txt', 80 | img_prefix=data_root, 81 | img_norm_cfg=img_norm_cfg, 82 | num_segments=8, 83 | new_length=1, 84 | new_step=1, 85 | random_shift=True, 86 | modality='RGB', 87 | image_tmpl='img_{:05d}.jpg', 88 | img_scale=256, 89 | input_size=224, 90 | flip_ratio=0.5, 91 | resize_keep_ratio=True, 92 | resize_crop=True, 93 | color_jitter=True, 94 | color_space_aug=True, 95 | oversample=None, 96 | max_distort=1, 97 | test_mode=False), 98 | val=dict( 99 | type=dataset_type, 100 | ann_file='data/sthv2/val_videofolder.txt', 101 | img_prefix=data_root_val, 102 | img_norm_cfg=img_norm_cfg, 103 | num_segments=8, 104 | new_length=1, 105 | new_step=1, 106 | random_shift=False, 107 | modality='RGB', 108 | image_tmpl='img_{:05d}.jpg', 109 | img_scale=256, 110 | input_size=224, 111 | flip_ratio=0, 112 | resize_keep_ratio=True, 113 | oversample=None, 114 | test_mode=False), 115 | test=dict( 116 | type=dataset_type, 117 | ann_file='data/sthv2/val_videofolder.txt', 118 | img_prefix=data_root_val, 119 | img_norm_cfg=img_norm_cfg, 120 | 
num_segments=16, 121 | new_length=1, 122 | new_step=1, 123 | random_shift=False, 124 | modality='RGB', 125 | image_tmpl='img_{:05d}.jpg', 126 | img_scale=256, 127 | input_size=256, 128 | flip_ratio=0, 129 | resize_keep_ratio=True, 130 | oversample="three_crop", 131 | test_mode=True)) 132 | # optimizer 133 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True) 134 | optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2)) 135 | # learning policy 136 | lr_config = dict( 137 | policy='step', 138 | step=[75, 125]) 139 | checkpoint_config = dict(interval=1) 140 | workflow = [('train', 1)] 141 | # yapf:disable 142 | log_config = dict( 143 | interval=20, 144 | hooks=[ 145 | dict(type='TextLoggerHook'), 146 | # dict(type='TensorboardLoggerHook') 147 | ]) 148 | # yapf:enable 149 | # runtime settings 150 | total_epochs = 150 151 | dist_params = dict(backend='nccl') 152 | log_level = 'INFO' 153 | load_from = None 154 | resume_from = None 155 | -------------------------------------------------------------------------------- /config_files/kinetics400/tpn/r50f8s8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet50', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=[2, 3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=False, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | necks=dict( 22 | type='TPN', 23 | in_channels=[1024, 2048], 24 | out_channels=1024, 25 | spatial_modulation_config=dict( 26 | inplanes=[1024, 2048], 27 | planes=2048, 28 | ), 29 | temporal_modulation_config=dict( 30 | scales=(8, 8), 31 | param=dict( 32 | inplanes=-1, 33 | planes=-1, 34 | downsample_scale=-1, 35 | )), 36 | upsampling_config=dict( 37 | scale=(1, 1, 1), 38 | ), 39 | downsampling_config=dict( 40 | scales=(1, 1, 1), 41 | param=dict( 42 | inplanes=-1, 43 | planes=-1, 44 | downsample_scale=-1, 45 | )), 46 | level_fusion_config=dict( 47 | in_channels=[1024, 1024], 48 | mid_channels=[1024, 1024], 49 | out_channels=2048, 50 | ds_scales=[(1, 1, 1), (1, 1, 1)], 51 | ), 52 | aux_head_config=dict( 53 | inplanes=-1, 54 | planes=400, 55 | loss_weight=0.5 56 | ), 57 | ), 58 | spatial_temporal_module=dict( 59 | type='SimpleSpatialTemporalModule', 60 | spatial_type='avg', 61 | temporal_size=1, 62 | spatial_size=7), 63 | segmental_consensus=dict( 64 | type='SimpleConsensus', 65 | consensus_type='avg'), 66 | cls_head=dict( 67 | type='ClsHead', 68 | with_avg_pool=False, 69 | temporal_feature_size=1, 70 | spatial_feature_size=1, 71 | dropout_ratio=0.5, 72 | in_channels=2048, 73 | num_classes=400)) 74 | train_cfg = None 75 | test_cfg = None 76 | # dataset settings 77 | dataset_type = 'RawFramesDataset' 78 | data_root = '' 79 | data_root_val = '' 80 | img_norm_cfg = dict( 81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 82 | data = dict( 83 | videos_per_gpu=8, 84 | workers_per_gpu=8, 85 | train=dict( 86 | type=dataset_type, 87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt', 88 | img_prefix=data_root, 89 | img_norm_cfg=img_norm_cfg, 90 | input_format="NCTHW", 91 | num_segments=1, 92 | new_length=8, 93 | new_step=8, 94 | random_shift=True, 95 | modality='RGB', 96 | image_tmpl='img_{:05d}.jpg', 97 | img_scale=256, 98 
| resize_keep_ratio=True, 99 | input_size=224, 100 | flip_ratio=0.5, 101 | oversample=None, 102 | resize_crop=True, 103 | color_jitter=True, 104 | color_space_aug=True, 105 | max_distort=0, 106 | test_mode=False, 107 | ), 108 | val=dict( 109 | type=dataset_type, 110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 111 | img_prefix=data_root_val, 112 | img_norm_cfg=img_norm_cfg, 113 | input_format="NCTHW", 114 | num_segments=1, 115 | new_length=8, 116 | new_step=8, 117 | random_shift=True, 118 | modality='RGB', 119 | image_tmpl='img_{:05d}.jpg', 120 | img_scale=256, 121 | input_size=224, 122 | flip_ratio=0, 123 | resize_keep_ratio=True, 124 | oversample=None, 125 | test_mode=False, 126 | ), 127 | test=dict( 128 | type=dataset_type, 129 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 130 | img_prefix=data_root_val, 131 | img_norm_cfg=img_norm_cfg, 132 | input_format="NCTHW", 133 | num_segments=10, 134 | new_length=8, 135 | new_step=8, 136 | random_shift=True, 137 | modality='RGB', 138 | image_tmpl='img_{:05d}.jpg', 139 | img_scale=256, 140 | input_size=256, 141 | flip_ratio=0, 142 | resize_keep_ratio=True, 143 | oversample='three_crop', 144 | test_mode=True, 145 | )) 146 | # optimizer 147 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 148 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 149 | # learning policy 150 | lr_config = dict( 151 | policy='step', 152 | step=[75, 125]) 153 | 154 | checkpoint_config = dict(interval=1) 155 | # workflow = [('train', 5), ('val', 1)] 156 | workflow = [('train', 1)] 157 | # yapf:disable 158 | log_config = dict( 159 | interval=20, 160 | hooks=[ 161 | dict(type='TextLoggerHook'), 162 | # dict(type='TensorboardLoggerHook') 163 | ]) 164 | # yapf:enable 165 | # runtime settings 166 | total_epochs = 150 167 | dist_params = dict(backend='nccl') 168 | log_level = 'INFO' 169 | load_from = None 170 | resume_from = None 171 | -------------------------------------------------------------------------------- /config_files/kinetics400/tpn/r101f16s4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet101', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=[2, 3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | necks=dict( 22 | type='TPN', 23 | in_channels=[1024, 2048], 24 | out_channels=1024, 25 | spatial_modulation_config=dict( 26 | inplanes=[1024, 2048], 27 | planes=2048, 28 | ), 29 | temporal_modulation_config=dict( 30 | scales=(8, 16), 31 | param=dict( 32 | inplanes=-1, 33 | planes=-1, 34 | downsample_scale=-1, 35 | )), 36 | upsampling_config=dict( 37 | scale=(1, 1, 1), 38 | ), 39 | downsampling_config=dict( 40 | scales=(2, 1, 1), 41 | param=dict( 42 | inplanes=-1, 43 | planes=-1, 44 | downsample_scale=-1, 45 | )), 46 | level_fusion_config=dict( 47 | in_channels=[1024, 1024], 48 | mid_channels=[1024, 1024], 49 | out_channels=2048, 50 | ds_scales=[(2, 1, 1), (1, 1, 1)], 51 | ), 52 | aux_head_config=dict( 53 | inplanes=-1, 54 | planes=400, 55 | loss_weight=0.5 56 | ), 57 | ), 58 | spatial_temporal_module=dict( 59 | type='SimpleSpatialTemporalModule', 60 | spatial_type='avg', 61 | 
temporal_size=1, 62 | spatial_size=7), 63 | segmental_consensus=dict( 64 | type='SimpleConsensus', 65 | consensus_type='avg'), 66 | cls_head=dict( 67 | type='ClsHead', 68 | with_avg_pool=False, 69 | temporal_feature_size=1, 70 | spatial_feature_size=1, 71 | dropout_ratio=0.5, 72 | in_channels=2048, 73 | num_classes=400)) 74 | train_cfg = None 75 | test_cfg = None 76 | # dataset settings 77 | dataset_type = 'RawFramesDataset' 78 | data_root = '' 79 | data_root_val = '' 80 | img_norm_cfg = dict( 81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 82 | data = dict( 83 | videos_per_gpu=8, 84 | workers_per_gpu=8, 85 | train=dict( 86 | type=dataset_type, 87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt', 88 | img_prefix=data_root, 89 | img_norm_cfg=img_norm_cfg, 90 | input_format="NCTHW", 91 | num_segments=1, 92 | new_length=16, 93 | new_step=4, 94 | random_shift=True, 95 | modality='RGB', 96 | image_tmpl='img_{:05d}.jpg', 97 | img_scale=256, 98 | resize_keep_ratio=True, 99 | input_size=224, 100 | flip_ratio=0.5, 101 | oversample=None, 102 | resize_crop=True, 103 | color_jitter=True, 104 | color_space_aug=True, 105 | max_distort=0, 106 | test_mode=False, 107 | ), 108 | val=dict( 109 | type=dataset_type, 110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 111 | img_prefix=data_root_val, 112 | img_norm_cfg=img_norm_cfg, 113 | input_format="NCTHW", 114 | num_segments=1, 115 | new_length=16, 116 | new_step=4, 117 | random_shift=True, 118 | modality='RGB', 119 | image_tmpl='img_{:05d}.jpg', 120 | img_scale=256, 121 | input_size=224, 122 | flip_ratio=0, 123 | resize_keep_ratio=True, 124 | oversample=None, 125 | test_mode=False, 126 | ), 127 | test=dict( 128 | type=dataset_type, 129 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 130 | img_prefix=data_root_val, 131 | img_norm_cfg=img_norm_cfg, 132 | input_format="NCTHW", 133 | num_segments=10, 134 | new_length=16, 135 | new_step=4, 136 | random_shift=True, 137 | modality='RGB', 138 | image_tmpl='img_{:05d}.jpg', 139 | img_scale=256, 140 | input_size=256, 141 | flip_ratio=0, 142 | resize_keep_ratio=True, 143 | oversample='three_crop', 144 | test_mode=True, 145 | )) 146 | # optimizer 147 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 148 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 149 | # learning policy 150 | lr_config = dict( 151 | policy='step', 152 | step=[75, 125]) 153 | 154 | checkpoint_config = dict(interval=1) 155 | # workflow = [('train', 5), ('val', 1)] 156 | workflow = [('train', 1)] 157 | # yapf:disable 158 | log_config = dict( 159 | interval=20, 160 | hooks=[ 161 | dict(type='TextLoggerHook'), 162 | # dict(type='TensorboardLoggerHook') 163 | ]) 164 | # yapf:enable 165 | # runtime settings 166 | total_epochs = 150 167 | dist_params = dict(backend='nccl') 168 | log_level = 'INFO' 169 | load_from = None 170 | resume_from = None 171 | -------------------------------------------------------------------------------- /config_files/kinetics400/tpn/r101f8s8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet101', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=[2, 3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | 
pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=False, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | necks=dict( 22 | type='TPN', 23 | in_channels=[1024, 2048], 24 | out_channels=1024, 25 | spatial_modulation_config=dict( 26 | inplanes=[1024, 2048], 27 | planes=2048, 28 | ), 29 | temporal_modulation_config=dict( 30 | scales=(4, 8), 31 | param=dict( 32 | inplanes=-1, 33 | planes=-1, 34 | downsample_scale=-1, 35 | )), 36 | upsampling_config=dict( 37 | scale=(1, 1, 1), 38 | ), 39 | downsampling_config=dict( 40 | scales=(2, 1, 1), 41 | param=dict( 42 | inplanes=-1, 43 | planes=-1, 44 | downsample_scale=-1, 45 | )), 46 | level_fusion_config=dict( 47 | in_channels=[1024, 1024], 48 | mid_channels=[1024, 1024], 49 | out_channels=2048, 50 | ds_scales=[(2, 1, 1), (1, 1, 1)], 51 | ), 52 | aux_head_config=dict( 53 | inplanes=-1, 54 | planes=400, 55 | loss_weight=0.5 56 | ), 57 | ), 58 | spatial_temporal_module=dict( 59 | type='SimpleSpatialTemporalModule', 60 | spatial_type='avg', 61 | temporal_size=1, 62 | spatial_size=7), 63 | segmental_consensus=dict( 64 | type='SimpleConsensus', 65 | consensus_type='avg'), 66 | cls_head=dict( 67 | type='ClsHead', 68 | with_avg_pool=False, 69 | temporal_feature_size=1, 70 | spatial_feature_size=1, 71 | dropout_ratio=0.5, 72 | in_channels=2048, 73 | num_classes=400)) 74 | train_cfg = None 75 | test_cfg = None 76 | # dataset settings 77 | dataset_type = 'RawFramesDataset' 78 | data_root = '' 79 | data_root_val = '' 80 | img_norm_cfg = dict( 81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 82 | data = dict( 83 | videos_per_gpu=8, 84 | workers_per_gpu=8, 85 | train=dict( 86 | type=dataset_type, 87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt', 88 | img_prefix=data_root, 89 | img_norm_cfg=img_norm_cfg, 90 | input_format="NCTHW", 91 | num_segments=1, 92 | new_length=8, 93 | new_step=8, 94 | random_shift=True, 95 | modality='RGB', 96 | image_tmpl='img_{:05d}.jpg', 97 | img_scale=256, 98 | resize_keep_ratio=True, 99 | input_size=224, 100 | flip_ratio=0.5, 101 | oversample=None, 102 | resize_crop=True, 103 | color_jitter=True, 104 | color_space_aug=True, 105 | max_distort=0, 106 | test_mode=False, 107 | ), 108 | val=dict( 109 | type=dataset_type, 110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 111 | img_prefix=data_root_val, 112 | img_norm_cfg=img_norm_cfg, 113 | input_format="NCTHW", 114 | num_segments=1, 115 | new_length=8, 116 | new_step=8, 117 | random_shift=True, 118 | modality='RGB', 119 | image_tmpl='img_{:05d}.jpg', 120 | img_scale=256, 121 | input_size=224, 122 | flip_ratio=0, 123 | resize_keep_ratio=True, 124 | oversample=None, 125 | test_mode=False, 126 | ), 127 | test=dict( 128 | type=dataset_type, 129 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 130 | img_prefix=data_root_val, 131 | img_norm_cfg=img_norm_cfg, 132 | input_format="NCTHW", 133 | num_segments=10, 134 | new_length=8, 135 | new_step=8, 136 | random_shift=True, 137 | modality='RGB', 138 | image_tmpl='img_{:05d}.jpg', 139 | img_scale=256, 140 | input_size=256, 141 | flip_ratio=0, 142 | resize_keep_ratio=True, 143 | oversample='three_crop', 144 | test_mode=True, 145 | )) 146 | # optimizer 147 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 148 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 149 | # learning policy 150 | lr_config = dict( 151 | policy='step', 152 | step=[75, 125]) 153 | 154 | 
checkpoint_config = dict(interval=1) 155 | # workflow = [('train', 5), ('val', 1)] 156 | workflow = [('train', 1)] 157 | # yapf:disable 158 | log_config = dict( 159 | interval=20, 160 | hooks=[ 161 | dict(type='TextLoggerHook'), 162 | # dict(type='TensorboardLoggerHook') 163 | ]) 164 | # yapf:enable 165 | # runtime settings 166 | total_epochs = 150 167 | dist_params = dict(backend='nccl') 168 | log_level = 'INFO' 169 | load_from = None 170 | resume_from = None 171 | -------------------------------------------------------------------------------- /config_files/kinetics400/tpn/r50f16s4.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet50', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=[2, 3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | necks=dict( 22 | type='TPN', 23 | in_channels=[1024, 2048], 24 | out_channels=1024, 25 | spatial_modulation_config=dict( 26 | inplanes=[1024, 2048], 27 | planes=2048, 28 | ), 29 | temporal_modulation_config=dict( 30 | scales=(16, 16), 31 | param=dict( 32 | inplanes=-1, 33 | planes=-1, 34 | downsample_scale=-1, 35 | )), 36 | upsampling_config=dict( 37 | scale=(1, 1, 1), 38 | ), 39 | downsampling_config=dict( 40 | scales=(1, 1, 1), 41 | param=dict( 42 | inplanes=-1, 43 | planes=-1, 44 | downsample_scale=-1, 45 | )), 46 | level_fusion_config=dict( 47 | in_channels=[1024, 1024], 48 | mid_channels=[1024, 1024], 49 | out_channels=2048, 50 | ds_scales=[(1, 1, 1), (1, 1, 1)], 51 | ), 52 | aux_head_config=dict( 53 | inplanes=-1, 54 | planes=400, 55 | loss_weight=0.5 56 | ), 57 | ), 58 | spatial_temporal_module=dict( 59 | type='SimpleSpatialTemporalModule', 60 | spatial_type='avg', 61 | temporal_size=1, 62 | spatial_size=7), 63 | segmental_consensus=dict( 64 | type='SimpleConsensus', 65 | consensus_type='avg'), 66 | cls_head=dict( 67 | type='ClsHead', 68 | with_avg_pool=False, 69 | temporal_feature_size=1, 70 | spatial_feature_size=1, 71 | dropout_ratio=0.5, 72 | in_channels=2048, 73 | num_classes=400)) 74 | train_cfg = None 75 | test_cfg = None 76 | # dataset settings 77 | dataset_type = 'RawFramesDataset' 78 | data_root = '' 79 | data_root_val = '' 80 | img_norm_cfg = dict( 81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 82 | data = dict( 83 | videos_per_gpu=8, 84 | workers_per_gpu=8, 85 | train=dict( 86 | type=dataset_type, 87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt', 88 | img_prefix=data_root, 89 | img_norm_cfg=img_norm_cfg, 90 | input_format="NCTHW", 91 | num_segments=1, 92 | new_length=16, 93 | new_step=4, 94 | random_shift=True, 95 | modality='RGB', 96 | image_tmpl='img_{:05d}.jpg', 97 | img_scale=256, 98 | resize_keep_ratio=True, 99 | input_size=224, 100 | flip_ratio=0.5, 101 | oversample=None, 102 | resize_crop=True, 103 | color_jitter=True, 104 | color_space_aug=True, 105 | max_distort=0, 106 | test_mode=False, 107 | ), 108 | val=dict( 109 | type=dataset_type, 110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 111 | img_prefix=data_root_val, 112 | img_norm_cfg=img_norm_cfg, 113 | input_format="NCTHW", 114 | num_segments=1, 115 | new_length=16, 116 | new_step=4, 117 | 
random_shift=True, 118 | modality='RGB', 119 | image_tmpl='img_{:05d}.jpg', 120 | img_scale=256, 121 | input_size=224, 122 | flip_ratio=0, 123 | resize_keep_ratio=True, 124 | oversample=None, 125 | test_mode=False, 126 | ), 127 | test=dict( 128 | type=dataset_type, 129 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 130 | img_prefix=data_root_val, 131 | img_norm_cfg=img_norm_cfg, 132 | input_format="NCTHW", 133 | num_segments=10, 134 | new_length=16, 135 | new_step=4, 136 | random_shift=True, 137 | modality='RGB', 138 | image_tmpl='img_{:05d}.jpg', 139 | img_scale=256, 140 | input_size=256, 141 | flip_ratio=0, 142 | resize_keep_ratio=True, 143 | oversample='three_crop', 144 | test_mode=True, 145 | )) 146 | # optimizer 147 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 148 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 149 | # learning policy 150 | lr_config = dict( 151 | policy='step', 152 | step=[75, 125]) 153 | 154 | checkpoint_config = dict(interval=1) 155 | # workflow = [('train', 5), ('val', 1)] 156 | workflow = [('train', 1)] 157 | # yapf:disable 158 | log_config = dict( 159 | interval=20, 160 | hooks=[ 161 | dict(type='TextLoggerHook'), 162 | # dict(type='TensorboardLoggerHook') 163 | ]) 164 | # yapf:enable 165 | # runtime settings 166 | total_epochs = 150 167 | dist_params = dict(backend='nccl') 168 | log_level = 'INFO' 169 | load_from = None 170 | resume_from = None 171 | -------------------------------------------------------------------------------- /config_files/kinetics400/tpn/r101f32s2.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet101', 7 | depth=101, 8 | num_stages=4, 9 | out_indices=[2, 3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | necks=dict( 22 | type='TPN', 23 | in_channels=[1024, 2048], 24 | out_channels=1024, 25 | spatial_modulation_config=dict( 26 | inplanes=[1024, 2048], 27 | planes=2048, 28 | ), 29 | temporal_modulation_config=dict( 30 | scales=(16, 32), 31 | param=dict( 32 | inplanes=-1, 33 | planes=-1, 34 | downsample_scale=-1, 35 | )), 36 | upsampling_config=dict( 37 | scale=(1, 1, 1), 38 | ), 39 | downsampling_config=dict( 40 | scales=(2, 1, 1), 41 | param=dict( 42 | inplanes=-1, 43 | planes=-1, 44 | downsample_scale=-1, 45 | )), 46 | level_fusion_config=dict( 47 | in_channels=[1024, 1024], 48 | mid_channels=[1024, 1024], 49 | out_channels=2048, 50 | ds_scales=[(2, 1, 1), (1, 1, 1)], 51 | ), 52 | aux_head_config=dict( 53 | inplanes=-1, 54 | planes=400, 55 | loss_weight=0.5 56 | ), 57 | ), 58 | spatial_temporal_module=dict( 59 | type='SimpleSpatialTemporalModule', 60 | spatial_type='avg', 61 | temporal_size=1, 62 | spatial_size=7), 63 | segmental_consensus=dict( 64 | type='SimpleConsensus', 65 | consensus_type='avg'), 66 | cls_head=dict( 67 | type='ClsHead', 68 | with_avg_pool=False, 69 | temporal_feature_size=1, 70 | spatial_feature_size=1, 71 | dropout_ratio=0.5, 72 | in_channels=2048, 73 | num_classes=400)) 74 | train_cfg = None 75 | test_cfg = None 76 | # dataset settings 77 | dataset_type = 'RawFramesDataset' 78 | data_root = '' 79 | data_root_val = '' 80 | img_norm_cfg = dict( 
81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 82 | data = dict( 83 | videos_per_gpu=8, 84 | workers_per_gpu=8, 85 | train=dict( 86 | type=dataset_type, 87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt', 88 | img_prefix=data_root, 89 | img_norm_cfg=img_norm_cfg, 90 | input_format="NCTHW", 91 | num_segments=1, 92 | new_length=32, 93 | new_step=2, 94 | random_shift=True, 95 | modality='RGB', 96 | image_tmpl='img_{:05d}.jpg', 97 | img_scale=256, 98 | resize_keep_ratio=True, 99 | input_size=224, 100 | flip_ratio=0.5, 101 | oversample=None, 102 | resize_crop=True, 103 | color_jitter=True, 104 | color_space_aug=True, 105 | max_distort=0, 106 | test_mode=False, 107 | ), 108 | val=dict( 109 | type=dataset_type, 110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 111 | img_prefix=data_root_val, 112 | img_norm_cfg=img_norm_cfg, 113 | input_format="NCTHW", 114 | num_segments=1, 115 | new_length=32, 116 | new_step=2, 117 | random_shift=True, 118 | modality='RGB', 119 | image_tmpl='img_{:05d}.jpg', 120 | img_scale=256, 121 | input_size=224, 122 | flip_ratio=0, 123 | resize_keep_ratio=True, 124 | oversample=None, 125 | test_mode=False, 126 | ), 127 | test=dict( 128 | type=dataset_type, 129 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 130 | img_prefix=data_root_val, 131 | img_norm_cfg=img_norm_cfg, 132 | input_format="NCTHW", 133 | num_segments=10, 134 | new_length=32, 135 | new_step=2, 136 | random_shift=True, 137 | modality='RGB', 138 | image_tmpl='img_{:05d}.jpg', 139 | img_scale=256, 140 | input_size=256, 141 | flip_ratio=0, 142 | resize_keep_ratio=True, 143 | oversample='three_crop', 144 | test_mode=True, 145 | )) 146 | # optimizer 147 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 148 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 149 | # learning policy 150 | lr_config = dict( 151 | policy='step', 152 | step=[75, 125]) 153 | 154 | checkpoint_config = dict(interval=1) 155 | # workflow = [('train', 5), ('val', 1)] 156 | workflow = [('train', 1)] 157 | # yapf:disable 158 | log_config = dict( 159 | interval=20, 160 | hooks=[ 161 | dict(type='TextLoggerHook'), 162 | # dict(type='TensorboardLoggerHook') 163 | ]) 164 | # yapf:enable 165 | # runtime settings 166 | total_epochs = 150 167 | dist_params = dict(backend='nccl') 168 | log_level = 'INFO' 169 | load_from = None 170 | resume_from = None 171 | -------------------------------------------------------------------------------- /config_files/kinetics400/tpn/r50f32s2.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='TSN3D', 4 | backbone=dict( 5 | type='ResNet_SlowFast', 6 | pretrained='modelzoo://resnet50', 7 | depth=50, 8 | num_stages=4, 9 | out_indices=[2, 3], 10 | frozen_stages=-1, 11 | inflate_freq=(0, 0, 1, 1), 12 | inflate_style='3x1x1', 13 | conv1_kernel_t=1, 14 | conv1_stride_t=1, 15 | pool1_kernel_t=1, 16 | pool1_stride_t=1, 17 | with_cp=True, 18 | bn_eval=False, 19 | partial_bn=False, 20 | style='pytorch'), 21 | necks=dict( 22 | type='TPN', 23 | in_channels=[1024, 2048], 24 | out_channels=1024, 25 | spatial_modulation_config=dict( 26 | inplanes=[1024, 2048], 27 | planes=2048, 28 | ), 29 | temporal_modulation_config=dict( 30 | scales=(32, 32), 31 | param=dict( 32 | inplanes=-1, 33 | planes=-1, 34 | downsample_scale=-1, 35 | )), 36 | upsampling_config=dict( 37 | scale=(1, 1, 1), 38 | ), 
39 | downsampling_config=dict( 40 | scales=(1, 1, 1), 41 | param=dict( 42 | inplanes=-1, 43 | planes=-1, 44 | downsample_scale=-1, 45 | )), 46 | level_fusion_config=dict( 47 | in_channels=[1024, 1024], 48 | mid_channels=[1024, 1024], 49 | out_channels=2048, 50 | ds_scales=[(1, 1, 1), (1, 1, 1)], 51 | ), 52 | aux_head_config=dict( 53 | inplanes=-1, 54 | planes=400, 55 | loss_weight=0.5 56 | ), 57 | ), 58 | spatial_temporal_module=dict( 59 | type='SimpleSpatialTemporalModule', 60 | spatial_type='avg', 61 | temporal_size=1, 62 | spatial_size=7), 63 | segmental_consensus=dict( 64 | type='SimpleConsensus', 65 | consensus_type='avg'), 66 | cls_head=dict( 67 | type='ClsHead', 68 | with_avg_pool=False, 69 | temporal_feature_size=1, 70 | spatial_feature_size=1, 71 | dropout_ratio=0.5, 72 | in_channels=2048, 73 | num_classes=400)) 74 | train_cfg = None 75 | test_cfg = None 76 | # dataset settings 77 | dataset_type = 'RawFramesDataset' 78 | data_root = '' 79 | data_root_val = '' 80 | img_norm_cfg = dict( 81 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 82 | data = dict( 83 | videos_per_gpu=8, 84 | workers_per_gpu=8, 85 | train=dict( 86 | type=dataset_type, 87 | ann_file='data/kinetics400/kinetics400_train_list_rawframes_checked.txt', 88 | img_prefix=data_root, 89 | img_norm_cfg=img_norm_cfg, 90 | input_format="NCTHW", 91 | num_segments=1, 92 | new_length=32, 93 | new_step=2, 94 | random_shift=True, 95 | modality='RGB', 96 | image_tmpl='img_{:05d}.jpg', 97 | img_scale=256, 98 | resize_keep_ratio=True, 99 | input_size=224, 100 | flip_ratio=0.5, 101 | oversample=None, 102 | resize_crop=True, 103 | color_jitter=True, 104 | color_space_aug=True, 105 | max_distort=0, 106 | test_mode=False, 107 | ), 108 | val=dict( 109 | type=dataset_type, 110 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 111 | img_prefix=data_root_val, 112 | img_norm_cfg=img_norm_cfg, 113 | input_format="NCTHW", 114 | num_segments=1, 115 | new_length=32, 116 | new_step=2, 117 | random_shift=True, 118 | modality='RGB', 119 | image_tmpl='img_{:05d}.jpg', 120 | img_scale=256, 121 | input_size=224, 122 | div_255=False, 123 | flip_ratio=0, 124 | resize_keep_ratio=True, 125 | oversample=None, 126 | test_mode=False, 127 | ), 128 | test=dict( 129 | type=dataset_type, 130 | ann_file='data/kinetics400/kinetics400_val_list_rawframes_checked.txt', 131 | img_prefix=data_root_val, 132 | img_norm_cfg=img_norm_cfg, 133 | input_format="NCTHW", 134 | num_segments=10, 135 | new_length=32, 136 | new_step=2, 137 | random_shift=True, 138 | modality='RGB', 139 | image_tmpl='img_{:05d}.jpg', 140 | img_scale=256, 141 | input_size=256, 142 | flip_ratio=0, 143 | resize_keep_ratio=True, 144 | oversample='three_crop', 145 | test_mode=True, 146 | )) 147 | # optimizer 148 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=True) 149 | optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) 150 | # learning policy 151 | lr_config = dict( 152 | policy='step', 153 | step=[75, 125]) 154 | 155 | checkpoint_config = dict(interval=1) 156 | # workflow = [('train', 5), ('val', 1)] 157 | workflow = [('train', 1)] 158 | # yapf:disable 159 | log_config = dict( 160 | interval=20, 161 | hooks=[ 162 | dict(type='TextLoggerHook'), 163 | # dict(type='TensorboardLoggerHook') 164 | ]) 165 | # yapf:enable 166 | # runtime settings 167 | total_epochs = 150 168 | dist_params = dict(backend='nccl') 169 | log_level = 'INFO' 170 | load_from = None 171 | resume_from = None 172 | 
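For reference, a minimal sketch (not part of the repository sources) of how config files such as the one above are consumed; it mirrors the pattern used in test_video.py later in this dump, where the config is parsed with mmcv and the recognizer is built from its model, train_cfg and test_cfg fields. The config path below is only an example.

    import mmcv
    from mmaction.models import build_recognizer

    # Parse one of the config files above and build the corresponding recognizer.
    cfg = mmcv.Config.fromfile('config_files/kinetics400/tpn/r50f32s2.py')
    model = build_recognizer(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)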
-------------------------------------------------------------------------------- /mmaction/models/recognizers/TSN3D.py: -------------------------------------------------------------------------------- 1 | from .base import BaseRecognizer 2 | from .. import builder 3 | from ..registry import RECOGNIZERS 4 | 5 | import torch 6 | 7 | 8 | @RECOGNIZERS.register_module 9 | class TSN3D(BaseRecognizer): 10 | 11 | def __init__(self, 12 | backbone, 13 | necks=None, 14 | spatial_temporal_module=None, 15 | segmental_consensus=None, 16 | fcn_testing=False, 17 | flip=False, 18 | cls_head=None, 19 | train_cfg=None, 20 | test_cfg=None): 21 | 22 | super(TSN3D, self).__init__() 23 | self.backbone = builder.build_backbone(backbone) 24 | 25 | if necks is not None: 26 | self.necks = builder.build_neck(necks) 27 | else: 28 | self.necks = None 29 | 30 | if spatial_temporal_module is not None: 31 | self.spatial_temporal_module = builder.build_spatial_temporal_module( 32 | spatial_temporal_module) 33 | else: 34 | raise NotImplementedError 35 | 36 | if segmental_consensus is not None: 37 | self.segmental_consensus = builder.build_segmental_consensus( 38 | segmental_consensus) 39 | else: 40 | raise NotImplementedError 41 | 42 | if cls_head is not None: 43 | self.cls_head = builder.build_head(cls_head) 44 | else: 45 | raise NotImplementedError 46 | 47 | self.train_cfg = train_cfg 48 | self.test_cfg = test_cfg 49 | self.fcn_testing = fcn_testing 50 | self.flip = flip 51 | self.init_weights() 52 | 53 | @property 54 | def with_spatial_temporal_module(self): 55 | return hasattr(self, 'spatial_temporal_module') and self.spatial_temporal_module is not None 56 | 57 | @property 58 | def with_segmental_consensus(self): 59 | return hasattr(self, 'segmental_consensus') and self.segmental_consensus is not None 60 | 61 | @property 62 | def with_cls_head(self): 63 | return hasattr(self, 'cls_head') and self.cls_head is not None 64 | 65 | def init_weights(self): 66 | super(TSN3D, self).init_weights() 67 | self.backbone.init_weights() 68 | 69 | if self.with_spatial_temporal_module: 70 | self.spatial_temporal_module.init_weights() 71 | 72 | if self.with_segmental_consensus: 73 | self.segmental_consensus.init_weights() 74 | 75 | if self.with_cls_head: 76 | self.cls_head.init_weights() 77 | 78 | if self.necks is not None: 79 | self.necks.init_weights() 80 | 81 | def extract_feat(self, img_group): 82 | x = self.backbone(img_group) 83 | return x 84 | 85 | def forward_train(self, 86 | num_modalities, 87 | img_meta, 88 | gt_label, 89 | **kwargs): 90 | assert num_modalities == 1 91 | img_group = kwargs['img_group_0'] 92 | 93 | bs = img_group.shape[0] 94 | img_group = img_group.reshape((-1,) + img_group.shape[2:]) 95 | num_seg = img_group.shape[0] // bs 96 | 97 | x = self.extract_feat(img_group) 98 | 99 | if self.necks is not None: 100 | x, aux_losses = self.necks(x, gt_label.squeeze()) 101 | 102 | if self.with_spatial_temporal_module: 103 | x = self.spatial_temporal_module(x) 104 | if self.with_segmental_consensus: 105 | x = x.reshape((-1, num_seg) + x.shape[1:]) 106 | x = self.segmental_consensus(x) 107 | x = x.squeeze(1) 108 | losses = dict() 109 | if self.with_cls_head: 110 | cls_score = self.cls_head(x) 111 | gt_label = gt_label.squeeze() 112 | loss_cls = self.cls_head.loss(cls_score, gt_label) 113 | losses.update(loss_cls) 114 | if self.necks is not None: 115 | if aux_losses is not None: 116 | losses.update(aux_losses) 117 | 118 | return losses 119 | 120 | def forward_test(self, 121 | num_modalities, 122 | img_meta, 123 | **kwargs): 
124 | assert num_modalities == 1 125 | img_group = kwargs['img_group_0'] 126 | 127 | bs = img_group.shape[0] 128 | img_group = img_group.reshape((-1,) + img_group.shape[2:]) 129 | num_seg = img_group.shape[0] // bs 130 | 131 | if self.flip: 132 | img_group = self.extract_feat(torch.flip(img_group, [-1])) 133 | x = self.extract_feat(img_group) 134 | if self.necks is not None: 135 | x, _ = self.necks(x) 136 | if self.fcn_testing: 137 | if self.with_cls_head: 138 | x = self.cls_head(x) 139 | prob1 = torch.nn.functional.softmax(x.mean([2, 3, 4]), 1).mean(0, keepdim=True).detach().cpu().numpy() 140 | return prob1 141 | 142 | if self.with_spatial_temporal_module: 143 | x = self.spatial_temporal_module(x) 144 | if self.with_segmental_consensus: 145 | x = x.reshape((-1, num_seg) + x.shape[1:]) 146 | x = self.segmental_consensus(x) 147 | x = x.squeeze(1) 148 | if self.with_cls_head: 149 | x = self.cls_head(x) 150 | 151 | return x.cpu().numpy() 152 | -------------------------------------------------------------------------------- /mmaction/models/tenons/spatial_temporal_modules/non_local.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from mmcv.cnn import constant_init, kaiming_init 5 | from ...registry import SPATIAL_TEMPORAL_MODULES 6 | 7 | 8 | @SPATIAL_TEMPORAL_MODULES.register_module 9 | class NonLocalModule(nn.Module): 10 | def __init__(self, in_channels=1024, nonlocal_type="gaussian", dim=3, embed=True, embed_dim=None, sub_sample=True, 11 | use_bn=True): 12 | super(NonLocalModule, self).__init__() 13 | 14 | assert nonlocal_type in ['gaussian', 'dot', 'concat'] 15 | assert dim == 2 or dim == 3 16 | self.nonlocal_type = nonlocal_type 17 | self.embed = embed 18 | self.embed_dim = embed_dim if embed_dim is not None else in_channels // 2 19 | self.sub_sample = sub_sample 20 | self.use_bn = use_bn 21 | 22 | if self.embed: 23 | if dim == 2: 24 | self.theta = nn.Conv2d(in_channels, self.embed_dim, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)) 25 | self.phi = nn.Conv2d(in_channels, self.embed_dim, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)) 26 | self.g = nn.Conv2d(in_channels, self.embed_dim, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)) 27 | elif dim == 3: 28 | self.theta = nn.Conv3d(in_channels, self.embed_dim, kernel_size=(1, 1, 1), stride=(1, 1, 1), 29 | padding=(0, 0, 0)) 30 | self.phi = nn.Conv3d(in_channels, self.embed_dim, kernel_size=(1, 1, 1), stride=(1, 1, 1), 31 | padding=(0, 0, 0)) 32 | self.g = nn.Conv3d(in_channels, self.embed_dim, kernel_size=(1, 1, 1), stride=(1, 1, 1), 33 | padding=(0, 0, 0)) 34 | 35 | if self.nonlocal_type == 'gaussian': 36 | self.softmax = nn.Softmax(dim=2) 37 | elif self.nonlocal_type == 'concat': 38 | if dim == 2: 39 | self.concat_proj = nn.Sequential( 40 | nn.Conv2d(self.embed_dim * 2, 1, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)), 41 | nn.ReLU()) 42 | elif dim == 3: 43 | self.concat_proj = nn.Sequential( 44 | nn.Conv3d(self.embed_dim * 2, 1, kernel_size=(1, 1, 1), stride=(1, 1, 1), padding=(0, 0, 0)), 45 | nn.ReLU()) 46 | 47 | if sub_sample: 48 | if dim == 2: 49 | self.max_pool = nn.MaxPool2d(kernel_size=(2, 2)) 50 | elif dim == 3: 51 | self.max_pool = nn.MaxPool3d(kernel_size=(1, 2, 2)) 52 | self.g = nn.Sequential(self.max_pool, self.g) 53 | self.phi = nn.Sequential(self.max_pool, self.phi) 54 | 55 | if dim == 2: 56 | self.W = nn.Conv2d(self.embed_dim, in_channels, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)) 57 
| elif dim == 3: 58 | self.W = nn.Conv3d(self.embed_dim, in_channels, kernel_size=(1, 1, 1), stride=(1, 1, 1), padding=(0, 0, 0)) 59 | 60 | if use_bn: 61 | if dim == 2: 62 | self.bn = nn.BatchNorm2d(in_channels, eps=1e-05, momentum=0.9, affine=True) 63 | elif dim == 3: 64 | self.bn = nn.BatchNorm3d(in_channels, eps=1e-05, momentum=0.9, affine=True) 65 | self.W = nn.Sequential(self.W, self.bn) 66 | 67 | def init_weights(self): 68 | for m in self.modules(): 69 | if isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv3d): 70 | kaiming_init(m) 71 | elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm3d): 72 | constant_init(m, 0) 73 | 74 | def forward(self, input): 75 | if self.embed: 76 | theta = self.theta(input) 77 | phi = self.phi(input) 78 | g = self.g(input) 79 | else: 80 | theta = input 81 | phi = input 82 | g = input 83 | 84 | if self.nonlocal_type in ['gaussian', 'dot']: 85 | # reshape [BxC'xTxHxW] to [BxC'x(T)HW] 86 | theta = theta.reshape(theta.shape[:2] + (-1,)) 87 | phi = phi.reshape(theta.shape[:2] + (-1,)) 88 | g = g.reshape(theta.shape[:2] + (-1,)) 89 | theta_phi = torch.matmul(theta.transpose(1, 2), phi) 90 | if self.nonlocal_type == 'gaussian': 91 | p = self.softmax(theta_phi) 92 | elif self.nonlocal_type == 'dot': 93 | N = theta_phi.size(-1) 94 | p = theta_phi / N 95 | elif self.nonlocal_type == 'concat': 96 | # reshape [BxC'xTxHxW] to [BxC'x(T)HWx1] 97 | theta = theta.reshape(theta.shape[:2] + (-1, 1)) 98 | # reshape [BxC'xTxHxW] to [BxC'x1x(T)HW] 99 | phi = phi.reshape(theta.shape[:2] + (1, -1)) 100 | theta_x = theta.repeat(1, 1, 1, phi.size(3)) 101 | phi_x = phi.repeat(1, 1, theta.size(2), 1) 102 | theta_phi = torch.cat([theta_x, phi_x], dim=1) 103 | theta_phi = self.concat_proj(theta_phi) 104 | theta_phi = theta_phi.squeeze() 105 | N = theta_phi.size(-1) 106 | p = theta_phi / N 107 | else: 108 | raise NotImplementedError 109 | 110 | # BxC'xddd , Bxdxddd => BxC'xd 111 | y = torch.matmul(g, p.transpose(1, 2)) 112 | y = y.reshape(y.shape[:2] + input.shape[2:]) 113 | z = self.W(y) + input 114 | 115 | return z 116 | -------------------------------------------------------------------------------- /MODELZOO.md: -------------------------------------------------------------------------------- 1 | # Model Zoo 2 | 3 | ## Pretrained Models 4 | All pretrained models can be downloaded from [Google Drive](https://drive.google.com/drive/folders/1UnqZ48doF0UTYjH6iZCXQW3HlDocbBxl). After downloading, put them into `ckpt/`. 5 | 6 | ## Main Results 7 | We report results of our method on Kinetics-400 and Something-Something V1 and V2. All numbers, for both the baselines and TPN, are obtained via fully-convolutional testing. 8 | 9 | ### Kinetics-400 10 | Since the number of available Kinetics-400 videos differs slightly over time (which may lead to a small performance drop), we report all results on our own copy of the dataset. Our data contains 240403 training videos and 19769 validation videos, rescaled to 240*320 resolution. Note that the trimmed duration of the [Non-Local](https://github.com/facebookresearch/video-nonlocal-net/blob/master/DATASET.md) data and the resolution of the [MMAction](https://github.com/open-mmlab/mmaction/blob/master/MODEL_ZOO.md) data differ from ours, but the improvements of TPN are consistent. To ensure reproducibility, we will find a proper way to release our validation set. All the following Kinetics-400 results also use flip augmentation at test time (~0.1% fluctuation). We sample F frames with a temporal stride of S frames (denoted FxS).
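As a concrete illustration of the FxS notation (a sketch, assuming that `new_length` and `new_step` in the configs play the roles of F and S, which matches the 8x8, 16x4 and 32x2 settings below):

    # F frames sampled S frames apart, e.g. the 8x8 models use new_length=8, new_step=8.
    F, S, start = 8, 8, 0  # `start` is an arbitrary example offset
    frame_indices = [start + i * S for i in range(F)]  # [0, 8, 16, ..., 56]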
11 | 12 | 13 | | Model | Frames | TPN | Top-1 | Weights | Config | 14 | | :---: | :------: | :--------: | :------: | :------: | :------ | 15 | |R50 | 8 x 8 | - | 74.9 | [link](https://drive.google.com/open?id=1uKHvZsY_heFHTBl6RXo02I7-W_aLBhFI) | config_files/kinetics400/baseline/r50f8s8.py | 16 | |R50 | 8 x 8 | Yes | 76.1 | [link](https://drive.google.com/open?id=1KoISwdKDlfzZdEsLItygcvPGkKNwWyR-) | config_files/kinetics400/tpn/r50f8s8.py | 17 | |R50 | 16 x 4 | - | 76.1 | [link](https://drive.google.com/open?id=1Qgck89mUVs9gyUzalbYJPfJPwQEPbyI9) | config_files/kinetics400/baseline/r50f16s4.py | 18 | |R50 | 16 x 4 | Yes | 77.3 | [link](https://drive.google.com/open?id=1TY39uBR-ckUw3aiabeFLNpR9uPSxt--H) | config_files/kinetics400/tpn/r50f16s4.py | 19 | |R50 | 32 x 2 | - | 75.7 | [link](https://drive.google.com/open?id=1oJ1sTzMeLPXHtnutJAAD8gWfm0b3NYpi) | config_files/kinetics400/baseline/r50f32s2.py | 20 | |R50 | 32 x 2 | Yes | 77.7 | [link](https://drive.google.com/open?id=1TjeqcTJ2tReDz4VnLR8ajSHySre9sZDd) | config_files/kinetics400/tpn/r50f32s2.py | 21 | |R101 | 8 x 8 | - | 76.0 | [link](https://drive.google.com/open?id=1dqLWiI3DFHAPIzGtEY_jfI66nthw2GEX) | config_files/kinetics400/baseline/r101f8s8.py | 22 | |R101 | 8 x 8 | Yes | 77.2 | [link](https://drive.google.com/open?id=1B4Vsld-JzQe4QmXeZHd0TolMPNyZypXI) | config_files/kinetics400/tpn/r101f8s8.py | 23 | |R101 | 16 x 4 | - | 77.0 | [link](https://drive.google.com/open?id=1tj2Y0OChKW7RoElXXmBeU63dph40kEyJ) | config_files/kinetics400/baseline/r101f16s4.py | 24 | |R101 | 16 x 4 | Yes | 78.1 | [link](https://drive.google.com/open?id=1mT4kuaYuAGA-Zjagc56vByMQdvx0bE-H) | config_files/kinetics400/tpn/r101f16s4.py | 25 | |R101 | 32 x 2 | - | 77.4 | [link](https://drive.google.com/open?id=1IAobiYS3PhXC1sA_MCdudGCdHRWcWc9J) | config_files/kinetics400/baseline/r101f32s2.py | 26 | |R101 | 32 x 2 | Yes | 78.9 | [link](https://drive.google.com/open?id=1OPudI7CzJzpdeI0YpwLgZB59VCzcoidp) | config_files/kinetics400/tpn/r101f32s2.py | 27 | 28 | We also train our TPN on [MMAction](https://github.com/open-mmlab/mmaction/blob/master/MODEL_ZOO.md) data; the performance increases thanks to the raw resolution and aspect ratio. 29 | 30 | | Model | Frames | TPN | Top-1 | Weights | Config | 31 | | :---: | :------: | :--------: | :------: | :------: | :------ | 32 | |R50 | 8 x 8 | Yes | 76.7 | [link](https://drive.google.com/open?id=1pCY4oiWK3hs6MwaPZ8QVPMb-qDCV56w5) | config_files/kinetics400/baseline/r50f8s8.py | 33 | |R101 | 8 x 8 | Yes | 78.2 | [link](https://drive.google.com/open?id=1DeVp7cf-dk-x6Um4NouLq5tFniTge0Bd) | config_files/kinetics400/baseline/r101f8s8.py | 34 | 35 | All models are trained on 32 GPUs for 150 epochs. More details can be found in `config_files`. 36 | 37 | ### Something-Something 38 | Something-Something is a more stable benchmark, and the full dataset can be downloaded from the official [website](https://20bn.com/datasets/something-something). We report our results on both V1 and V2. All numbers are obtained by following the standard protocol, i.e., 3 crops * 2 clips. [TSM](https://github.com/mit-han-lab/temporal-shift-module) serves as our backbone network. 39 | Unlike the original TSM [repo](https://github.com/mit-han-lab/temporal-shift-module), which uses Kinetics pretraining, our implementation is initialized from ImageNet pretraining and trained with a longer schedule. We use **the same** training hyper-parameters for both the baseline and TPN, so the improvements come from the TPN design rather than other training tricks.
We take the uniform sampling for training and validation. 40 | 41 | | Model | Dataset Version | Frames | TPN | Top-1 | Weights | Config | 42 | | :---: | :------: | :------: | :--------: | :------: | :------: | :------ | 43 | |TSM50 | V1 | 8 | - | 48.2 | [link](https://drive.google.com/open?id=1x7iwL2Op0qxaUluyQCPOVVEEH53cavhL) | config_files/sthv1/tsm_baseline.py | 44 | |TSM50 | V1 | 8 | Yes | 50.7 | [link](https://drive.google.com/open?id=1NVjsCYgNXKUKAn33XCxV2YEIaWXlEnLS) | config_files/sthv1/tsm_tpn.py | 45 | |TSM50 | V2 | 8 | - | 62.3 | [link](https://drive.google.com/open?id=1fU1b9WySld5knJ8E2bMXfuyRenoViSEX) | config_files/sthv2/tsm_baseline.py | 46 | |TSM50 | V2 | 8 | Yes | 64.7 | [link](https://drive.google.com/open?id=15HHKGIhksTf0dSmgxrTsoHzZxF6n7eRa) | config_files/sthv2/tsm_tpn.py | 47 | 48 | If you have any problem about how to reproduce our results, please contact Ceyuan Yang (yc019@ie.cuhk.edu.hk) or Yinghao Xu (xy119@ie.cuhk.edu.hk). 49 | 50 | -------------------------------------------------------------------------------- /mmaction/datasets/loader/sampler.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import math 4 | import torch 5 | import numpy as np 6 | 7 | from torch.distributed import get_world_size, get_rank 8 | from torch.utils.data import Sampler 9 | from torch.utils.data import DistributedSampler as _DistributedSampler 10 | 11 | 12 | class DistributedSampler(_DistributedSampler): 13 | 14 | def __init__(self, dataset, imgs_per_gpu, num_replicas=None, rank=None, shuffle=True): 15 | super().__init__(dataset, num_replicas=num_replicas, rank=rank) 16 | self.shuffle = shuffle 17 | self.samples_per_gpu = imgs_per_gpu 18 | 19 | self.num_samples = int( 20 | math.ceil(len(dataset) * 1.0 / self.samples_per_gpu / 21 | self.num_replicas)) * self.samples_per_gpu 22 | self.total_size = self.num_samples * self.num_replicas 23 | 24 | def __iter__(self): 25 | # deterministically shuffle based on epoch 26 | if self.shuffle: 27 | g = torch.Generator() 28 | g.manual_seed(self.epoch) 29 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 30 | else: 31 | indices = torch.arange(len(self.dataset)).tolist() 32 | 33 | # add extra samples to make it evenly divisible 34 | indices += indices[:(self.total_size - len(indices))] 35 | assert len(indices) == self.total_size 36 | 37 | # subsample 38 | indices = indices[self.rank:self.total_size:self.num_replicas] 39 | assert len(indices) == self.num_samples 40 | 41 | return iter(indices) 42 | 43 | 44 | class GroupSampler(Sampler): 45 | 46 | def __init__(self, dataset, samples_per_gpu=1): 47 | assert hasattr(dataset, 'flag') 48 | self.dataset = dataset 49 | self.samples_per_gpu = samples_per_gpu 50 | self.flag = dataset.flag.astype(np.int64) 51 | self.group_sizes = np.bincount(self.flag) 52 | self.num_samples = 0 53 | for i, size in enumerate(self.group_sizes): 54 | self.num_samples += int(np.ceil( 55 | size / self.samples_per_gpu)) * self.samples_per_gpu 56 | 57 | def __iter__(self): 58 | indices = [] 59 | for i, size in enumerate(self.group_sizes): 60 | if size == 0: 61 | continue 62 | indice = np.where(self.flag == i)[0] 63 | assert len(indice) == size 64 | np.random.shuffle(indice) 65 | num_extra = int(np.ceil(size / self.samples_per_gpu) 66 | ) * self.samples_per_gpu - len(indice) 67 | indice = np.concatenate([indice, indice[:num_extra]]) 68 | indices.append(indice) 69 | indices = np.concatenate(indices) 70 | indices = [ 71 | indices[i * 
self.samples_per_gpu:(i + 1) * self.samples_per_gpu] 72 | for i in np.random.permutation( 73 | range(len(indices) // self.samples_per_gpu)) 74 | ] 75 | indices = np.concatenate(indices) 76 | indices = torch.from_numpy(indices).long() 77 | assert len(indices) == self.num_samples 78 | return iter(indices) 79 | 80 | def __len__(self): 81 | return self.num_samples 82 | 83 | 84 | class DistributedGroupSampler(Sampler): 85 | """Sampler that restricts data loading to a subset of the dataset. 86 | It is especially useful in conjunction with 87 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 88 | process can pass a DistributedSampler instance as a DataLoader sampler, 89 | and load a subset of the original dataset that is exclusive to it. 90 | .. note:: 91 | Dataset is assumed to be of constant size. 92 | Arguments: 93 | dataset: Dataset used for sampling. 94 | num_replicas (optional): Number of processes participating in 95 | distributed training. 96 | rank (optional): Rank of the current process within num_replicas. 97 | """ 98 | 99 | def __init__(self, 100 | dataset, 101 | samples_per_gpu=1, 102 | num_replicas=None, 103 | rank=None): 104 | if num_replicas is None: 105 | num_replicas = get_world_size() 106 | if rank is None: 107 | rank = get_rank() 108 | self.dataset = dataset 109 | self.samples_per_gpu = samples_per_gpu 110 | self.num_replicas = num_replicas 111 | self.rank = rank 112 | self.epoch = 0 113 | 114 | assert hasattr(self.dataset, 'flag') 115 | self.flag = self.dataset.flag 116 | self.group_sizes = np.bincount(self.flag) 117 | 118 | self.num_samples = 0 119 | for i, j in enumerate(self.group_sizes): 120 | self.num_samples += int( 121 | math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu / 122 | self.num_replicas)) * self.samples_per_gpu 123 | self.total_size = self.num_samples * self.num_replicas 124 | 125 | def __iter__(self): 126 | # deterministically shuffle based on epoch 127 | g = torch.Generator() 128 | g.manual_seed(self.epoch) 129 | 130 | indices = [] 131 | for i, size in enumerate(self.group_sizes): 132 | if size > 0: 133 | indice = np.where(self.flag == i)[0] 134 | assert len(indice) == size 135 | indice = indice[list(torch.randperm(int(size), 136 | generator=g))].tolist() 137 | extra = int( 138 | math.ceil( 139 | size * 1.0 / self.samples_per_gpu / self.num_replicas) 140 | ) * self.samples_per_gpu * self.num_replicas - len(indice) 141 | indice += indice[:extra] 142 | indices += indice 143 | 144 | assert len(indices) == self.total_size 145 | 146 | indices = [ 147 | indices[j] for i in list( 148 | torch.randperm( 149 | len(indices) // self.samples_per_gpu, generator=g)) 150 | for j in range(i * self.samples_per_gpu, (i + 1) * 151 | self.samples_per_gpu) 152 | ] 153 | 154 | # subsample 155 | offset = self.num_samples * self.rank 156 | indices = indices[offset:offset + self.num_samples] 157 | assert len(indices) == self.num_samples 158 | 159 | return iter(indices) 160 | 161 | def __len__(self): 162 | return self.num_samples 163 | 164 | def set_epoch(self, epoch): 165 | self.epoch = epoch 166 | -------------------------------------------------------------------------------- /test_video.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import cv2 4 | import argparse 5 | import functools 6 | import subprocess 7 | import warnings 8 | from scipy.special import softmax 9 | import moviepy.editor as mpy 10 | import numpy as np 11 | import torch 12 | 13 | import mmcv 14 | from mmcv.runner 
import load_checkpoint 15 | from mmcv.parallel import collate, scatter 16 | 17 | from mmaction.models import build_recognizer 18 | from mmaction.datasets.transforms import GroupImageTransform 19 | 20 | 21 | def init_recognizer(config, checkpoint=None, label_file=None, device='cuda:0'): 22 | if isinstance(config, str): 23 | config = mmcv.Config.fromfile(config) 24 | elif not isinstance(config, mmcv.Config): 25 | raise TypeError('config must be a filename or Config object, ' 26 | 'but got {}'.format(type(config))) 27 | config.model.backbone.pretrained = None 28 | config.model.spatial_temporal_module.spatial_size = 8 29 | model = build_recognizer( 30 | config.model, train_cfg=None, test_cfg=config.test_cfg) 31 | if checkpoint is not None: 32 | checkpoint = load_checkpoint(model, checkpoint) 33 | if label_file is not None: 34 | classes = [line.rstrip() for line in open(label_file, 'r').readlines()] 35 | model.CLASSES = classes 36 | else: 37 | if 'CLASSES' in checkpoint['meta']: 38 | model.CLASSES = checkpoint['meta']['CLASSES'] 39 | else: 40 | warnings.warn('Class names are not saved in the checkpoint\'s ' 41 | 'meta data, use something-something-v2 classes by default.') 42 | model.CLASSES = get_classes('something=something-v2') 43 | model.cfg = config # save the config in the model for convenience 44 | model.to(device) 45 | model.eval() 46 | return model 47 | 48 | 49 | def inference_recognizer(model, frames): 50 | cfg = model.cfg 51 | device = next(model.parameters()).device # model device 52 | # build the data pipeline 53 | test_transform = GroupImageTransform( 54 | crop_size=cfg.data.test.input_size, 55 | oversample=None, 56 | resize_crop=False, 57 | **dict(mean=[123.675, 116.28, 103.53], 58 | std=[58.395, 57.12, 57.375], to_rgb=True)) 59 | # prepare data 60 | frames, *l = test_transform( 61 | frames, (cfg.data.test.img_scale, cfg.data.test.img_scale), 62 | crop_history=None, 63 | flip=False, 64 | keep_ratio=False, 65 | div_255=False, 66 | is_flow=False) 67 | data = dict(img_group_0=frames, 68 | num_modalities=1, 69 | img_meta={}) 70 | data = scatter(collate([data], samples_per_gpu=1), [device])[0] 71 | # forward the model 72 | with torch.no_grad(): 73 | result = model(return_loss=False, rescale=True, **data) 74 | return result 75 | 76 | 77 | def extract_frames(video_file, num_frames=8): 78 | try: 79 | os.makedirs(os.path.join(os.getcwd(), 'frames')) 80 | except OSError: 81 | pass 82 | fps = subprocess.check_output(['ffprobe', '-v', 'error', 83 | '-select_streams', 84 | 'v', '-of', 'default=noprint_wrappers=1:nokey=1', 85 | '-show_entries', 86 | ' stream=r_frame_rate', 87 | video_file]).decode('utf-8').strip().split('/')[0] 88 | fps = int(fps) 89 | 90 | output = subprocess.Popen(['ffmpeg', '-i', video_file, 91 | '-loglevel', 'panic', 92 | 'frames/%d.jpg']).communicate() 93 | frame_paths = [os.path.join('frames', frame) 94 | for frame in sorted(os.listdir('frames'), key=lambda x: int(x.split('.')[0]))] 95 | 96 | seg_frames, raw_frames = load_frames(frame_paths) 97 | subprocess.call(['rm', '-rf', 'frames']) 98 | 99 | return seg_frames, raw_frames, fps 100 | 101 | 102 | def load_frames(frame_paths, num_frames=8): 103 | frames = [mmcv.imread(frame) for frame in frame_paths] 104 | if len(frames) >= num_frames: 105 | return frames[::int(np.floor(len(frames) / float(num_frames)))][:num_frames].copy(), frames.copy() 106 | else: 107 | raise ValueError('Video must have at least {} frames'.format(num_frames)) 108 | 109 | 110 | def render_frames(frames, prediction): 111 | rendered_frames = [] 112 | for 
frame in frames: 113 | img = np.array(frame[:, :, ::-1]) 114 | height, width, _ = img.shape 115 | cv2.putText(img=img, text=prediction, org=(1, int(height / 8)), fontFace=cv2.FONT_HERSHEY_TRIPLEX, 116 | fontScale=0.6, color=(255, 255, 255), lineType=cv2.LINE_8, bottomLeftOrigin=False) 117 | rendered_frames.append(img) 118 | return rendered_frames 119 | 120 | 121 | # options 122 | parser = argparse.ArgumentParser(description="test TPN on a single video") 123 | parser.add_argument('config', type=str, default=None, help='model init config') 124 | parser.add_argument('checkpoint', type=str, default=None) 125 | parser.add_argument('--label_file', type=str, default='demo/category.txt') 126 | parser.add_argument('--video_file', type=str, default='demo/demo.mp4') 127 | parser.add_argument('--frame_folder', type=str, default=None) 128 | parser.add_argument('--rendered_output', type=str, default='demo/demo_pred.mp4') 129 | args = parser.parse_args() 130 | 131 | # Obtain video frames 132 | if args.frame_folder is not None: 133 | print('Loading frames in {}'.format(args.frame_folder)) 134 | import glob 135 | 136 | # Here, make sure after sorting the frame paths have the correct temporal order 137 | frame_paths = sorted(glob.glob(os.path.join(args.frame_folder, '*.jpg'))) 138 | seg_frames, raw_frames = load_frames(frame_paths) 139 | fps = 4 140 | else: 141 | print('Extracting frames using ffmpeg...') 142 | seg_frames, raw_frames, fps = extract_frames(args.video_file, 8) 143 | 144 | model = init_recognizer(args.config, checkpoint=args.checkpoint, label_file=args.label_file) 145 | results = inference_recognizer(model, seg_frames) 146 | prob = softmax(results.squeeze()) 147 | idx = np.argsort(-prob) 148 | # Output the prediction. 149 | video_name = args.frame_folder if args.frame_folder is not None else args.video_file 150 | print('RESULT ON ' + video_name) 151 | for i in range(0, 5): 152 | print('{:.3f} -> {}'.format(prob[idx[i]], model.CLASSES[idx[i]])) 153 | 154 | # Render output frames with prediction text. 
155 | if args.rendered_output is not None: 156 | prediction = model.CLASSES[idx[0]] 157 | rendered_frames = render_frames(raw_frames, prediction) 158 | clip = mpy.ImageSequenceClip(rendered_frames, fps=fps) 159 | clip.write_videofile(args.rendered_output) 160 | -------------------------------------------------------------------------------- /mmaction/models/tenons/segmental_consensuses/stpp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ...registry import SEGMENTAL_CONSENSUSES 5 | import numpy as np 6 | 7 | 8 | def parse_stage_config(stage_cfg): 9 | if isinstance(stage_cfg, int): 10 | return (stage_cfg,), stage_cfg 11 | elif isinstance(stage_cfg, tuple) or isinstance(stage_cfg, list): 12 | return stage_cfg, sum(stage_cfg) 13 | else: 14 | raise ValueError("Incorrect STPP config {}".format(stage_cfg)) 15 | 16 | 17 | @SEGMENTAL_CONSENSUSES.register_module 18 | class StructuredTemporalPyramidPooling(nn.Module): 19 | def __init__(self, standalong_classifier=False, stpp_cfg=(1, (1, 2), 1), num_seg=(2, 5, 2)): 20 | super(StructuredTemporalPyramidPooling, self).__init__() 21 | 22 | self.sc = standalong_classifier 23 | 24 | starting_parts, starting_mult = parse_stage_config(stpp_cfg[0]) 25 | course_parts, course_mult = parse_stage_config(stpp_cfg[1]) 26 | ending_parts, ending_mult = parse_stage_config(stpp_cfg[2]) 27 | 28 | self.feat_multiplier = starting_mult + course_mult + ending_mult 29 | self.parts = (starting_parts, course_parts, ending_parts) 30 | self.norm_num = (starting_mult, course_mult, ending_mult) 31 | 32 | self.num_seg = num_seg 33 | 34 | def init_weights(self): 35 | pass 36 | 37 | def forward(self, input, scaling): 38 | x1 = self.num_seg[0] 39 | x2 = x1 + self.num_seg[1] 40 | n_seg = x2 + self.num_seg[2] 41 | 42 | feat_dim = input.size(1) 43 | src = input.view(-1, n_seg, feat_dim) 44 | num_sample = src.size(0) 45 | 46 | scaling = scaling.view(-1, 2) 47 | 48 | def get_stage_stpp(stage_feat, stage_parts, norm_num, scaling): 49 | stage_stpp = [] 50 | stage_len = stage_feat.size(1) 51 | for n_part in stage_parts: 52 | ticks = torch.arange(0, stage_len + 1e-5, stage_len / n_part) 53 | for i in range(n_part): 54 | part_feat = stage_feat[:, int(ticks[i]):int(ticks[i + 1]), :].mean(dim=1) / norm_num 55 | if scaling is not None: 56 | part_feat = part_feat * scaling.view(num_sample, 1) 57 | stage_stpp.append(part_feat) 58 | return stage_stpp 59 | 60 | feature_parts = [] 61 | feature_parts.extend(get_stage_stpp(src[:, :x1, :], self.parts[0], self.norm_num[0], scaling[:, 0])) 62 | feature_parts.extend(get_stage_stpp(src[:, x1:x2, :], self.parts[1], self.norm_num[1], None)) 63 | feature_parts.extend(get_stage_stpp(src[:, x2:, :], self.parts[2], self.norm_num[2], scaling[:, 1])) 64 | stpp_feat = torch.cat(feature_parts, dim=1) 65 | if not self.sc: 66 | return stpp_feat, stpp_feat 67 | else: 68 | course_feat = src[:, x1:x2, :].mean(dim=1) 69 | return course_feat, stpp_feat 70 | 71 | 72 | @SEGMENTAL_CONSENSUSES.register_module 73 | class STPPReorganized(nn.Module): 74 | def __init__(self, feat_dim, act_score_len, 75 | comp_score_len, reg_score_len, 76 | standalong_classifier=False, 77 | with_regression=True, 78 | stpp_cfg=(1, (1, 2), 1)): 79 | super(STPPReorganized, self).__init__() 80 | 81 | self.sc = standalong_classifier 82 | self.feat_dim = feat_dim 83 | self.act_score_len = act_score_len 84 | self.comp_score_len = comp_score_len 85 | self.reg_score_len = reg_score_len 
86 | self.with_regression = with_regression 87 | 88 | starting_parts, starting_mult = parse_stage_config(stpp_cfg[0]) 89 | course_parts, course_mult = parse_stage_config(stpp_cfg[1]) 90 | ending_parts, ending_mult = parse_stage_config(stpp_cfg[2]) 91 | 92 | self.feat_multiplier = starting_mult + course_mult + ending_mult 93 | self.stpp_cfg = (starting_parts, course_parts, ending_parts) 94 | 95 | self.act_slice = slice(0, self.act_score_len if self.sc else (self.act_score_len * self.feat_multiplier)) 96 | self.comp_slice = slice(self.act_slice.stop, self.act_slice.stop + self.comp_score_len * self.feat_multiplier) 97 | self.reg_slice = slice(self.comp_slice.stop, self.comp_slice.stop + self.reg_score_len * self.feat_multiplier) 98 | 99 | def init_weights(self): 100 | pass 101 | 102 | def forward(self, input, proposal_ticks, scaling): 103 | assert input.size(1) == self.feat_dim 104 | n_ticks = proposal_ticks.size(0) 105 | 106 | out_act_scores = torch.zeros((n_ticks, self.act_score_len)).type_as(input) 107 | raw_act_scores = input[:, self.act_slice] 108 | 109 | out_comp_scores = torch.zeros((n_ticks, self.comp_score_len)).type_as(input) 110 | raw_comp_scores = input[:, self.comp_slice] 111 | 112 | if self.with_regression: 113 | out_reg_scores = torch.zeros((n_ticks, self.reg_score_len)).type_as(input) 114 | raw_reg_scores = input[:, self.reg_slice] 115 | else: 116 | out_reg_scores = None 117 | raw_reg_scores = None 118 | 119 | def pspool(out_scores, index, raw_scores, ticks, scaling, score_len, stpp_cfg): 120 | offset = 0 121 | for stage_idx, stage_cfg in enumerate(stpp_cfg): 122 | if stage_idx == 0: 123 | s = scaling[0] 124 | elif stage_idx == len(stpp_cfg) - 1: 125 | s = scaling[1] 126 | else: 127 | s = 1.0 128 | 129 | stage_cnt = sum(stage_cfg) 130 | left = ticks[stage_idx] 131 | right = max(ticks[stage_idx] + 1, ticks[stage_idx + 1]) 132 | 133 | if right <= 0 or left >= raw_scores.size(0): 134 | offset += stage_cnt 135 | continue 136 | for n_part in stage_cfg: 137 | part_ticks = np.arange(left, right + 1e-5, (right - left) / n_part) 138 | for i in range(n_part): 139 | pl = int(part_ticks[i]) 140 | pr = int(part_ticks[i + 1]) 141 | if pr - pl >= 1: 142 | out_scores[index, :] += raw_scores[pl:pr, 143 | offset * score_len: (offset + 1) * score_len].mean(dim=0) * s 144 | offset += 1 145 | 146 | for i in range(n_ticks): 147 | ticks = proposal_ticks[i].cpu().numpy() 148 | if self.sc: 149 | out_act_scores[i, :] = raw_act_scores[ticks[1]: max(ticks[1] + 1, ticks[2]), :].mean(dim=0) 150 | else: 151 | pspool(out_act_scores, i, raw_act_scores, ticks, scaling[i], self.act_score_len, self.stpp_cfg) 152 | 153 | pspool(out_comp_scores, i, raw_comp_scores, ticks, scaling[i], self.comp_score_len, self.stpp_cfg) 154 | 155 | if self.with_regression: 156 | pspool(out_reg_scores, i, raw_reg_scores, ticks, scaling[i], self.reg_score_len, self.stpp_cfg) 157 | 158 | return out_act_scores, out_comp_scores, out_reg_scores 159 | -------------------------------------------------------------------------------- /mmaction/models/recognizers/TSN2D.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from .base import BaseRecognizer 3 | from .. 
import builder 4 | from ..registry import RECOGNIZERS 5 | import torch 6 | import numpy as np 7 | 8 | 9 | @RECOGNIZERS.register_module 10 | class TSN2D(BaseRecognizer): 11 | 12 | def __init__(self, 13 | backbone, 14 | necks=None, 15 | modality='RGB', 16 | in_channels=3, 17 | spatial_temporal_module=None, 18 | segmental_consensus=None, 19 | fcn_testing=False, 20 | flip=False, 21 | cls_head=None, 22 | train_cfg=None, 23 | test_cfg=None): 24 | 25 | super(TSN2D, self).__init__() 26 | self.backbone = builder.build_backbone(backbone) 27 | self.modality = modality 28 | self.in_channels = in_channels 29 | if necks is not None: 30 | self.necks = builder.build_neck(necks) 31 | else: 32 | self.necks = None 33 | 34 | if spatial_temporal_module is not None: 35 | self.spatial_temporal_module = builder.build_spatial_temporal_module( 36 | spatial_temporal_module) 37 | else: 38 | raise NotImplementedError 39 | 40 | if segmental_consensus is not None: 41 | self.segmental_consensus = builder.build_segmental_consensus( 42 | segmental_consensus) 43 | else: 44 | raise NotImplementedError 45 | 46 | if cls_head is not None: 47 | self.cls_head = builder.build_head(cls_head) 48 | else: 49 | raise NotImplementedError 50 | 51 | self.train_cfg = train_cfg 52 | self.test_cfg = test_cfg 53 | self.fcn_testing = fcn_testing 54 | self.flip = flip 55 | assert modality in ['RGB', 'Flow', 'RGBDiff'] 56 | 57 | self.init_weights() 58 | 59 | @property 60 | def with_spatial_temporal_module(self): 61 | return hasattr(self, 'spatial_temporal_module') and self.spatial_temporal_module is not None 62 | 63 | @property 64 | def with_segmental_consensus(self): 65 | return hasattr(self, 'segmental_consensus') and self.segmental_consensus is not None 66 | 67 | @property 68 | def with_cls_head(self): 69 | return hasattr(self, 'cls_head') and self.cls_head is not None 70 | 71 | def init_weights(self): 72 | super(TSN2D, self).init_weights() 73 | self.backbone.init_weights() 74 | 75 | if self.with_spatial_temporal_module: 76 | self.spatial_temporal_module.init_weights() 77 | 78 | if self.with_segmental_consensus: 79 | self.segmental_consensus.init_weights() 80 | 81 | if self.with_cls_head: 82 | self.cls_head.init_weights() 83 | 84 | if self.necks is not None: 85 | self.necks.init_weights() 86 | 87 | def extract_feat(self, img_group): 88 | x = self.backbone(img_group) 89 | return x 90 | 91 | def forward_train(self, 92 | num_modalities, 93 | img_meta, 94 | gt_label, 95 | **kwargs): 96 | assert num_modalities == 1 97 | img_group = kwargs['img_group_0'] 98 | 99 | bs = img_group.shape[0] 100 | img_group = img_group.reshape( 101 | (-1, self.in_channels) + img_group.shape[3:]) 102 | num_seg = img_group.shape[0] // bs 103 | 104 | x = self.extract_feat(img_group) 105 | if self.necks is not None: 106 | x = [each.reshape((-1, num_seg) + each.shape[1:]).transpose(1, 2) for each in x] 107 | x, aux_losses = self.necks(x, gt_label.squeeze()) 108 | x = x.squeeze(2) 109 | num_seg = 1 110 | 111 | if self.with_spatial_temporal_module: 112 | x = self.spatial_temporal_module(x) 113 | x = x.reshape((-1, num_seg) + x.shape[1:]) 114 | if self.with_segmental_consensus: 115 | x = self.segmental_consensus(x) 116 | x = x.squeeze(1) 117 | losses = dict() 118 | if self.with_cls_head: 119 | cls_score = self.cls_head(x) 120 | gt_label = gt_label.squeeze() 121 | loss_cls = self.cls_head.loss(cls_score, gt_label) 122 | losses.update(loss_cls) 123 | if self.necks is not None: 124 | if aux_losses is not None: 125 | losses.update(aux_losses) 126 | return losses 127 | 128 | def 
forward_test(self, 129 | num_modalities, 130 | img_meta, 131 | **kwargs): 132 | if not self.fcn_testing: 133 | # 1crop * 1clip 134 | assert num_modalities == 1 135 | img_group = kwargs['img_group_0'] 136 | 137 | bs = img_group.shape[0] 138 | img_group = img_group.reshape( 139 | (-1, self.in_channels) + img_group.shape[3:]) 140 | num_seg = img_group.shape[0] // bs 141 | 142 | x = self.extract_feat(img_group) 143 | 144 | if self.necks is not None: 145 | x = [each.reshape((-1, num_seg) + each.shape[1:]).transpose(1, 2) for each in x] 146 | x, _ = self.necks(x) 147 | x = x.squeeze(2) 148 | num_seg = 1 149 | 150 | if self.with_spatial_temporal_module: 151 | x = self.spatial_temporal_module(x) 152 | x = x.reshape((-1, num_seg) + x.shape[1:]) 153 | if self.with_segmental_consensus: 154 | x = self.segmental_consensus(x) 155 | x = x.squeeze(1) 156 | if self.with_cls_head: 157 | x = self.cls_head(x) 158 | 159 | return x.cpu().numpy() 160 | else: 161 | # fcn testing 162 | assert num_modalities == 1 163 | img_group = kwargs['img_group_0'] 164 | 165 | bs = img_group.shape[0] 166 | img_group = img_group.reshape( 167 | (-1, self.in_channels) + img_group.shape[3:]) 168 | # standard protocol i.e. 3 crops * 2 clips 169 | num_seg = self.backbone.nsegments * 2 170 | # 3 crops to cover full resolution 171 | num_crops = 3 172 | img_group = img_group.reshape((num_crops, num_seg) + img_group.shape[1:]) 173 | 174 | x1 = img_group[:, ::2, :, :, :] 175 | x2 = img_group[:, 1::2, :, :, :] 176 | img_group = torch.cat([x1, x2], 0) 177 | num_seg = num_seg // 2 178 | num_clips = img_group.shape[0] 179 | img_group = img_group.view(num_clips * num_seg, img_group.shape[2], img_group.shape[3], img_group.shape[4]) 180 | 181 | if self.flip: 182 | img_group = self.extract_feat(torch.flip(img_group, [-1])) 183 | x = self.extract_feat(img_group) 184 | if self.necks is not None: 185 | x = [each.reshape((-1, num_seg) + each.shape[1:]).transpose(1, 2) for each in x] 186 | x, _ = self.necks(x) 187 | else: 188 | x = x.reshape((-1, num_seg) + x.shape[1:]).transpose(1, 2) 189 | x = self.cls_head(x) 190 | 191 | prob = torch.nn.functional.softmax(x.mean([2, 3, 4]), 1).mean(0, keepdim=True).detach().cpu().numpy() 192 | return prob 193 | -------------------------------------------------------------------------------- /mmaction/losses/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def weighted_nll_loss(pred, label, weight, avg_factor=None): 6 | if avg_factor is None: 7 | avg_factor = max(torch.sum(weight > 0).float().item(), 1.) 8 | raw = F.nll_loss(pred, label, reduction='none') 9 | return torch.sum(raw * weight)[None] / avg_factor 10 | 11 | 12 | def weighted_cross_entropy(pred, label, weight, avg_factor=None, reduce=True): 13 | if avg_factor is None: 14 | avg_factor = max(torch.sum(weight > 0).float().item(), 1.) 15 | raw = F.cross_entropy(pred, label, reduction='none') 16 | if reduce: 17 | return torch.sum(raw * weight)[None] / avg_factor 18 | else: 19 | return raw * weight / avg_factor 20 | 21 | 22 | def weighted_binary_cross_entropy(pred, label, weight, avg_factor=None): 23 | if pred.dim() != label.dim(): 24 | label, weight = _expand_binary_labels(label, weight, pred.size(-1)) 25 | if avg_factor is None: 26 | avg_factor = max(torch.sum(weight > 0).float().item(), 1.) 
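    # The default normaliser counts how many entries carry a positive weight and clamps
    # that count to at least 1, so the division below can never be by zero; dividing the
    # summed element-wise BCE by it turns the 'sum' reduction into a weighted mean.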
27 | return F.binary_cross_entropy_with_logits( 28 | pred, label.float(), weight.float(), 29 | reduction='sum')[None] / avg_factor 30 | 31 | 32 | def smooth_l1_loss(pred, target, beta=1.0, reduction='mean'): 33 | assert beta > 0 34 | assert pred.size() == target.size() and target.numel() > 0 35 | diff = torch.abs(pred - target) 36 | loss = torch.where(diff < beta, 0.5 * diff * diff / beta, 37 | diff - 0.5 * beta) 38 | reduction_enum = F._Reduction.get_enum(reduction) 39 | # none: 0, mean: 1, sum: 2 40 | if reduction_enum == 0: 41 | return loss 42 | elif reduction_enum == 1: 43 | return loss.sum() / pred.numel() 44 | elif reduction_enum == 2: 45 | return loss.sum() 46 | 47 | 48 | def weighted_smoothl1(pred, target, weight, beta=1.0, avg_factor=None): 49 | if avg_factor is None: 50 | avg_factor = torch.sum(weight > 0).float().item() / 4 + 1e-6 51 | loss = smooth_l1_loss(pred, target, beta, reduction='none') 52 | return torch.sum(loss * weight)[None] / avg_factor 53 | 54 | 55 | def accuracy(pred, target, topk=1): 56 | if isinstance(topk, int): 57 | topk = (topk,) 58 | return_single = True 59 | else: 60 | return_single = False 61 | 62 | maxk = max(topk) 63 | _, pred_label = pred.topk(maxk, 1, True, True) 64 | pred_label = pred_label.t() 65 | correct = pred_label.eq(target.view(1, -1).expand_as(pred_label)) 66 | 67 | res = [] 68 | for k in topk: 69 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 70 | res.append(correct_k.mul_(100.0 / pred.size(0))) 71 | return res[0] if return_single else res 72 | 73 | 74 | def _expand_binary_labels(labels, label_weights, label_channels): 75 | bin_labels = labels.new_full((labels.size(0), label_channels), 0) 76 | inds = torch.nonzero(labels >= 1).squeeze() 77 | if inds.numel() > 0: 78 | bin_labels[inds, labels[inds] - 1] = 1 79 | bin_label_weights = label_weights.view(-1, 1).expand( 80 | label_weights.size(0), label_channels) 81 | return bin_labels, bin_label_weights 82 | 83 | 84 | def weighted_multilabel_binary_cross_entropy( 85 | pred, label, weight, avg_factor=None): 86 | label, weight = _expand_multilabel_binary_labels( 87 | label, weight, pred.size(-1)) 88 | if avg_factor is None: 89 | avg_factor = max(torch.sum(weight > 0).float().item(), 1.) 
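    # Mirrors the normalisation used in weighted_binary_cross_entropy above: the summed BCE
    # over the binary targets produced by _expand_multilabel_binary_labels is divided by the
    # number of positively weighted entries, clamped to at least 1.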
90 | return F.binary_cross_entropy_with_logits( 91 | pred, label.float(), weight.float(), 92 | reduction='sum')[None] / avg_factor 93 | 94 | 95 | def _expand_multilabel_binary_labels(labels, label_weights, label_channels): 96 | bin_labels = labels.new_full((labels.size(0), label_channels), 0) 97 | inds = torch.nonzero(labels >= 1) 98 | if inds.numel() > 0: 99 | for ind in inds: 100 | # note that labels starts from 1 101 | bin_labels[ind[0], labels[ind[0], ind[1]] - 1] = 1 102 | # bin_labels[ind[0], 0] = 1 103 | bin_label_weights = label_weights 104 | return bin_labels, bin_label_weights 105 | 106 | 107 | def multilabel_accuracy(pred, target, topk=1, thr=0.5): 108 | if topk is None: 109 | topk = () 110 | elif isinstance(topk, int): 111 | topk = (topk,) 112 | 113 | pred = pred.sigmoid() 114 | pred_bin_labels = pred.new_full((pred.size(0),), 0, dtype=torch.long) 115 | pred_vec_labels = pred.new_full(pred.size(), 0, dtype=torch.long) 116 | for i in range(pred.size(0)): 117 | inds = torch.nonzero(pred[i, 1:] > thr).squeeze() + 1 118 | if inds.numel() > 0: 119 | pred_vec_labels[i, inds] = 1 120 | # pred_bin_labels[i] = 1 121 | if pred[i, 0] > thr: 122 | pred_bin_labels[i] = 1 123 | target_bin_labels = target.new_full( 124 | (target.size(0),), 0, dtype=torch.long) 125 | target_vec_labels = target.new_full(target.size(), 0, dtype=torch.long) 126 | for i in range(target.size(0)): 127 | inds = torch.nonzero(target[i, :] >= 1).squeeze() 128 | if inds.numel() > 0: 129 | target_vec_labels[i, target[i, inds]] = 1 130 | target_bin_labels[i] = 1 131 | # overall accuracy 132 | correct = pred_bin_labels.eq(target_bin_labels) 133 | acc = correct.float().sum(0, keepdim=True).mul_(100.0 / correct.size(0)) 134 | 135 | # def overlap(tensor1, tensor2): 136 | # indices = tensor1.new_zeros(tensor1).astype(torch.uint8) 137 | # for elem in tensor2: 138 | # indices = indices | (tensor1 == elem) 139 | # return tensor1[indices] 140 | 141 | # recall@thr 142 | recall_thr, prec_thr = recall_prec(pred_vec_labels, target_vec_labels) 143 | 144 | # recall@k 145 | recalls = [] 146 | precs = [] 147 | for k in topk: 148 | _, pred_label = pred.topk(k, 1, True, True) 149 | pred_vec_labels = pred.new_full(pred.size(), 0, dtype=torch.long) 150 | for i in range(pred.size(0)): 151 | pred_vec_labels[i, pred_label[i]] = 1 152 | recall_k, prec_k = recall_prec(pred_vec_labels, target_vec_labels) 153 | recalls.append(recall_k) 154 | precs.append(prec_k) 155 | 156 | return acc, recall_thr, prec_thr, recalls, precs 157 | 158 | 159 | def recall_prec(pred_vec, target_vec): 160 | """ 161 | Args: 162 | pred_vec: (n, C+1), each element is either 0 or 1 163 | target_vec: (n, C+1), each element is either 0 or 1 164 | 165 | Returns: 166 | recall 167 | prec 168 | """ 169 | recall = pred_vec.new_full((pred_vec.size(0),), 0).float() 170 | prec = pred_vec.new_full((pred_vec.size(0),), 0).float() 171 | num_pos = 0 172 | for i in range(target_vec.size(0)): 173 | if target_vec[i, :].float().sum(0) == 0: 174 | continue 175 | correct_labels = pred_vec[i, :] & target_vec[i, :] 176 | recall[i] = correct_labels.float().sum(0, keepdim=True) / \ 177 | target_vec[i, :].float().sum(0, keepdim=True) 178 | prec[i] = correct_labels.float().sum(0, keepdim=True) / \ 179 | (pred_vec[i, :].float().sum(0, keepdim=True) + 1e-6) 180 | num_pos += 1 181 | recall = recall.float().sum(0, keepdim=True).mul_(100. / num_pos) 182 | prec = prec.float().sum(0, keepdim=True).mul_(100. 
/ num_pos) 183 | return recall, prec 184 | -------------------------------------------------------------------------------- /mmaction/apis/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import re 3 | from collections import OrderedDict 4 | 5 | import torch 6 | from mmcv.runner import Runner, DistSamplerSeedHook, obj_from_dict 7 | from mmcv.parallel import MMDataParallel, MMDistributedDataParallel 8 | 9 | from mmaction.core import (DistOptimizerHook, DistEvalTopKAccuracyHook, 10 | ) 11 | from mmaction.datasets import build_dataloader 12 | from .env import get_root_logger 13 | 14 | 15 | def parse_losses(losses): 16 | log_vars = OrderedDict() 17 | for loss_name, loss_value in losses.items(): 18 | if isinstance(loss_value, torch.Tensor): 19 | log_vars[loss_name] = loss_value.mean() 20 | elif isinstance(loss_value, list): 21 | log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) 22 | else: 23 | raise TypeError( 24 | '{} is not a tensor or list of tensors'.format(loss_name)) 25 | 26 | loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key) 27 | 28 | log_vars['loss'] = loss 29 | for name in log_vars: 30 | log_vars[name] = log_vars[name].item() 31 | 32 | return loss, log_vars 33 | 34 | 35 | def batch_processor(model, data, train_mode): 36 | losses = model(**data) 37 | loss, log_vars = parse_losses(losses) 38 | 39 | outputs = dict( 40 | loss=loss, log_vars=log_vars, 41 | num_samples=len(data['img_group_0'].data)) 42 | 43 | return outputs 44 | 45 | 46 | def train_network(model, 47 | dataset, 48 | cfg, 49 | distributed=False, 50 | validate=False, 51 | logger=None): 52 | if logger is None: 53 | logger = get_root_logger(cfg.log_level) 54 | 55 | # start training 56 | if distributed: 57 | _dist_train(model, dataset, cfg, validate=validate) 58 | else: 59 | _non_dist_train(model, dataset, cfg, validate=validate) 60 | 61 | 62 | def build_optimizer(model, optimizer_cfg): 63 | """Build optimizer from configs. 64 | Args: 65 | model (:obj:`nn.Module`): The model with parameters to be optimized. 66 | optimizer_cfg (dict): The config dict of the optimizer. 67 | Positional fields are: 68 | - type: class name of the optimizer. 69 | - lr: base learning rate. 70 | Optional fields are: 71 | - any arguments of the corresponding optimizer type, e.g., 72 | weight_decay, momentum, etc. 73 | - paramwise_options: a dict with 3 accepted fileds 74 | (bias_lr_mult, bias_decay_mult, norm_decay_mult). 75 | `bias_lr_mult` and `bias_decay_mult` will be multiplied to 76 | the lr and weight decay respectively for all bias parameters 77 | (except for the normalization layers), and 78 | `norm_decay_mult` will be multiplied to the weight decay 79 | for all weight and bias parameters of normalization layers. 80 | Returns: 81 | torch.optim.Optimizer: The initialized optimizer. 
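    Example (illustrative only; any optimizer class from torch.optim works the same way):
        >>> optimizer_cfg = dict(
        ...     type='SGD', lr=0.01, momentum=0.9, weight_decay=1e-4,
        ...     paramwise_options=dict(bias_lr_mult=2., bias_decay_mult=0.))
        >>> optimizer = build_optimizer(model, optimizer_cfg)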
82 | """ 83 | if hasattr(model, 'module'): 84 | model = model.module 85 | 86 | optimizer_cfg = optimizer_cfg.copy() 87 | paramwise_options = optimizer_cfg.pop('paramwise_options', None) 88 | # if no paramwise option is specified, just use the global setting 89 | if paramwise_options is None: 90 | return obj_from_dict(optimizer_cfg, torch.optim, 91 | dict(params=model.parameters())) 92 | else: 93 | assert isinstance(paramwise_options, dict) 94 | # get base lr and weight decay 95 | base_lr = optimizer_cfg['lr'] 96 | base_wd = optimizer_cfg.get('weight_decay', None) 97 | # weight_decay must be explicitly specified if mult is specified 98 | if ('bias_decay_mult' in paramwise_options 99 | or 'norm_decay_mult' in paramwise_options): 100 | assert base_wd is not None 101 | # get param-wise options 102 | bias_lr_mult = paramwise_options.get('bias_lr_mult', 1.) 103 | bias_decay_mult = paramwise_options.get('bias_decay_mult', 1.) 104 | norm_decay_mult = paramwise_options.get('norm_decay_mult', 1.) 105 | # set param-wise lr and weight decay 106 | params = [] 107 | for name, param in model.named_parameters(): 108 | param_group = {'params': [param]} 109 | if not param.requires_grad: 110 | # FP16 training needs to copy gradient/weight between master 111 | # weight copy and model weight, it is convenient to keep all 112 | # parameters here to align with model.parameters() 113 | params.append(param_group) 114 | continue 115 | 116 | # for norm layers, overwrite the weight decay of weight and bias 117 | # TODO: obtain the norm layer prefixes dynamically 118 | if re.search(r'(bn|gn)(\d+)?.(weight|bias)', name): 119 | if base_wd is not None: 120 | param_group['weight_decay'] = base_wd * norm_decay_mult 121 | # for other layers, overwrite both lr and weight decay of bias 122 | elif name.endswith('.bias'): 123 | param_group['lr'] = base_lr * bias_lr_mult 124 | if base_wd is not None: 125 | param_group['weight_decay'] = base_wd * bias_decay_mult 126 | # otherwise use the global settings 127 | 128 | params.append(param_group) 129 | 130 | optimizer_cls = getattr(torch.optim, optimizer_cfg.pop('type')) 131 | return optimizer_cls(params, **optimizer_cfg) 132 | 133 | 134 | def _dist_train(model, dataset, cfg, validate=False): 135 | # prepare data loaders 136 | data_loaders = [ 137 | build_dataloader( 138 | dataset, 139 | cfg.data.videos_per_gpu, 140 | cfg.data.workers_per_gpu, 141 | dist=True) 142 | ] 143 | # put model on gpus 144 | model = MMDistributedDataParallel(model.cuda()) 145 | # build runner 146 | # build runner 147 | optimizer = build_optimizer(model, cfg.optimizer) 148 | 149 | runner = Runner(model, batch_processor, optimizer, cfg.work_dir, 150 | cfg.log_level) 151 | # register hooks 152 | optimizer_config = DistOptimizerHook(**cfg.optimizer_config) 153 | runner.register_training_hooks(cfg.lr_config, optimizer_config, 154 | cfg.checkpoint_config, cfg.log_config) 155 | runner.register_hook(DistSamplerSeedHook()) 156 | # register eval hooks 157 | if validate: 158 | if cfg.data.val.type in ['RawFramesDataset']: 159 | runner.register_hook( 160 | DistEvalTopKAccuracyHook(cfg.data.val, k=(1, 5))) 161 | 162 | if cfg.resume_from: 163 | runner.resume(cfg.resume_from) 164 | elif cfg.load_from: 165 | runner.load_checkpoint(cfg.load_from) 166 | runner.run(data_loaders, cfg.workflow, cfg.total_epochs) 167 | 168 | 169 | def _non_dist_train(model, dataset, cfg, validate=False): 170 | # prepare data loaders 171 | data_loaders = [ 172 | build_dataloader( 173 | dataset, 174 | cfg.data.videos_per_gpu, 175 | 
cfg.data.workers_per_gpu, 176 | cfg.gpus, 177 | dist=False) 178 | ] 179 | # put model on gpus 180 | model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() 181 | # build runner 182 | runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir, 183 | cfg.log_level) 184 | runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config, 185 | cfg.checkpoint_config, cfg.log_config) 186 | 187 | if cfg.resume_from: 188 | runner.resume(cfg.resume_from) 189 | elif cfg.load_from: 190 | runner.load_checkpoint(cfg.load_from) 191 | runner.run(data_loaders, cfg.workflow, cfg.total_epochs) 192 | -------------------------------------------------------------------------------- /demo/category.txt: -------------------------------------------------------------------------------- 1 | Approaching something with your camera 2 | Attaching something to something 3 | Bending something so that it deforms 4 | Bending something until it breaks 5 | Burying something in something 6 | Closing something 7 | Covering something with something 8 | Digging something out of something 9 | Dropping something behind something 10 | Dropping something in front of something 11 | Dropping something into something 12 | Dropping something next to something 13 | Dropping something onto something 14 | Failing to put something into something because something does not fit 15 | Folding something 16 | Hitting something with something 17 | Holding something 18 | Holding something behind something 19 | Holding something in front of something 20 | Holding something next to something 21 | Holding something over something 22 | Laying something on the table on its side, not upright 23 | Letting something roll along a flat surface 24 | Letting something roll down a slanted surface 25 | Letting something roll up a slanted surface, so it rolls back down 26 | Lifting a surface with something on it but not enough for it to slide down 27 | Lifting a surface with something on it until it starts sliding down 28 | Lifting something up completely without letting it drop down 29 | Lifting something up completely, then letting it drop down 30 | Lifting something with something on it 31 | Lifting up one end of something without letting it drop down 32 | Lifting up one end of something, then letting it drop down 33 | Moving away from something with your camera 34 | Moving part of something 35 | Moving something across a surface until it falls down 36 | Moving something across a surface without it falling down 37 | Moving something and something away from each other 38 | Moving something and something closer to each other 39 | Moving something and something so they collide with each other 40 | Moving something and something so they pass each other 41 | Moving something away from something 42 | Moving something away from the camera 43 | Moving something closer to something 44 | Moving something down 45 | Moving something towards the camera 46 | Moving something up 47 | Opening something 48 | Picking something up 49 | Piling something up 50 | Plugging something into something 51 | Plugging something into something but pulling it right out as you remove your hand 52 | Poking a hole into some substance 53 | Poking a hole into something soft 54 | Poking a stack of something so the stack collapses 55 | Poking a stack of something without the stack collapsing 56 | Poking something so it slightly moves 57 | Poking something so lightly that it doesn't or almost doesn't move 58 | Poking something so that it falls over 59 | Poking something so that it 
spins around 60 | Pouring something into something 61 | Pouring something into something until it overflows 62 | Pouring something onto something 63 | Pouring something out of something 64 | Pretending or failing to wipe something off of something 65 | Pretending or trying and failing to twist something 66 | Pretending to be tearing something that is not tearable 67 | Pretending to close something without actually closing it 68 | Pretending to open something without actually opening it 69 | Pretending to pick something up 70 | Pretending to poke something 71 | Pretending to pour something out of something, but something is empty 72 | Pretending to put something behind something 73 | Pretending to put something into something 74 | Pretending to put something next to something 75 | Pretending to put something on a surface 76 | Pretending to put something onto something 77 | Pretending to put something underneath something 78 | Pretending to scoop something up with something 79 | Pretending to spread air onto something 80 | Pretending to sprinkle air onto something 81 | Pretending to squeeze something 82 | Pretending to take something from somewhere 83 | Pretending to take something out of something 84 | Pretending to throw something 85 | Pretending to turn something upside down 86 | Pulling something from behind of something 87 | Pulling something from left to right 88 | Pulling something from right to left 89 | Pulling something onto something 90 | Pulling something out of something 91 | Pulling two ends of something but nothing happens 92 | Pulling two ends of something so that it gets stretched 93 | Pulling two ends of something so that it separates into two pieces 94 | Pushing something from left to right 95 | Pushing something from right to left 96 | Pushing something off of something 97 | Pushing something onto something 98 | Pushing something so it spins 99 | Pushing something so that it almost falls off but doesn't 100 | Pushing something so that it falls off the table 101 | Pushing something so that it slightly moves 102 | Pushing something with something 103 | Putting number of something onto something 104 | Putting something and something on the table 105 | Putting something behind something 106 | Putting something in front of something 107 | Putting something into something 108 | Putting something next to something 109 | Putting something on a flat surface without letting it roll 110 | Putting something on a surface 111 | Putting something on the edge of something so it is not supported and falls down 112 | Putting something onto a slanted surface but it doesn't glide down 113 | Putting something onto something 114 | Putting something onto something else that cannot support it so it falls down 115 | Putting something similar to other things that are already on the table 116 | Putting something that can't roll onto a slanted surface, so it slides down 117 | Putting something that can't roll onto a slanted surface, so it stays where it is 118 | Putting something that cannot actually stand upright upright on the table, so it falls on its side 119 | Putting something underneath something 120 | Putting something upright on the table 121 | Putting something, something and something on the table 122 | Removing something, revealing something behind 123 | Rolling something on a flat surface 124 | Scooping something up with something 125 | Showing a photo of something to the camera 126 | Showing something behind something 127 | Showing something next to something 128 | Showing something on 
top of something 129 | Showing something to the camera 130 | Showing that something is empty 131 | Showing that something is inside something 132 | Something being deflected from something 133 | Something colliding with something and both are being deflected 134 | Something colliding with something and both come to a halt 135 | Something falling like a feather or paper 136 | Something falling like a rock 137 | Spilling something behind something 138 | Spilling something next to something 139 | Spilling something onto something 140 | Spinning something so it continues spinning 141 | Spinning something that quickly stops spinning 142 | Spreading something onto something 143 | Sprinkling something onto something 144 | Squeezing something 145 | Stacking number of something 146 | Stuffing something into something 147 | Taking one of many similar things on the table 148 | Taking something from somewhere 149 | Taking something out of something 150 | Tearing something into two pieces 151 | Tearing something just a little bit 152 | Throwing something 153 | Throwing something against something 154 | Throwing something in the air and catching it 155 | Throwing something in the air and letting it fall 156 | Throwing something onto a surface 157 | Tilting something with something on it slightly so it doesn't fall down 158 | Tilting something with something on it until it falls off 159 | Tipping something over 160 | Tipping something with something in it over, so something in it falls out 161 | Touching (without moving) part of something 162 | Trying but failing to attach something to something because it doesn't stick 163 | Trying to bend something unbendable so nothing happens 164 | Trying to pour something into something, but missing so it spills next to it 165 | Turning something upside down 166 | Turning the camera downwards while filming something 167 | Turning the camera left while filming something 168 | Turning the camera right while filming something 169 | Turning the camera upwards while filming something 170 | Twisting (wringing) something wet until water comes out 171 | Twisting something 172 | Uncovering something 173 | Unfolding something 174 | Wiping something off of something -------------------------------------------------------------------------------- /tools/test_recognizer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import mmcv 5 | import tempfile 6 | import os.path as osp 7 | import torch.distributed as dist 8 | import shutil 9 | from mmcv.runner import load_checkpoint, parallel_test, obj_from_dict, get_dist_info 10 | from mmcv.parallel import scatter, collate, MMDataParallel, MMDistributedDataParallel 11 | from mmaction.apis import init_dist 12 | from mmaction import datasets 13 | from mmaction.datasets import build_dataloader 14 | from mmaction.models import build_recognizer, recognizers 15 | from mmaction.core.evaluation.accuracy import (softmax, top_k_accuracy, non_mean_class_accuracy, 16 | mean_class_accuracy) 17 | 18 | 19 | def single_test(model, data_loader): 20 | model.eval() 21 | results = [] 22 | dataset = data_loader.dataset 23 | prog_bar = mmcv.ProgressBar(len(dataset)) 24 | for i, data in enumerate(data_loader): 25 | with torch.no_grad(): 26 | data['get_logit'] = True 27 | result = model(return_loss=False, **data) 28 | results.append(result) 29 | 30 | batch_size = data['img_group_0'].data[0].size(0) 31 | for _ in range(batch_size): 32 | prog_bar.update() 33 | return results 34 | 35 | 
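# Illustrative single-GPU usage of single_test (a sketch only; it mirrors the
# non-distributed branch of main() further below, where cfg, dataset and args
# have already been built):
#
#     model = build_recognizer(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
#     load_checkpoint(model, args.checkpoint, strict=False, map_location='cpu')
#     model = MMDataParallel(model, device_ids=[0])
#     data_loader = build_dataloader(dataset, imgs_per_gpu=1,
#                                    workers_per_gpu=cfg.data.workers_per_gpu,
#                                    num_gpus=1, dist=False, shuffle=False)
#     outputs = single_test(model, data_loader)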
36 | def _data_func(data, device_id): 37 | data = scatter(collate([data], samples_per_gpu=1), [device_id])[0] 38 | return dict(return_loss=False, rescale=True, **data) 39 | 40 | 41 | def multi_gpu_test(model, data_loader, tmpdir=None): 42 | model.eval() 43 | results = [] 44 | dataset = data_loader.dataset 45 | rank, world_size = get_dist_info() 46 | if rank == 0: 47 | prog_bar = mmcv.ProgressBar(len(dataset)) 48 | for i, data in enumerate(data_loader): 49 | with torch.no_grad(): 50 | # data['get_logit'] = True 51 | result = model(return_loss=False, rescale=True, **data) 52 | results.append(result) 53 | 54 | if rank == 0: 55 | batch_size = data['img_group_0'].data[0].size(0) 56 | for _ in range(batch_size * world_size): 57 | prog_bar.update() 58 | 59 | # collect results from all ranks 60 | results = collect_results(results, len(dataset), tmpdir) 61 | 62 | return results 63 | 64 | 65 | def collect_results(result_part, size, tmpdir=None): 66 | rank, world_size = get_dist_info() 67 | # create a tmp dir if it is not specified 68 | if tmpdir is None: 69 | MAX_LEN = 512 70 | # 32 is whitespace 71 | dir_tensor = torch.full( 72 | (MAX_LEN,), 32, dtype=torch.uint8, device='cuda') 73 | if rank == 0: 74 | tmpdir = tempfile.mkdtemp() 75 | print('temp_dir', tmpdir) 76 | tmpdir = torch.tensor( 77 | bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') 78 | dir_tensor[:len(tmpdir)] = tmpdir 79 | dist.broadcast(dir_tensor, 0) 80 | tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() 81 | else: 82 | mmcv.mkdir_or_exist(tmpdir) 83 | # dump the part result to the dir 84 | mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank))) 85 | dist.barrier() 86 | # collect all parts 87 | if rank != 0: 88 | return None 89 | else: 90 | # load results of all parts from tmp dir 91 | part_list = [] 92 | for i in range(world_size): 93 | part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i)) 94 | part_list.append(mmcv.load(part_file)) 95 | # sort the results 96 | ordered_results = [] 97 | for res in zip(*part_list): 98 | ordered_results.extend(list(res)) 99 | # the dataloader may pad some samples 100 | ordered_results = ordered_results[:size] 101 | # remove tmp dir 102 | shutil.rmtree(tmpdir) 103 | return ordered_results 104 | 105 | 106 | def parse_args(): 107 | parser = argparse.ArgumentParser(description='Test an action recognizer') 108 | parser.add_argument('config', help='test config file path') 109 | parser.add_argument('checkpoint', help='checkpoinls' 110 | 't file') 111 | parser.add_argument( 112 | '--gpus', default=8, type=int, help='GPU number used for testing') 113 | parser.add_argument( 114 | '--proc_per_gpu', 115 | default=1, 116 | type=int, 117 | help='Number of processes per GPU') 118 | parser.add_argument('--out', help='output result file') 119 | parser.add_argument('--log', help='output log file') 120 | parser.add_argument('--fcn_testing', action='store_true', default=False, 121 | help='whether to use fcn testing') 122 | parser.add_argument('--flip', action='store_true', default=False, 123 | help='whether to flip videos') 124 | parser.add_argument('--tmpdir', help='tmp dir for writing some results') 125 | parser.add_argument( 126 | '--launcher', 127 | choices=['none', 'pytorch', 'slurm', 'mpi'], 128 | default='none', 129 | help='job launcher') 130 | parser.add_argument('--local_rank', type=int, default=0) 131 | parser.add_argument( 132 | '--ignore_cache', action='store_true', help='whether to ignore cache') 133 | args = parser.parse_args() 134 | print('args==>>', args) 135 | return 
args 136 | 137 | 138 | def main(): 139 | args = parse_args() 140 | 141 | assert args.out, ('Please specify the output path for results') 142 | 143 | if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): 144 | raise ValueError('The output file must be a pkl file.') 145 | 146 | cfg = mmcv.Config.fromfile(args.config) 147 | # set cudnn_benchmark 148 | if cfg.get('cudnn_benchmark', False): 149 | torch.backends.cudnn.benchmark = True 150 | cfg.data.test.test_mode = True 151 | 152 | if args.launcher == 'none': 153 | distributed = False 154 | else: 155 | distributed = True 156 | init_dist(args.launcher, **cfg.dist_params) 157 | 158 | if cfg.model.get('necks', None) is not None: 159 | cfg.model.necks.aux_head_config = None 160 | 161 | if cfg.data.test.oversample == 'three_crop': 162 | cfg.model.spatial_temporal_module.spatial_size = 8 163 | if args.fcn_testing: 164 | cfg.model['cls_head'].update({'fcn_testing': True}) 165 | cfg.model.update({'fcn_testing': True}) 166 | if args.flip: 167 | cfg.model.update({'flip': True}) 168 | 169 | dataset = obj_from_dict(cfg.data.test, datasets, dict(test_mode=True)) 170 | 171 | if args.ignore_cache and args.out is not None: 172 | if not distributed: 173 | if args.gpus == 1: 174 | model = build_recognizer( 175 | cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) 176 | load_checkpoint(model, args.checkpoint, strict=False, map_location='cpu') 177 | model = MMDataParallel(model, device_ids=[0]) 178 | 179 | data_loader = build_dataloader( 180 | dataset, 181 | imgs_per_gpu=1, 182 | workers_per_gpu=cfg.data.workers_per_gpu, 183 | num_gpus=1, 184 | dist=False, 185 | shuffle=False) 186 | outputs = single_test(model, data_loader) 187 | else: 188 | model_args = cfg.model.copy() 189 | model_args.update(train_cfg=None, test_cfg=cfg.test_cfg) 190 | model_type = getattr(recognizers, model_args.pop('type')) 191 | 192 | outputs = parallel_test( 193 | model_type, 194 | model_args, 195 | args.checkpoint, 196 | dataset, 197 | _data_func, 198 | range(args.gpus), 199 | workers_per_gpu=args.proc_per_gpu) 200 | else: 201 | data_loader = build_dataloader( 202 | dataset, 203 | imgs_per_gpu=1, 204 | workers_per_gpu=cfg.data.workers_per_gpu, 205 | dist=distributed, 206 | shuffle=False) 207 | model = build_recognizer( 208 | cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) 209 | load_checkpoint(model, args.checkpoint, strict=False, map_location='cpu') 210 | model = MMDistributedDataParallel(model.cuda()) 211 | outputs = multi_gpu_test(model, data_loader, args.tmpdir) 212 | else: 213 | try: 214 | if distributed: 215 | rank, _ = get_dist_info() 216 | if rank == 0: 217 | outputs = mmcv.load(args.out) 218 | else: 219 | outputs = mmcv.load(args.out) 220 | except: 221 | raise FileNotFoundError 222 | 223 | rank, _ = get_dist_info() 224 | if args.out: 225 | if rank == 0: 226 | print('writing results to {}'.format(args.out)) 227 | mmcv.dump(outputs, args.out) 228 | gt_labels = [] 229 | for i in range(len(dataset)): 230 | ann = dataset.get_ann_info(i) 231 | gt_labels.append(ann['label']) 232 | 233 | results = [] 234 | for res in outputs: 235 | res_list = [res[i] for i in range(res.shape[0])] 236 | results += res_list 237 | results = results[:len(gt_labels)] 238 | print('results_length', len(results)) 239 | top1, top5 = top_k_accuracy(results, gt_labels, k=(1, 5)) 240 | mean_acc = mean_class_accuracy(results, gt_labels) 241 | non_mean_acc = non_mean_class_accuracy(results, gt_labels) 242 | if args.log: 243 | f = open(args.log, 'w') 244 | f.write(f'Testing ckpt from 
{args.checkpoint}\n') 245 | f.write(f'Testing config from {args.config}\n') 246 | f.write("Mean Class Accuracy = {:.04f}\n".format(mean_acc * 100)) 247 | f.write("Top-1 Accuracy = {:.04f}\n".format(top1 * 100)) 248 | f.write("Top-5 Accuracy = {:.04f}\n".format(top5 * 100)) 249 | f.close() 250 | else: 251 | print("Mean Class Accuracy = {:.02f}".format(mean_acc * 100)) 252 | print("Top-1 Accuracy = {:.02f}".format(top1 * 100)) 253 | print("Top-5 Accuracy = {:.02f}".format(top5 * 100)) 254 | print("Non mean Class Accuracy", non_mean_acc) 255 | print('saving non_mean acc') 256 | 257 | 258 | if __name__ == '__main__': 259 | main() 260 | --------------------------------------------------------------------------------
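# Illustrative invocations of tools/test_recognizer.py (paths are placeholders; the flags are
# the ones defined in parse_args, and --ignore_cache is needed so results are recomputed
# rather than loaded back from an existing --out file):
#
#   # single node, non-distributed testing spread over 8 GPUs via mmcv's parallel_test
#   python tools/test_recognizer.py <config.py> <checkpoint.pth> \
#       --gpus 8 --ignore_cache --out results.pkl --log results.log
#
#   # one possible distributed launch, with fully-convolutional testing and horizontal flip
#   python -m torch.distributed.launch --nproc_per_node=8 tools/test_recognizer.py \
#       <config.py> <checkpoint.pth> --launcher pytorch --fcn_testing --flip \
#       --ignore_cache --out results.pkl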