├── slowfast ├── __init__.py ├── utils │ ├── ava_evaluation │ │ ├── __init__.py │ │ ├── README.md │ │ ├── np_box_mask_list.py │ │ ├── ava_action_list_v2.1_for_activitynet_2018.pbtxt.txt │ │ ├── np_box_ops.py │ │ ├── np_mask_ops.py │ │ ├── np_box_list.py │ │ ├── metrics.py │ │ └── label_map_util.py │ ├── __init__.py │ ├── env.py │ ├── weight_init_helper.py │ ├── multiprocessing.py │ ├── metrics.py │ ├── logging.py │ ├── bn_helper.py │ ├── lr_policy.py │ ├── aia_model_loading.py │ ├── benchmark.py │ ├── parser.py │ ├── setup_moxing_env.py │ ├── c2_model_loading.py │ ├── multigrid.py │ └── distributed.py ├── config │ └── __init__.py ├── visualization │ └── __init__.py ├── csrc │ ├── vision.cpp │ ├── cpu │ │ ├── vision.h │ │ └── ROIAlign_cpu.cpp │ ├── cuda │ │ └── vision.h │ └── ROIAlign.h ├── models │ ├── __init__.py │ ├── losses.py │ ├── build.py │ ├── detection_helper.py │ ├── backbones │ │ ├── __init__.py │ │ ├── x3d.py │ │ ├── regnet.py │ │ └── resnet.py │ ├── operators.py │ ├── optimizer.py │ └── nonlocal_helper.py └── datasets │ ├── __init__.py │ ├── build.py │ ├── video_container.py │ ├── multigrid_helper.py │ ├── loader.py │ ├── ava_helper.py │ └── ssv1.py ├── .gitignore ├── configs ├── Kinetics │ ├── SLOW_8x8_R101_50.yaml │ ├── SLOW_PROG_36x8_R101_50.yaml │ ├── SLOW_PROG_36x8_R50.yaml │ ├── SLOWFAST_PROG_36x8_R50.yaml │ ├── SLOW_8x8_R50.yaml │ ├── SLOW_8x8_R101.yaml │ └── SLOWFAST_8x8_R50.yaml └── Charades │ ├── SLOWFAST_PROG_76x8_R50_K400.yaml │ ├── SLOWFAST_PROG_76x8_R101_K400.yaml │ ├── SLOWFAST_PROG_76x8_R101_K600.yaml │ ├── SLOW_PROG_76x8_R50_K400.yaml │ ├── SLOW_PROG_76x8_R101_K400.yaml │ ├── SLOW_16x8_R50_K400.yaml │ ├── SLOW_16x8_R101_K400.yaml │ ├── SLOWFAST_16x8_R50_K400.yaml │ ├── SLOWFAST_16x8_R101_K400.yaml │ └── SLOWFAST_16x8_R101_K600.yaml ├── requirements.txt ├── README.md ├── MODEL_ZOO.md ├── tools ├── run_net.py ├── eval_ava.py ├── visualize_log.py └── test_net.py └── setup.py /slowfast/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /slowfast/config/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /slowfast/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /slowfast/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .idea/ 3 | __pycache__/ 4 | 5 | *.so 6 | build/ 7 | data/ 8 | scripts/ 9 | logs 10 | slowfast.egg-info/ 11 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/README.md: -------------------------------------------------------------------------------- 1 | The code under this folder is from the official [ActivityNet repo](https://github.com/activitynet/ActivityNet). 2 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOW_8x8_R101_50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOW_8x8_R101.yaml 2 | 3 | RESNET: 4 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 5 | 6 | LOGS: 7 | DIR: logs/Kinetics/SLOW_8x8_R101_50 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | av 2 | filelock 3 | fvcore 4 | moviepy 5 | opencv-python 6 | pandas 7 | psutil 8 | pycocotools 9 | simplejson 10 | scikit-learn 11 | tensorboardX 12 | torch 13 | torchvision 14 | tqdm -------------------------------------------------------------------------------- /slowfast/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | #include "ROIAlign.h" 2 | 3 | 4 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 5 | m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); 6 | m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); 7 | } -------------------------------------------------------------------------------- /slowfast/models/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from .build import MODEL_REGISTRY, build_model # noqa 5 | from .backbones.resnet import ResNet # noqa 6 | from .backbones.regnet import RegNet # noqa 7 | from .backbones.slowfast import SlowFast # noqa 8 | from .backbones.x3d import X3D # noqa -------------------------------------------------------------------------------- /slowfast/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from .ava_dataset import Ava # noqa 5 | from .build import DATASET_REGISTRY, build_dataset # noqa 6 | from .charades import Charades # noqa 7 | from .kinetics import Kinetics # noqa 8 | from .ssv1 import Ssv1 # noqa 9 | from .ssv2 import Ssv2 # noqa -------------------------------------------------------------------------------- /slowfast/utils/env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 | 4 | """Set up Environment.""" 5 | 6 | import slowfast.utils.logging as logging 7 | 8 | _ENV_SETUP_DONE = False 9 | 10 | 11 | def setup_environment(): 12 | global _ENV_SETUP_DONE 13 | if _ENV_SETUP_DONE: 14 | return 15 | _ENV_SETUP_DONE = True 16 | -------------------------------------------------------------------------------- /slowfast/csrc/cpu/vision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | 5 | at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, 6 | const at::Tensor& rois, 7 | const float spatial_scale, 8 | const int pooled_height, 9 | const int pooled_width, 10 | const int sampling_ratio); -------------------------------------------------------------------------------- /configs/Kinetics/SLOW_PROG_36x8_R101_50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOW_8x8_R101_50.yaml 2 | 3 | DATA: 4 | NUM_FRAMES: 36 5 | 6 | PGT: 7 | ENABLE: True 8 | STEP_LEN: [8] 9 | STEPS: 5 10 | OVERLAP: [1] 11 | PG_EVAL: False 12 | TRAIN_TOGETHER: True 13 | 14 | SOLVER: 15 | MAX_EPOCH: 98 16 | WARMUP_EPOCHS: 17.0 17 | WEIGHT_DECAY: 2e-4 18 | 19 | TEST: 20 | BATCH_SIZE: 8 21 | NUM_ENSEMBLE_VIEWS: 2 22 | NUM_SPATIAL_CROPS: 3 23 | 24 | LOGS: 25 | DIR: logs/Kinetics/SLOW_PROG_36x8_R101_50 -------------------------------------------------------------------------------- /configs/Kinetics/SLOW_PROG_36x8_R50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOW_8x8_R50.yaml 2 | 3 | DATA: 4 | NUM_FRAMES: 36 5 | 6 | PGT: 7 | ENABLE: True 8 | STEP_LEN: [8] 9 | STEPS: 5 10 | OVERLAP: [1] 11 | CACHE: last 12 | CACHE_MOMENTUM: 0.25 13 | PG_EVAL: False 14 | TRAIN_TOGETHER: True 15 | 16 | SOLVER: 17 | MAX_EPOCH: 98 18 | WARMUP_EPOCHS: 17.0 19 | WEIGHT_DECAY: 2e-4 20 | 21 | TEST: 22 | BATCH_SIZE: 8 23 | NUM_ENSEMBLE_VIEWS: 2 24 | NUM_SPATIAL_CROPS: 3 25 | 26 | LOGS: 27 | DIR: logs/Kinetics/SLOW_PROG_36x8_R50 -------------------------------------------------------------------------------- /configs/Kinetics/SLOWFAST_PROG_36x8_R50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOWFAST_8x8_R50.yaml 2 | 3 | TRAIN: 4 | EVAL_PERIOD: 1 5 | 6 | DATA: 7 | NUM_FRAMES: 144 8 | 9 | PGT: 10 | ENABLE: True 11 | STEP_LEN: [8, 32] 12 | STEPS: 5 13 | OVERLAP: [1, 4] 14 | TRAIN_TOGETHER: True 15 | PG_EVAL: False 16 | TPOOL_SIZE: [4, 4] 17 | 18 | SOLVER: 19 | MAX_EPOCH: 98 20 | WARMUP_EPOCHS: 17.0 21 | WEIGHT_DECAY: 2e-4 22 | 23 | TEST: 24 | BATCH_SIZE: 8 25 | NUM_ENSEMBLE_VIEWS: 2 26 | NUM_SPATIAL_CROPS: 3 27 | 28 | LOGS: 29 | DIR: logs/Kinetics/SLOWFAST_PROG_36x8_R50 -------------------------------------------------------------------------------- /configs/Charades/SLOWFAST_PROG_76x8_R50_K400.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOWFAST_16x8_R50_K400.yaml 2 | 3 | DATA: 4 | NUM_FRAMES: 304 5 | 6 | PGT: 7 | ENABLE: True 8 | STEP_LEN: [16, 64] 9 | STEPS: 5 10 | OVERLAP: [1, 1] 11 | CACHE: max 12 | CACHE_MOMENTUM: 0.25 13 | PG_EVAL: True 14 | TRAIN_TOGETHER: True 15 | ENSEMBLE_METHOD: max 16 | 17 | SOLVER: 18 | BASE_LR: 0.0125 19 | STEPS: [0, 20] 20 | MAX_EPOCH: 25 21 | WARMUP_EPOCHS: 4.0 22 | 23 | TEST: 24 | BATCH_SIZE: 16 25 | NUM_ENSEMBLE_VIEWS: 2 26 | NUM_SPATIAL_CROPS: 3 27 | 28 | LOGS: 29 | DIR: logs/Charades/SLOWFAST_PROG_76x8_R50_K400/ -------------------------------------------------------------------------------- 
/configs/Charades/SLOWFAST_PROG_76x8_R101_K400.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOWFAST_16x8_R101_K400.yaml 2 | 3 | DATA: 4 | NUM_FRAMES: 304 5 | 6 | PGT: 7 | ENABLE: True 8 | STEP_LEN: [16, 64] 9 | STEPS: 5 10 | OVERLAP: [1, 1] 11 | CACHE: max 12 | CACHE_MOMENTUM: 0.25 13 | PG_EVAL: True 14 | TRAIN_TOGETHER: True 15 | ENSEMBLE_METHOD: max 16 | 17 | SOLVER: 18 | BASE_LR: 0.0125 19 | STEPS: [0, 20] 20 | MAX_EPOCH: 25 21 | WARMUP_EPOCHS: 4.0 22 | 23 | TEST: 24 | BATCH_SIZE: 16 25 | NUM_ENSEMBLE_VIEWS: 2 26 | NUM_SPATIAL_CROPS: 3 27 | 28 | LOGS: 29 | DIR: logs/Charades/SLOWFAST_PROG_76x8_R101_K400/ -------------------------------------------------------------------------------- /configs/Charades/SLOWFAST_PROG_76x8_R101_K600.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOWFAST_16x8_R101_K600.yaml 2 | 3 | DATA: 4 | NUM_FRAMES: 304 5 | 6 | PGT: 7 | ENABLE: True 8 | STEP_LEN: [16, 64] 9 | STEPS: 5 10 | OVERLAP: [1, 1] 11 | CACHE: max 12 | CACHE_MOMENTUM: 0.25 13 | PG_EVAL: True 14 | TRAIN_TOGETHER: True 15 | ENSEMBLE_METHOD: max 16 | 17 | SOLVER: 18 | BASE_LR: 0.0125 19 | STEPS: [0, 20] 20 | MAX_EPOCH: 25 21 | WARMUP_EPOCHS: 4.0 22 | 23 | TEST: 24 | BATCH_SIZE: 16 25 | NUM_ENSEMBLE_VIEWS: 2 26 | NUM_SPATIAL_CROPS: 3 27 | 28 | LOGS: 29 | DIR: logs/Charades/SLOWFAST_PROG_76x8_R101_K600/ -------------------------------------------------------------------------------- /configs/Charades/SLOW_PROG_76x8_R50_K400.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOW_16x8_R50_K400.yaml 2 | 3 | DATA: 4 | NUM_FRAMES: 76 5 | 6 | PGT: 7 | ENABLE: True 8 | STEP_LEN: [16] 9 | STEPS: 5 10 | OVERLAP: [1] 11 | CACHE: max 12 | CACHE_MOMENTUM: 0.25 13 | PG_EVAL: True 14 | TRAIN_TOGETHER: True 15 | ENSEMBLE_METHOD: max 16 | 17 | SOLVER: 18 | BASE_LR: 0.025 19 | STEPS: [0, 20] 20 | MAX_EPOCH: 20 21 | WARMUP_EPOCHS: 1.0 22 | 23 | TEST: 24 | BATCH_SIZE: 16 25 | NUM_ENSEMBLE_VIEWS: 2 26 | NUM_SPATIAL_CROPS: 3 27 | 28 | LOGS: 29 | DIR: logs/Charades/SLOW_PROG_76x8_R50_K400/ 30 | 31 | NUM_GPUS: 4 32 | NUM_SHARDS: 1 33 | RNG_SEED: 0 34 | -------------------------------------------------------------------------------- /slowfast/models/losses.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Loss functions.""" 5 | 6 | import torch.nn as nn 7 | 8 | _LOSSES = { 9 | "cross_entropy": nn.CrossEntropyLoss, 10 | "bce": nn.BCELoss, 11 | "bce_logit": nn.BCEWithLogitsLoss, 12 | } 13 | 14 | 15 | def get_loss_func(loss_name): 16 | """ 17 | Retrieve the loss given the loss name. 18 | Args: 19 | loss_name (str): the name of the loss to use.
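    Returns:
        The loss class itself (not an instance); call it to construct the
        loss module.
    Example (illustrative):
        loss_fun = get_loss_func("bce_logit")(reduction="mean")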
20 | """ 21 | if loss_name not in _LOSSES.keys(): 22 | raise NotImplementedError("Loss {} is not supported".format(loss_name)) 23 | return _LOSSES[loss_name] 24 | -------------------------------------------------------------------------------- /configs/Charades/SLOW_PROG_76x8_R101_K400.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOW_16x8_R101_K400.yaml 2 | 3 | TRAIN: 4 | BATCH_SIZE: 32 5 | 6 | DATA: 7 | NUM_FRAMES: 76 8 | 9 | PGT: 10 | ENABLE: True 11 | STEP_LEN: [16] 12 | STEPS: 5 13 | OVERLAP: [1] 14 | CACHE: max 15 | CACHE_MOMENTUM: 0.25 16 | PG_EVAL: True 17 | TRAIN_TOGETHER: True 18 | ENSEMBLE_METHOD: max 19 | 20 | SOLVER: 21 | BASE_LR: 0.025 22 | STEPS: [0, 20] 23 | MAX_EPOCH: 20 24 | WARMUP_EPOCHS: 1.0 25 | 26 | TEST: 27 | BATCH_SIZE: 32 28 | NUM_ENSEMBLE_VIEWS: 2 29 | NUM_SPATIAL_CROPS: 3 30 | 31 | LOGS: 32 | DIR: logs/Charades/SLOW_PROG_76x8_R101_K400/ 33 | 34 | NUM_GPUS: 4 35 | NUM_SHARDS: 1 36 | RNG_SEED: 0 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PGT 2 | 3 | Code for paper [PGT: A Progressive Method for Training Models on Long Videos](https://arxiv.org/abs/2103.11313). 4 | 5 | ## Install 6 | 7 | 1. Run `pip install -r requirements.txt`. 8 | 2. Run `python setup.py build develop` to compile RoIAlign python wrapper. 9 | 10 | ## Model zoo 11 | 12 | Please refer to [MODEL_ZOO.md](./MODEL_ZOO.md) 13 | 14 | ## Acknowledgement 15 | 16 | This repository is built on [SlowFast](https://github.com/facebookresearch/SlowFast). 17 | 18 | ## Citing PGT 19 | 20 | ``` 21 | @article{pang2021pgt, 22 | title={PGT: A Progressive Method for Training Models on Long Videos}, 23 | author={Pang, Bo and Peng, Gao and Li, Yizhuo and Lu, Cewu}, 24 | journal={arXiv preprint arXiv:2103.11313}, 25 | year={2021} 26 | } 27 | ``` -------------------------------------------------------------------------------- /MODEL_ZOO.md: -------------------------------------------------------------------------------- 1 | # Model zoo 2 | 3 | ## Kinetics 4 | 5 | | Method | Backbone | Pretrain | Config | top-1 | top-5 | Checkpoint | Log | 6 | | --- | --- | --- | --- | --- | --- | --- | --- | 7 | | Slow 36x8 + PGT | R50 | from scratch | Kinetics/SLOW_PROG_36x8_R50.yaml | 75.6 | 92.3 | | 8 | | Slow 36x8 + PGT | R101 | from scratch | Kinetics/SLOW_PROG_36x8_R101_50.yaml | 76.9 | 92.8 | | 9 | | SlowFast 36x8 + PGT | R50 | from scratch | Kinetics/SLOWFAST_PROG_76x8_R50.yaml | 76.6 | 92.5 | | 10 | 11 | ## Charades 12 | 13 | | Method | Backbone | Pretrain | Config | mAP | Checkpoint | Log | 14 | | --- | --- | --- | --- | --- | --- | --- | 15 | | Slow 76x8 + PGT | R50 | K400 | Charades/SLOW_16x8_R50_K400.yaml | 40.2 | | 16 | | SlowFast + PGT 76x8 | R50 | K400 | Charades/SLOWFAST_PROG_76x8_R50_K400.yaml | 43.8 | | | 17 | | Slow + PGT 76x8 | R101 | K400 | Charades/SLOW_PROG_76x8_R101_K400.yaml | 42.7 | | 18 | | SlowFast + PGT 76x8 | R101 | K400 | Charades/SLOWFAST_PROG_76x8_R101_K400.yaml | 44.3 | | | -------------------------------------------------------------------------------- /slowfast/csrc/cuda/vision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, 5 | const at::Tensor& rois, 6 | const float spatial_scale, 7 | const int pooled_height, 8 | const int pooled_width, 9 | const int sampling_ratio); 10 | 11 | 
at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width, 16 | const int batch_size, 17 | const int channels, 18 | const int height, 19 | const int width, 20 | const int sampling_ratio); 21 | -------------------------------------------------------------------------------- /tools/run_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Wrapper to train and test a video classification model.""" 5 | from slowfast.utils.misc import launch_job 6 | from slowfast.utils.parser import load_config, parse_args 7 | 8 | from test_net import test 9 | from train_net import train 10 | 11 | 12 | def main(): 13 | """ 14 | Main function to spawn the train and test process. 15 | """ 16 | args, opts = parse_args() 17 | cfg = load_config(args, opts) 18 | 19 | # Perform training. 20 | if cfg.TRAIN.ENABLE: 21 | launch_job(cfg=cfg, init_method=args.init_method, func=train) 22 | 23 | # Perform multi-clip testing. 24 | if cfg.TEST.ENABLE: 25 | launch_job(cfg=cfg, init_method=args.init_method, func=test) 26 | 27 | 28 | if __name__ == "__main__": 29 | main() 30 | -------------------------------------------------------------------------------- /slowfast/datasets/build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from fvcore.common.registry import Registry 5 | 6 | DATASET_REGISTRY = Registry("DATASET") 7 | DATASET_REGISTRY.__doc__ = """ 8 | Registry for dataset. 9 | 10 | The registered object will be called with `obj(cfg, split)`. 11 | The call should return a `torch.utils.data.Dataset` object. 12 | """ 13 | 14 | 15 | def build_dataset(dataset_name, cfg, split): 16 | """ 17 | Build a dataset, defined by `dataset_name`. 18 | Args: 19 | dataset_name (str): the name of the dataset to be constructed. 20 | cfg (CfgNode): configs. Details can be found in 21 | slowfast/config/defaults.py 22 | split (str): the split of the data loader. Options include `train`, 23 | `val`, and `test`. 24 | Returns: 25 | Dataset: a constructed dataset specified by dataset_name. 26 | """ 27 | # Capitalize the first letter of the dataset_name since the dataset_name 28 | # in configs may be in lowercase but the name of dataset class should always 29 | # start with an uppercase letter. 30 | name = dataset_name.capitalize() 31 | return DATASET_REGISTRY.get(name)(cfg, split) 32 | -------------------------------------------------------------------------------- /slowfast/datasets/video_container.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | import av 5 | from slowfast.utils.setup_moxing_env import wrap_input_path2 6 | 7 | 8 | def get_video_container(path_to_vid, multi_thread_decode=False, backend="pyav"): 9 | """ 10 | Given the path to the video, return the pyav video container. 11 | Args: 12 | path_to_vid (str): path to the video.
13 | multi_thread_decode (bool): if True, perform multi-thread decoding. 14 | backend (str): decoder backend, options include `pyav` and 15 | `torchvision`, default is `pyav`. 16 | Returns: 17 | container (container): video container. 18 | """ 19 | path_to_vid = wrap_input_path2(path_to_vid) 20 | if backend == "torchvision": 21 | with open(path_to_vid, "rb") as fp: 22 | container = fp.read() 23 | return container 24 | elif backend == "pyav": 25 | container = av.open(path_to_vid) 26 | if multi_thread_decode: 27 | # Enable multiple threads for decoding. 28 | container.streams.video[0].thread_type = "AUTO" 29 | return container 30 | else: 31 | raise NotImplementedError("Unknown backend {}".format(backend)) 32 | -------------------------------------------------------------------------------- /tools/eval_ava.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | 5 | import slowfast.utils.logging as logging 6 | from slowfast.utils.parser import load_config 7 | from slowfast.utils.ava_eval_helper import evaluate_ava_from_files 8 | 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser(description="AVA evaluator.") 12 | parser.add_argument( 13 | "--cfg", 14 | dest="cfg_file", 15 | help="Path to the config file", 16 | default="configs/Kinetics/SLOWFAST_4x16_R50.yaml", 17 | type=str, 18 | ) 19 | parser.add_argument( 20 | "opts", 21 | help="See slowfast/config/defaults.py for all options", 22 | default=None, 23 | nargs=argparse.REMAINDER, 24 | ) 25 | if len(sys.argv) == 1: 26 | parser.print_help() 27 | return parser.parse_args() 28 | 29 | 30 | def main(): 31 | args = parse_args() 32 | cfg = load_config(args) 33 | 34 | logging.setup_logger(cfg, 'test') 35 | evaluate_ava_from_files( 36 | os.path.join(cfg.AVA.ANNOTATION_DIR, cfg.AVA.LABEL_MAP_FILE), 37 | os.path.join(cfg.AVA.ANNOTATION_DIR, cfg.AVA.GROUNDTRUTH_FILE), 38 | os.path.join(cfg.LOGS.DIR, "detections_latest.csv"), 39 | os.path.join(cfg.AVA.ANNOTATION_DIR, cfg.AVA.EXCLUSION_FILE) 40 | ) 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOW_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 1 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | 9 | DATA: 10 | NUM_FRAMES: 8 11 | SAMPLING_RATE: 8 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 256 15 | INPUT_CHANNEL_NUM: [3] 16 | PATH_TO_DATA_DIR: /home/pg/data/Kinetics_400/ 17 | PATH_PREFIX: /home/pg/data/Kinetics_400/ 18 | 19 | RESNET: 20 | ZERO_INIT_FINAL_BN: True 21 | WIDTH_PER_GROUP: 64 22 | NUM_GROUPS: 1 23 | DEPTH: 50 24 | TRANS_FUNC: bottleneck_transform 25 | STRIDE_1X1: False 26 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 27 | 28 | NONLOCAL: 29 | LOCATION: [[[]], [[]], [[]], [[]]] 30 | GROUP: [[1], [1], [1], [1]] 31 | INSTANTIATION: dot_product 32 | 33 | BN: 34 | USE_PRECISE_STATS: False 35 | NUM_BATCHES_PRECISE: 200 36 | 37 | SOLVER: 38 | BASE_LR: 0.1 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 196 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 1e-4 43 | WARMUP_EPOCHS: 34.0 44 | WARMUP_START_LR: 0.01 45 | OPTIMIZING_METHOD: sgd 46 | 47 | MODEL: 48 | NUM_CLASSES: 400 49 | ARCH: slow 50 | MODEL_NAME: ResNet 51 | LOSS_FUNC: cross_entropy 52 | DROPOUT_RATE: 0.5 53 | 54 | TEST: 55 | ENABLE: True 56 | DATASET: kinetics 57 | BATCH_SIZE: 64 58 | 59 
| DATA_LOADER: 60 | NUM_WORKERS: 8 61 | PIN_MEMORY: True 62 | 63 | LOGS: 64 | DIR: logs/Kinetics/SLOW_8x8_R50/ 65 | 66 | NUM_GPUS: 8 67 | NUM_SHARDS: 1 68 | RNG_SEED: 0 69 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOW_8x8_R101.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | 9 | DATA: 10 | NUM_FRAMES: 8 11 | SAMPLING_RATE: 8 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 256 15 | INPUT_CHANNEL_NUM: [3] 16 | PATH_TO_DATA_DIR: /home/pg/data/Kinetics_400/ 17 | PATH_PREFIX: /home/pg/data/Kinetics_400/ 18 | 19 | RESNET: 20 | ZERO_INIT_FINAL_BN: True 21 | WIDTH_PER_GROUP: 64 22 | NUM_GROUPS: 1 23 | DEPTH: 101 24 | TRANS_FUNC: bottleneck_transform 25 | STRIDE_1X1: False 26 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [23], [3]] 27 | 28 | NONLOCAL: 29 | LOCATION: [[[]], [[]], [[]], [[]]] 30 | GROUP: [[1], [1], [1], [1]] 31 | INSTANTIATION: dot_product 32 | 33 | BN: 34 | USE_PRECISE_STATS: False 35 | NUM_BATCHES_PRECISE: 200 36 | 37 | SOLVER: 38 | BASE_LR: 0.1 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 196 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 1e-4 43 | WARMUP_EPOCHS: 34.0 44 | WARMUP_START_LR: 0.01 45 | OPTIMIZING_METHOD: sgd 46 | 47 | MODEL: 48 | NUM_CLASSES: 400 49 | ARCH: slow 50 | MODEL_NAME: ResNet 51 | LOSS_FUNC: cross_entropy 52 | DROPOUT_RATE: 0.5 53 | 54 | TEST: 55 | ENABLE: True 56 | DATASET: kinetics 57 | BATCH_SIZE: 64 58 | 59 | DATA_LOADER: 60 | NUM_WORKERS: 8 61 | PIN_MEMORY: True 62 | 63 | LOGS: 64 | DIR: logs/Kinetics/SLOW_8x8_R101/ 65 | 66 | NUM_GPUS: 8 67 | NUM_SHARDS: 1 68 | RNG_SEED: 0 69 | -------------------------------------------------------------------------------- /slowfast/models/build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Model construction functions.""" 5 | 6 | import torch 7 | from fvcore.common.registry import Registry 8 | 9 | MODEL_REGISTRY = Registry("MODEL") 10 | MODEL_REGISTRY.__doc__ = """ 11 | Registry for video model. 12 | 13 | The registered object will be called with `obj(cfg)`. 14 | The call should return a `torch.nn.Module` object. 15 | """ 16 | 17 | 18 | def build_model(cfg): 19 | """ 20 | Builds the video model. 21 | Args: 22 | cfg (configs): configs that contains the hyper-parameters to build the 23 | backbone. Details can be seen in slowfast/config/defaults.py. 
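        Example (illustrative; the model name must be registered in
        MODEL_REGISTRY, as ResNet and SlowFast are via
        slowfast/models/__init__.py):
            cfg.MODEL.MODEL_NAME = "ResNet"
            model = build_model(cfg)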
24 | """ 25 | assert ( 26 | cfg.NUM_GPUS <= torch.cuda.device_count() 27 | ), "Cannot use more GPU devices than available" 28 | 29 | # Construct the model 30 | name = cfg.MODEL.MODEL_NAME 31 | model = MODEL_REGISTRY.get(name)(cfg) 32 | # Determine the GPU used by the current process 33 | cur_device = torch.cuda.current_device() 34 | # Transfer the model to the current GPU device 35 | model = model.cuda(device=cur_device) 36 | # Use multi-process data parallel model in the multi-gpu setting 37 | if cfg.NUM_GPUS > 1: 38 | # Make model replica operate on the current device 39 | model = torch.nn.parallel.DistributedDataParallel( 40 | module=model, device_ids=[cur_device], output_device=cur_device 41 | ) 42 | return model 43 | -------------------------------------------------------------------------------- /slowfast/utils/weight_init_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Utility function for weight initialization""" 5 | 6 | import torch.nn as nn 7 | from fvcore.nn.weight_init import c2_msra_fill 8 | 9 | 10 | def init_weights(model, fc_init_std=0.01, zero_init_final_bn=True): 11 | """ 12 | Performs ResNet style weight initialization. 13 | Args: 14 | fc_init_std (float): the expected standard deviation for fc layer. 15 | zero_init_final_bn (bool): if True, zero initialize the final bn for 16 | every bottleneck. 17 | """ 18 | for m in model.modules(): 19 | if isinstance(m, nn.Conv3d): 20 | """ 21 | Follow the initialization method proposed in: 22 | {He, Kaiming, et al. 23 | "Delving deep into rectifiers: Surpassing human-level 24 | performance on imagenet classification." 25 | arXiv preprint arXiv:1502.01852 (2015)} 26 | """ 27 | c2_msra_fill(m) 28 | elif isinstance(m, nn.BatchNorm3d): 29 | if ( 30 | hasattr(m, "transform_final_bn") 31 | and m.transform_final_bn 32 | and zero_init_final_bn 33 | ): 34 | batchnorm_weight = 0.0 35 | else: 36 | batchnorm_weight = 1.0 37 | if m.weight is not None: 38 | m.weight.data.fill_(batchnorm_weight) 39 | if m.bias is not None: 40 | m.bias.data.zero_() 41 | if isinstance(m, nn.Linear): 42 | m.weight.data.normal_(mean=0.0, std=fc_init_std) 43 | m.bias.data.zero_() 44 | -------------------------------------------------------------------------------- /slowfast/csrc/ROIAlign.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | // Interface for Python 10 | at::Tensor ROIAlign_forward(const at::Tensor& input, 11 | const at::Tensor& rois, 12 | const float spatial_scale, 13 | const int pooled_height, 14 | const int pooled_width, 15 | const int sampling_ratio) { 16 | if (input.type().is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 24 | } 25 | 26 | at::Tensor ROIAlign_backward(const at::Tensor& grad, 27 | const at::Tensor& rois, 28 | const float spatial_scale, 29 | const int pooled_height, 30 | const int pooled_width, 31 | const int batch_size, 32 | const int channels, 33 | const int height, 34 | const int width, 35 | const int sampling_ratio) { 36 | if (grad.type().is_cuda()) { 37 | 
#ifdef WITH_CUDA 38 | return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOWFAST_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | 9 | DATA: 10 | NUM_FRAMES: 32 11 | SAMPLING_RATE: 2 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 256 15 | INPUT_CHANNEL_NUM: [3, 3] 16 | PATH_TO_DATA_DIR: /home/pg/data/Kinetics_400/ 17 | PATH_PREFIX: /home/pg/data/Kinetics_400/ 18 | 19 | SLOWFAST: 20 | ALPHA: 4 21 | BETA_INV: 8 22 | FUSION_CONV_CHANNEL_RATIO: 2 23 | FUSION_KERNEL_SZ: 5 24 | 25 | RESNET: 26 | ZERO_INIT_FINAL_BN: True 27 | WIDTH_PER_GROUP: 64 28 | NUM_GROUPS: 1 29 | DEPTH: 50 30 | TRANS_FUNC: bottleneck_transform 31 | STRIDE_1X1: False 32 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 33 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 34 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 35 | 36 | NONLOCAL: 37 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 38 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 39 | INSTANTIATION: dot_product 40 | 41 | BN: 42 | USE_PRECISE_STATS: False 43 | NUM_BATCHES_PRECISE: 200 44 | 45 | SOLVER: 46 | BASE_LR: 0.1 47 | LR_POLICY: cosine 48 | MAX_EPOCH: 196 49 | MOMENTUM: 0.9 50 | WEIGHT_DECAY: 1e-4 51 | WARMUP_EPOCHS: 34.0 52 | WARMUP_START_LR: 0.01 53 | OPTIMIZING_METHOD: sgd 54 | 55 | MODEL: 56 | NUM_CLASSES: 400 57 | ARCH: slowfast 58 | MODEL_NAME: SlowFast 59 | LOSS_FUNC: cross_entropy 60 | DROPOUT_RATE: 0.5 61 | 62 | TEST: 63 | ENABLE: True 64 | DATASET: kinetics 65 | BATCH_SIZE: 64 66 | 67 | DATA_LOADER: 68 | NUM_WORKERS: 8 69 | PIN_MEMORY: True 70 | 71 | LOGS: 72 | DIR: logs/Kinetics/SLOWFAST_8x8_R50/ 73 | 74 | NUM_GPUS: 8 75 | NUM_SHARDS: 1 76 | RNG_SEED: 0 77 | 78 | -------------------------------------------------------------------------------- /configs/Charades/SLOW_16x8_R50_K400.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: charades 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 1 6 | FULL_TIME_EVAL: True 7 | CHECKPOINT_PERIOD: 1 8 | AUTO_RESUME: True 9 | CHECKPOINT_TYPE: caffe2 10 | CHECKPOINT_FILE_PATH: /home/pg/data/models/k400/SLOW_8x8_R50.pkl 11 | 12 | DATA: 13 | NUM_FRAMES: 16 14 | SAMPLING_RATE: 8 15 | TRAIN_JITTER_SCALES: [256, 320] 16 | TRAIN_CROP_SIZE: 224 17 | TEST_CROP_SIZE: 256 18 | INPUT_CHANNEL_NUM: [3] 19 | MULTI_LABEL: True 20 | INV_UNIFORM_SAMPLE: True 21 | ENSEMBLE_METHOD: max 22 | REVERSE_INPUT_CHANNEL: True 23 | PATH_TO_DATA_DIR: /home/pg/data/Charades/ 24 | PATH_PREFIX: /home/pg/data/Charades/Charades_v1_rgb 25 | 26 | RESNET: 27 | ZERO_INIT_FINAL_BN: True 28 | WIDTH_PER_GROUP: 64 29 | NUM_GROUPS: 1 30 | DEPTH: 50 31 | TRANS_FUNC: bottleneck_transform 32 | STRIDE_1X1: False 33 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 34 | 35 | NONLOCAL: 36 | LOCATION: [[[]], [[]], [[]], [[]]] 37 | 38 | BN: 39 | USE_PRECISE_STATS: False 40 | WEIGHT_DECAY: 0.0 41 | NORM_TYPE: frozen_batchnorm 42 | 43 | SOLVER: 44 | BASE_LR: 0.075 45 | LR_POLICY: steps_with_relative_lrs 46 | LRS: [1, 0.1, 0.01] 47 | STEPS: [0, 20, 30] 48 
| MAX_EPOCH: 35 49 | MOMENTUM: 0.9 50 | WEIGHT_DECAY: 1e-4 51 | WARMUP_EPOCHS: 4.0 52 | WARMUP_START_LR: 0.0002 53 | OPTIMIZING_METHOD: sgd 54 | 55 | MODEL: 56 | NUM_CLASSES: 157 57 | ARCH: slow 58 | MODEL_NAME: ResNet 59 | LOSS_FUNC: bce_logit 60 | HEAD_ACT: sigmoid 61 | DROPOUT_RATE: 0.5 62 | FINAL_POOL: ["avg", "max"] 63 | 64 | TEST: 65 | ENABLE: True 66 | DATASET: charades 67 | BATCH_SIZE: 32 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | 71 | DATA_LOADER: 72 | NUM_WORKERS: 4 73 | PIN_MEMORY: True 74 | 75 | LOGS: 76 | DIR: logs/Charades/SLOW_16x8_R50_K400/ 77 | 78 | NUM_GPUS: 2 79 | NUM_SHARDS: 1 80 | RNG_SEED: 0 81 | -------------------------------------------------------------------------------- /configs/Charades/SLOW_16x8_R101_K400.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: charades 4 | BATCH_SIZE: 16 5 | EVAL_PERIOD: 1 6 | FULL_TIME_EVAL: True 7 | CHECKPOINT_PERIOD: 1 8 | AUTO_RESUME: True 9 | CHECKPOINT_TYPE: pytorch 10 | CHECKPOINT_FILE_PATH: /home/pg/data/models/k400/SLOWFAST_8x8_R101.pyth 11 | 12 | DATA: 13 | NUM_FRAMES: 16 14 | SAMPLING_RATE: 8 15 | TRAIN_JITTER_SCALES: [256, 320] 16 | TRAIN_CROP_SIZE: 224 17 | TEST_CROP_SIZE: 256 18 | INPUT_CHANNEL_NUM: [3] 19 | MULTI_LABEL: True 20 | INV_UNIFORM_SAMPLE: True 21 | ENSEMBLE_METHOD: max 22 | REVERSE_INPUT_CHANNEL: True 23 | PATH_TO_DATA_DIR: /home/pg/data/Charades/ 24 | PATH_PREFIX: /home/pg/data/Charades/Charades_v1_rgb 25 | 26 | RESNET: 27 | ZERO_INIT_FINAL_BN: True 28 | WIDTH_PER_GROUP: 64 29 | NUM_GROUPS: 1 30 | DEPTH: 101 31 | TRANS_FUNC: bottleneck_transform 32 | STRIDE_1X1: False 33 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [23], [3]] 34 | 35 | NONLOCAL: 36 | LOCATION: [[[]], [[]], [[]], [[]]] 37 | 38 | BN: 39 | USE_PRECISE_STATS: False 40 | WEIGHT_DECAY: 0.0 41 | NORM_TYPE: frozen_batchnorm 42 | 43 | SOLVER: 44 | BASE_LR: 0.0375 45 | LR_POLICY: steps_with_relative_lrs 46 | LRS: [1, 0.1, 0.01] 47 | STEPS: [0, 20, 30] 48 | MAX_EPOCH: 35 49 | MOMENTUM: 0.9 50 | WEIGHT_DECAY: 1e-4 51 | WARMUP_EPOCHS: 4.0 52 | WARMUP_START_LR: 0.0001 53 | OPTIMIZING_METHOD: sgd 54 | 55 | MODEL: 56 | NUM_CLASSES: 157 57 | ARCH: slow 58 | MODEL_NAME: ResNet 59 | LOSS_FUNC: bce_logit 60 | HEAD_ACT: sigmoid 61 | DROPOUT_RATE: 0.5 62 | FINAL_POOL: ["avg", "max"] 63 | 64 | TEST: 65 | ENABLE: True 66 | DATASET: charades 67 | BATCH_SIZE: 16 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | 71 | DATA_LOADER: 72 | NUM_WORKERS: 4 73 | PIN_MEMORY: True 74 | 75 | LOGS: 76 | DIR: logs/Charades/SLOW_16x8_R101_K400/ 77 | 78 | NUM_GPUS: 2 79 | NUM_SHARDS: 1 80 | RNG_SEED: 0 81 | -------------------------------------------------------------------------------- /tools/visualize_log.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from tensorboardX import SummaryWriter 4 | 5 | from slowfast.utils.parser import load_config, parse_args 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--log_dir", type=str, required=True) 11 | return parser.parse_args() 12 | 13 | 14 | def main(): 15 | args = parse_args() 16 | 17 | filename = os.path.join(args.log_dir, 'train.log') 18 | tblogger = SummaryWriter(log_dir=args.log_dir) 19 | 20 | with open(filename) as f: 21 | log = f.readlines() 22 | 23 | for l in log: 24 | if 'train_iter' in l: 25 | l = l.split(']: ')[1].strip() 26 | cur_epoch = int(l.split('epoch: ')[1].split('/')[0]) 27 | cur_iter = int(l.split('iter: 
')[1].split('/')[0]) 28 | epoch_iters = int(l.split('iter: ')[1].split('/')[1].split(';')[0]) 29 | iters = cur_iter + 1 + epoch_iters * cur_epoch 30 | for kv in l.split('; '): 31 | k, v = kv.split(': ') 32 | if 'err' in k or 'loss' in k: 33 | tblogger.add_scalar('train/{}'.format(k), float(v), iters) 34 | elif k == 'epoch': 35 | tblogger.add_scalar('other/epoch', cur_epoch + 1, iters) 36 | elif k == 'lr': 37 | tblogger.add_scalar('other/lr', float(v), iters) 38 | else: 39 | continue 40 | elif 'val_epoch' in l: 41 | l = l.split(']: ')[1].strip() 42 | cur_epoch = int(l.split('epoch: ')[1].split('/')[0]) 43 | for kv in l.split('; '): 44 | k, v = kv.split(': ') 45 | if 'err' in k or 'map' in k: 46 | tblogger.add_scalar( 47 | 'val/{}'.format(k), float(v), cur_epoch + 1) 48 | else: 49 | continue 50 | 51 | 52 | if __name__ == '__main__': 53 | main() 54 | -------------------------------------------------------------------------------- /slowfast/utils/multiprocessing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Multiprocessing helpers.""" 5 | 6 | import torch 7 | 8 | 9 | def run( 10 | local_rank, num_proc, func, init_method, shard_id, num_shards, backend, cfg 11 | ): 12 | """ 13 | Runs a function from a child process. 14 | Args: 15 | local_rank (int): rank of the current process on the current machine. 16 | num_proc (int): number of processes per machine. 17 | func (function): function to execute on each of the processes. 18 | init_method (string): method to initialize the distributed training. 19 | TCP initialization: requiring a network address reachable from all 20 | processes followed by the port. 21 | Shared file-system initialization: makes use of a file system that 22 | is shared and visible from all machines. The URL should start with 23 | file:// and contain a path to a non-existent file on a shared file 24 | system. 25 | shard_id (int): the rank of the current machine. 26 | num_shards (int): number of overall machines for the distributed 27 | training job. 28 | backend (string): three distributed backends ('nccl', 'gloo', 'mpi') are 29 | supported, each with different capabilities. Details can be found 30 | here: 31 | https://pytorch.org/docs/stable/distributed.html 32 | cfg (CfgNode): configs. Details can be found in 33 | slowfast/config/defaults.py 34 | """ 35 | # Initialize the process group.
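    # The global world size is num_proc (processes per machine) times
    # num_shards (machines); the global rank interleaves the machine index
    # (shard_id) with this process's local rank, matching the lines below.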
36 | world_size = num_proc * num_shards 37 | rank = shard_id * num_proc + local_rank 38 | 39 | try: 40 | torch.distributed.init_process_group( 41 | backend=backend, 42 | init_method=init_method, 43 | world_size=world_size, 44 | rank=rank, 45 | ) 46 | except Exception as e: 47 | raise e 48 | 49 | torch.cuda.set_device(local_rank) 50 | func(cfg) 51 | -------------------------------------------------------------------------------- /configs/Charades/SLOWFAST_16x8_R50_K400.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: charades 4 | BATCH_SIZE: 16 5 | EVAL_PERIOD: 1 6 | FULL_TIME_EVAL: True 7 | CHECKPOINT_PERIOD: 1 8 | AUTO_RESUME: True 9 | CHECKPOINT_TYPE: caffe2 10 | CHECKPOINT_FILE_PATH: /home/pg/data/models/k400/SLOWFAST_8x8_R50.pkl 11 | 12 | DATA: 13 | NUM_FRAMES: 64 14 | SAMPLING_RATE: 2 15 | TRAIN_JITTER_SCALES: [256, 320] 16 | TRAIN_CROP_SIZE: 224 17 | TEST_CROP_SIZE: 256 18 | INPUT_CHANNEL_NUM: [3, 3] 19 | MULTI_LABEL: True 20 | INV_UNIFORM_SAMPLE: True 21 | ENSEMBLE_METHOD: max 22 | REVERSE_INPUT_CHANNEL: True 23 | PATH_TO_DATA_DIR: /home/pg/data/Charades/ 24 | PATH_PREFIX: /home/pg/data/Charades/Charades_v1_rgb 25 | 26 | SLOWFAST: 27 | ALPHA: 4 28 | BETA_INV: 8 29 | FUSION_CONV_CHANNEL_RATIO: 2 30 | FUSION_KERNEL_SZ: 7 31 | 32 | RESNET: 33 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 34 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 35 | ZERO_INIT_FINAL_BN: True 36 | WIDTH_PER_GROUP: 64 37 | NUM_GROUPS: 1 38 | DEPTH: 50 39 | TRANS_FUNC: bottleneck_transform 40 | STRIDE_1X1: False 41 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 42 | 43 | NONLOCAL: 44 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 45 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 46 | 47 | BN: 48 | USE_PRECISE_STATS: False 49 | NORM_TYPE: frozen_batchnorm 50 | 51 | SOLVER: 52 | BASE_LR: 0.0375 53 | LR_POLICY: steps_with_relative_lrs 54 | LRS: [1, 0.1, 0.01, 0.001, 0.0001, 0.00001] 55 | STEPS: [0, 23, 39] 56 | MAX_EPOCH: 40 57 | MOMENTUM: 0.9 58 | WEIGHT_DECAY: 1e-4 59 | WARMUP_EPOCHS: 4.0 60 | WARMUP_START_LR: 0.0001 61 | OPTIMIZING_METHOD: sgd 62 | 63 | MODEL: 64 | NUM_CLASSES: 157 65 | ARCH: slowfast 66 | LOSS_FUNC: bce_logit 67 | HEAD_ACT: sigmoid 68 | DROPOUT_RATE: 0.5 69 | FINAL_POOL: ["avg", "max"] 70 | 71 | TEST: 72 | ENABLE: True 73 | DATASET: charades 74 | BATCH_SIZE: 16 75 | NUM_ENSEMBLE_VIEWS: 10 76 | NUM_SPATIAL_CROPS: 3 77 | 78 | DATA_LOADER: 79 | NUM_WORKERS: 8 80 | PIN_MEMORY: True 81 | 82 | LOGS: 83 | DIR: logs/Charades/SLOWFAST_16x8_R50_K400/ 84 | 85 | NUM_GPUS: 8 86 | NUM_SHARDS: 1 87 | RNG_SEED: 0 88 | -------------------------------------------------------------------------------- /slowfast/models/detection_helper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.autograd import Function 4 | from torch.autograd.function import once_differentiable 5 | from torch.nn.modules.utils import _pair 6 | 7 | from slowfast import _C 8 | 9 | 10 | class _ROIAlign(Function): 11 | @staticmethod 12 | def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): 13 | ctx.save_for_backward(roi) 14 | ctx.output_size = _pair(output_size) 15 | ctx.spatial_scale = spatial_scale 16 | ctx.sampling_ratio = sampling_ratio 17 | ctx.input_shape = input.size() 18 | output = _C.roi_align_forward( 19 | input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio 20 | ) 21 | return output 22 | 23 | 
@staticmethod 24 | @once_differentiable 25 | def backward(ctx, grad_output): 26 | rois, = ctx.saved_tensors 27 | output_size = ctx.output_size 28 | spatial_scale = ctx.spatial_scale 29 | sampling_ratio = ctx.sampling_ratio 30 | bs, ch, h, w = ctx.input_shape 31 | grad_input = _C.roi_align_backward( 32 | grad_output, 33 | rois, 34 | spatial_scale, 35 | output_size[0], 36 | output_size[1], 37 | bs, 38 | ch, 39 | h, 40 | w, 41 | sampling_ratio, 42 | ) 43 | return grad_input, None, None, None, None 44 | 45 | 46 | roi_align = _ROIAlign.apply 47 | 48 | 49 | class ROIAlign(nn.Module): 50 | def __init__(self, output_size, spatial_scale, sampling_ratio): 51 | super(ROIAlign, self).__init__() 52 | self.output_size = output_size 53 | self.spatial_scale = spatial_scale 54 | self.sampling_ratio = sampling_ratio 55 | 56 | def forward(self, input, rois): 57 | return roi_align( 58 | input, rois, self.output_size, self.spatial_scale, self.sampling_ratio 59 | ) 60 | 61 | def __repr__(self): 62 | tmpstr = self.__class__.__name__ + "(" 63 | tmpstr += "output_size=" + str(self.output_size) 64 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 65 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 66 | tmpstr += ")" 67 | return tmpstr 68 | -------------------------------------------------------------------------------- /configs/Charades/SLOWFAST_16x8_R101_K400.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: charades 4 | BATCH_SIZE: 16 5 | EVAL_PERIOD: 1 6 | FULL_TIME_EVAL: True 7 | CHECKPOINT_PERIOD: 1 8 | AUTO_RESUME: True 9 | CHECKPOINT_TYPE: pytorch 10 | CHECKPOINT_FILE_PATH: /home/pg/projects/action/progress-action/logs/Kinetics/MOX2_SLOWFAST_8x8_R101.77.22/checkpoints/checkpoint_epoch_00196.pyth 11 | 12 | DATA: 13 | NUM_FRAMES: 64 14 | SAMPLING_RATE: 2 15 | TRAIN_JITTER_SCALES: [256, 320] 16 | TRAIN_CROP_SIZE: 224 17 | TEST_CROP_SIZE: 256 18 | INPUT_CHANNEL_NUM: [3, 3] 19 | MULTI_LABEL: True 20 | INV_UNIFORM_SAMPLE: True 21 | ENSEMBLE_METHOD: max 22 | REVERSE_INPUT_CHANNEL: True 23 | PATH_TO_DATA_DIR: /home/pg/data/Charades/ 24 | PATH_PREFIX: /home/pg/data/Charades/Charades_v1_rgb 25 | 26 | SLOWFAST: 27 | ALPHA: 4 28 | BETA_INV: 8 29 | FUSION_CONV_CHANNEL_RATIO: 2 30 | FUSION_KERNEL_SZ: 5 31 | 32 | RESNET: 33 | ZERO_INIT_FINAL_BN: True 34 | WIDTH_PER_GROUP: 64 35 | NUM_GROUPS: 1 36 | DEPTH: 101 37 | TRANS_FUNC: bottleneck_transform 38 | STRIDE_1X1: False 39 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [23, 23], [3, 3]] 40 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 41 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 42 | 43 | NONLOCAL: 44 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 45 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 46 | INSTANTIATION: dot_product 47 | 48 | BN: 49 | USE_PRECISE_STATS: False 50 | WEIGHT_DECAY: 0.0 51 | NORM_TYPE: frozen_batchnorm 52 | 53 | SOLVER: 54 | BASE_LR: 0.0375 55 | LR_POLICY: steps_with_relative_lrs 56 | LRS: [1, 0.1, 0.01, 0.001] 57 | STEPS: [0, 23, 39] 58 | MAX_EPOCH: 40 59 | MOMENTUM: 0.9 60 | WEIGHT_DECAY: 1e-4 61 | WARMUP_EPOCHS: 4.0 62 | WARMUP_START_LR: 0.0001 63 | OPTIMIZING_METHOD: sgd 64 | 65 | MODEL: 66 | NUM_CLASSES: 157 67 | ARCH: slowfast 68 | MODEL_NAME: SlowFast 69 | LOSS_FUNC: bce_logit 70 | HEAD_ACT: sigmoid 71 | DROPOUT_RATE: 0.5 72 | FINAL_POOL: ["avg", "max"] 73 | 74 | TEST: 75 | ENABLE: True 76 | DATASET: charades 77 | BATCH_SIZE: 16 78 | NUM_ENSEMBLE_VIEWS: 10 79 | NUM_SPATIAL_CROPS: 3 80 | 81 | DATA_LOADER: 82 | NUM_WORKERS: 8 83 | 
PIN_MEMORY: True 84 | 85 | LOGS: 86 | DIR: logs/Charades/SLOWFAST_16x8_R101_K400/ 87 | 88 | NUM_GPUS: 8 89 | NUM_SHARDS: 1 90 | RNG_SEED: 0 91 | -------------------------------------------------------------------------------- /configs/Charades/SLOWFAST_16x8_R101_K600.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: charades 4 | BATCH_SIZE: 16 5 | EVAL_PERIOD: 1 6 | FULL_TIME_EVAL: True 7 | CHECKPOINT_PERIOD: 1 8 | AUTO_RESUME: True 9 | CHECKPOINT_TYPE: aia 10 | CHECKPOINT_FILE_PATH: /home/pg/data/models/k600/SLOWFAST_8x8_R101.pth 11 | 12 | DATA: 13 | NUM_FRAMES: 64 14 | SAMPLING_RATE: 2 15 | TRAIN_JITTER_SCALES: [256, 320] 16 | TRAIN_CROP_SIZE: 224 17 | TEST_CROP_SIZE: 256 18 | INPUT_CHANNEL_NUM: [3, 3] 19 | MULTI_LABEL: True 20 | INV_UNIFORM_SAMPLE: True 21 | ENSEMBLE_METHOD: max 22 | REVERSE_INPUT_CHANNEL: True 23 | PATH_TO_DATA_DIR: /home/pg/data/Charades/ 24 | PATH_PREFIX: /home/pg/data/Charades/Charades_v1_rgb 25 | 26 | SLOWFAST: 27 | ALPHA: 4 28 | BETA_INV: 8 29 | FUSION_CONV_CHANNEL_RATIO: 2 30 | FUSION_KERNEL_SZ: 5 31 | FUSION_BN: False 32 | FUSION_RELU: False 33 | 34 | RESNET: 35 | ZERO_INIT_FINAL_BN: True 36 | WIDTH_PER_GROUP: 64 37 | NUM_GROUPS: 1 38 | DEPTH: 101 39 | TRANS_FUNC: bottleneck_transform 40 | STRIDE_1X1: False 41 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [23, 23], [3, 3]] 42 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 43 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 44 | STEM_POOL_PAD: False 45 | 46 | NONLOCAL: 47 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 48 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 49 | INSTANTIATION: dot_product 50 | 51 | BN: 52 | USE_PRECISE_STATS: False 53 | WEIGHT_DECAY: 0.0 54 | NORM_TYPE: frozen_batchnorm 55 | 56 | SOLVER: 57 | BASE_LR: 0.0375 58 | LR_POLICY: steps_with_relative_lrs 59 | LRS: [1, 0.1, 0.01, 0.001] 60 | STEPS: [0, 23, 39] 61 | MAX_EPOCH: 40 62 | MOMENTUM: 0.9 63 | WEIGHT_DECAY: 1e-4 64 | WARMUP_EPOCHS: 4.0 65 | WARMUP_START_LR: 0.0001 66 | OPTIMIZING_METHOD: sgd 67 | 68 | MODEL: 69 | NUM_CLASSES: 157 70 | ARCH: slowfast 71 | MODEL_NAME: SlowFast 72 | LOSS_FUNC: bce_logit 73 | HEAD_ACT: sigmoid 74 | DROPOUT_RATE: 0.5 75 | FINAL_POOL: ["avg", "max"] 76 | 77 | TEST: 78 | ENABLE: True 79 | DATASET: charades 80 | BATCH_SIZE: 16 81 | NUM_ENSEMBLE_VIEWS: 10 82 | NUM_SPATIAL_CROPS: 3 83 | 84 | DATA_LOADER: 85 | NUM_WORKERS: 8 86 | PIN_MEMORY: True 87 | 88 | LOGS: 89 | DIR: logs/Charades/SLOWFAST_16x8_R101_K600/ 90 | 91 | NUM_GPUS: 8 92 | NUM_SHARDS: 1 93 | RNG_SEED: 0 94 | -------------------------------------------------------------------------------- /slowfast/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | """Video models.""" 2 | 3 | # Number of blocks for different stages given the model depth. 4 | _MODEL_STAGE_DEPTH = { 5 | # ResNet 6 | 50: (3, 4, 6, 3), 7 | 101: (3, 4, 23, 3), 8 | } 9 | 10 | 11 | # Basis of temporal kernel sizes for each of the stage. 12 | _TEMPORAL_KERNEL_BASIS = { 13 | "c2d": [ 14 | [[1]], # conv1 temporal kernel. 15 | [[1]], # res2 temporal kernel. 16 | [[1]], # res3 temporal kernel. 17 | [[1]], # res4 temporal kernel. 18 | [[1]], # res5 temporal kernel. 19 | ], 20 | "c2d_nopool": [ 21 | [[1]], # conv1 temporal kernel. 22 | [[1]], # res2 temporal kernel. 23 | [[1]], # res3 temporal kernel. 24 | [[1]], # res4 temporal kernel. 25 | [[1]], # res5 temporal kernel. 26 | ], 27 | "i3d": [ 28 | [[5]], # conv1 temporal kernel. 
29 | [[3]], # res2 temporal kernel. 30 | [[3, 1]], # res3 temporal kernel. 31 | [[3, 1]], # res4 temporal kernel. 32 | [[1, 3]], # res5 temporal kernel. 33 | ], 34 | "i3d_nopool": [ 35 | [[5]], # conv1 temporal kernel. 36 | [[3]], # res2 temporal kernel. 37 | [[3, 1]], # res3 temporal kernel. 38 | [[3, 1]], # res4 temporal kernel. 39 | [[1, 3]], # res5 temporal kernel. 40 | ], 41 | "slow": [ 42 | [[1]], # conv1 temporal kernel. 43 | [[1]], # res2 temporal kernel. 44 | [[1]], # res3 temporal kernel. 45 | [[3]], # res4 temporal kernel. 46 | [[3]], # res5 temporal kernel. 47 | ], 48 | "slowfast": [ 49 | [[1], [5]], # conv1 temporal kernel for slow and fast pathway. 50 | [[1], [3]], # res2 temporal kernel for slow and fast pathway. 51 | [[1], [3]], # res3 temporal kernel for slow and fast pathway. 52 | [[3], [3]], # res4 temporal kernel for slow and fast pathway. 53 | [[3], [3]], # res5 temporal kernel for slow and fast pathway. 54 | ], 55 | "x3d": [ 56 | [[5]], # conv1 temporal kernels. 57 | [[3]], # res2 temporal kernels. 58 | [[3]], # res3 temporal kernels. 59 | [[3]], # res4 temporal kernels. 60 | [[3]], # res5 temporal kernels. 61 | ], 62 | } 63 | 64 | _POOL1 = { 65 | "c2d": [[2, 1, 1]], 66 | "c2d_nopool": [[1, 1, 1]], 67 | "i3d": [[2, 1, 1]], 68 | "i3d_nopool": [[1, 1, 1]], 69 | "slow": [[1, 1, 1]], 70 | "slowfast": [[1, 1, 1], [1, 1, 1]], 71 | "x3d": [[1, 1, 1]], 72 | } -------------------------------------------------------------------------------- /slowfast/utils/metrics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Functions for computing metrics.""" 5 | 6 | import torch 7 | 8 | 9 | def topks_correct(preds, labels, ks): 10 | """ 11 | Given the predictions, labels, and a list of top-k values, compute the 12 | number of correct predictions for each top-k value. 13 | 14 | Args: 15 | preds (array): array of predictions. Dimension is batch size 16 | N x ClassNum. 17 | labels (array): array of labels. Dimension is batch size N. 18 | ks (list): list of top-k values. For example, ks = [1, 5] corresponds 19 | to top-1 and top-5. 20 | 21 | Returns: 22 | topks_correct (list): list of numbers, where the `i`-th entry 23 | corresponds to the number of top-`ks[i]` correct predictions. 24 | """ 25 | assert preds.size(0) == labels.size( 26 | 0 27 | ), "Batch dim of predictions and labels must match" 28 | # Find the top max_k predictions for each sample 29 | _top_max_k_vals, top_max_k_inds = torch.topk( 30 | preds, max(ks), dim=1, largest=True, sorted=True 31 | ) 32 | # (batch_size, max_k) -> (max_k, batch_size). 33 | top_max_k_inds = top_max_k_inds.t() 34 | # (batch_size, ) -> (max_k, batch_size). 35 | rep_max_k_labels = labels.view(1, -1).expand_as(top_max_k_inds) 36 | # (i, j) = 1 if top i-th prediction for the j-th sample is correct. 37 | top_max_k_correct = top_max_k_inds.eq(rep_max_k_labels) 38 | # Compute the number of topk correct predictions for each k. 39 | topks_correct = [ 40 | top_max_k_correct[:k, :].view(-1).float().sum() for k in ks 41 | ] 42 | return topks_correct 43 | 44 | 45 | def topk_errors(preds, labels, ks): 46 | """ 47 | Computes the top-k error for each k. 48 | Args: 49 | preds (array): array of predictions. Dimension is N x ClassNum. 50 | labels (array): array of labels. Dimension is N. 51 | ks (list): list of ks to calculate the top-k errors.
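        Example (illustrative):
            top1_err, top5_err = topk_errors(preds, labels, [1, 5])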
52 | """ 53 | num_topks_correct = topks_correct(preds, labels, ks) 54 | return [(1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct] 55 | 56 | 57 | def topk_accuracies(preds, labels, ks): 58 | """ 59 | Computes the top-k accuracy for each k. 60 | Args: 61 | preds (array): array of predictions. Dimension is N. 62 | labels (array): array of labels. Dimension is N. 63 | ks (list): list of ks to calculate the top accuracies. 64 | """ 65 | num_topks_correct = topks_correct(preds, labels, ks) 66 | return [(x / preds.size(0)) * 100.0 for x in num_topks_correct] 67 | -------------------------------------------------------------------------------- /slowfast/models/operators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Custom operators.""" 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class Swish(nn.Module): 11 | """Swish activation function: x * sigmoid(x).""" 12 | 13 | def __init__(self): 14 | super(Swish, self).__init__() 15 | 16 | def forward(self, x): 17 | return SwishEfficient.apply(x) 18 | 19 | 20 | class SwishEfficient(torch.autograd.Function): 21 | """Swish activation function: x * sigmoid(x).""" 22 | 23 | @staticmethod 24 | def forward(ctx, x): 25 | result = x * torch.sigmoid(x) 26 | ctx.save_for_backward(x) 27 | return result 28 | 29 | @staticmethod 30 | def backward(ctx, grad_output): 31 | x = ctx.saved_variables[0] 32 | sigmoid_x = torch.sigmoid(x) 33 | return grad_output * (sigmoid_x * (1 + x * (1 - sigmoid_x))) 34 | 35 | 36 | class SE(nn.Module): 37 | """Squeeze-and-Excitation (SE) block w/ Swish: AvgPool, FC, Swish, FC, Sigmoid.""" 38 | 39 | def _round_width(self, width, multiplier, min_width=8, divisor=8): 40 | """ 41 | Round width of filters based on width multiplier 42 | Args: 43 | width (int): the channel dimensions of the input. 44 | multiplier (float): the multiplication factor. 45 | min_width (int): the minimum width after multiplication. 46 | divisor (int): the new width should be dividable by divisor. 47 | """ 48 | if not multiplier: 49 | return width 50 | 51 | width *= multiplier 52 | min_width = min_width or divisor 53 | width_out = max( 54 | min_width, int(width + divisor / 2) // divisor * divisor 55 | ) 56 | if width_out < 0.9 * width: 57 | width_out += divisor 58 | return int(width_out) 59 | 60 | def __init__(self, dim_in, ratio, relu_act=True): 61 | """ 62 | Args: 63 | dim_in (int): the channel dimensions of the input. 64 | ratio (float): the channel reduction ratio for squeeze. 65 | relu_act (bool): whether to use ReLU activation instead 66 | of Swish (default). 67 | divisor (int): the new width should be dividable by divisor. 68 | """ 69 | super(SE, self).__init__() 70 | self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) 71 | dim_fc = self._round_width(dim_in, ratio) 72 | self.fc1 = nn.Conv3d(dim_in, dim_fc, 1, bias=True) 73 | self.fc1_act = nn.ReLU() if relu_act else Swish() 74 | self.fc2 = nn.Conv3d(dim_fc, dim_in, 1, bias=True) 75 | 76 | self.fc2_sig = nn.Sigmoid() 77 | 78 | def forward(self, x): 79 | x_in = x 80 | for module in self.children(): 81 | x = module(x) 82 | return x_in * x 83 | -------------------------------------------------------------------------------- /slowfast/datasets/multigrid_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
3 | 4 | """Helper functions for multigrid training.""" 5 | 6 | import numpy as np 7 | from torch._six import int_classes as _int_classes 8 | from torch.utils.data.sampler import Sampler 9 | 10 | 11 | class ShortCycleBatchSampler(Sampler): 12 | """ 13 | Extend Sampler to support "short cycle" sampling. 14 | See paper "A Multigrid Method for Efficiently Training Video Models", 15 | Wu et al., 2019 (https://arxiv.org/abs/1912.00998) for details. 16 | """ 17 | 18 | def __init__(self, sampler, batch_size, drop_last, cfg): 19 | if not isinstance(sampler, Sampler): 20 | raise ValueError( 21 | "sampler should be an instance of " 22 | "torch.utils.data.Sampler, but got sampler={}".format(sampler) 23 | ) 24 | if ( 25 | not isinstance(batch_size, _int_classes) 26 | or isinstance(batch_size, bool) 27 | or batch_size <= 0 28 | ): 29 | raise ValueError( 30 | "batch_size should be a positive integer value, " 31 | "but got batch_size={}".format(batch_size) 32 | ) 33 | if not isinstance(drop_last, bool): 34 | raise ValueError( 35 | "drop_last should be a boolean value, but got " 36 | "drop_last={}".format(drop_last) 37 | ) 38 | self.sampler = sampler 39 | self.drop_last = drop_last 40 | 41 | bs_factor = [ 42 | int( 43 | round( 44 | ( 45 | float(cfg.DATA.TRAIN_CROP_SIZE) 46 | / (s * cfg.MULTIGRID.DEFAULT_S) 47 | ) 48 | ** 2 49 | ) 50 | ) 51 | for s in cfg.MULTIGRID.SHORT_CYCLE_FACTORS 52 | ] 53 | 54 | self.batch_sizes = [ 55 | batch_size * bs_factor[0], 56 | batch_size * bs_factor[1], 57 | batch_size, 58 | ] 59 | 60 | def __iter__(self): 61 | counter = 0 62 | batch_size = self.batch_sizes[0] 63 | batch = [] 64 | for idx in self.sampler: 65 | batch.append((idx, counter % 3)) 66 | if len(batch) == batch_size: 67 | yield batch 68 | counter += 1 69 | batch_size = self.batch_sizes[counter % 3] 70 | batch = [] 71 | if len(batch) > 0 and not self.drop_last: 72 | yield batch 73 | 74 | def __len__(self): 75 | avg_batch_size = sum(self.batch_sizes) / 3.0 76 | if self.drop_last: 77 | return int(np.floor(len(self.sampler) / avg_batch_size)) 78 | else: 79 | return int(np.ceil(len(self.sampler) / avg_batch_size)) 80 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/np_box_mask_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Numpy BoxMaskList classes and functions.""" 17 | 18 | from __future__ import ( 19 | absolute_import, 20 | division, 21 | print_function, 22 | unicode_literals, 23 | ) 24 | import numpy as np 25 | 26 | from . import np_box_list 27 | 28 | 29 | class BoxMaskList(np_box_list.BoxList): 30 | """Convenience wrapper for BoxList with masks. 31 | 32 | BoxMaskList extends the np_box_list.BoxList to contain masks as well. 
33 | In particular, its constructor receives both boxes and masks. Note that the 34 | masks correspond to the full image. 35 | """ 36 | 37 | def __init__(self, box_data, mask_data): 38 | """Constructs box collection. 39 | 40 | Args: 41 | box_data: a numpy array of shape [N, 4] representing box coordinates 42 | mask_data: a numpy array of shape [N, height, width] representing masks 43 | with values are in {0,1}. The masks correspond to the full 44 | image. The height and the width will be equal to image height and width. 45 | 46 | Raises: 47 | ValueError: if bbox data is not a numpy array 48 | ValueError: if invalid dimensions for bbox data 49 | ValueError: if mask data is not a numpy array 50 | ValueError: if invalid dimension for mask data 51 | """ 52 | super(BoxMaskList, self).__init__(box_data) 53 | if not isinstance(mask_data, np.ndarray): 54 | raise ValueError("Mask data must be a numpy array.") 55 | if len(mask_data.shape) != 3: 56 | raise ValueError("Invalid dimensions for mask data.") 57 | if mask_data.dtype != np.uint8: 58 | raise ValueError( 59 | "Invalid data type for mask data: uint8 is required." 60 | ) 61 | if mask_data.shape[0] != box_data.shape[0]: 62 | raise ValueError( 63 | "There should be the same number of boxes and masks." 64 | ) 65 | self.data["masks"] = mask_data 66 | 67 | def get_masks(self): 68 | """Convenience function for accessing masks. 69 | 70 | Returns: 71 | a numpy array of shape [N, height, width] representing masks 72 | """ 73 | return self.get_field("masks") 74 | -------------------------------------------------------------------------------- /slowfast/utils/logging.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Logging.""" 5 | 6 | import os 7 | import builtins 8 | import decimal 9 | import functools 10 | import logging 11 | import os 12 | import sys 13 | import simplejson 14 | import torch.distributed as dist 15 | from datetime import datetime 16 | from fvcore.common.file_io import PathManager 17 | 18 | import slowfast.utils.distributed as du 19 | 20 | 21 | def _suppress_print(): 22 | """ 23 | Suppresses printing from the current process. 
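Stepping back to the `BoxMaskList` class above, a minimal usage sketch with toy data (masks must be uint8, full-image sized, and match the number of boxes):

```python
import numpy as np

# One box in [y_min, x_min, y_max, x_max] order, plus a matching binary mask.
boxes = np.array([[0.1, 0.1, 0.5, 0.5]], dtype=np.float32)
masks = np.zeros((1, 32, 32), dtype=np.uint8)
masks[0, 3:16, 3:16] = 1

box_mask_list = BoxMaskList(box_data=boxes, mask_data=masks)
print(box_mask_list.num_boxes())        # 1
print(box_mask_list.get_masks().shape)  # (1, 32, 32)
```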
24 | """ 25 | 26 | def print_pass(*objects, sep=" ", end="\n", file=sys.stdout, flush=False): 27 | pass 28 | 29 | builtins.print = print_pass 30 | 31 | 32 | def setup_logger(cfg, name=None): 33 | logger = logging.getLogger('progress-action') 34 | logger.setLevel(logging.DEBUG) 35 | logger.propogate = False 36 | # don't log results for the non-master process 37 | if not du.is_master_proc(): 38 | _suppress_print() 39 | return logger 40 | formatter = logging.Formatter( 41 | "%(asctime)s [%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s") 42 | 43 | ch = logging.StreamHandler(stream=sys.stdout) 44 | ch.setLevel(logging.INFO) 45 | ch.setFormatter(formatter) 46 | logger.addHandler(ch) 47 | 48 | # multi-machine 49 | if cfg.NUM_GPUS != du.get_world_size(): 50 | assert du.is_master_proc() 51 | num_gpus_per_machine = cfg.NUM_GPUS 52 | worker = du.get_rank() // cfg.NUM_GPUS 53 | filename = os.path.join(cfg.LOGS.DIR, f"{name}-worker-{worker}.log") 54 | else: 55 | filename = os.path.join(cfg.LOGS.DIR, f"{name}.log") 56 | if name is None or os.path.exists(filename): 57 | filename = os.path.join( 58 | cfg.LOGS.DIR, '{} {}.log'.format(name, datetime.now())) 59 | fh = logging.FileHandler(filename) 60 | fh.setLevel(logging.DEBUG) 61 | fh.setFormatter(formatter) 62 | logger.addHandler(fh) 63 | 64 | return logger 65 | 66 | 67 | def get_logger(name): 68 | """ 69 | Retrieve the logger with the specified name or, if name is None, return a 70 | logger which is the root logger of the hierarchy. 71 | Args: 72 | name (string): name of the logger. 73 | """ 74 | return logging.getLogger('progress-action.' + name) 75 | 76 | 77 | def log_json_stats(stats): 78 | """ 79 | Logs json stats. 80 | Args: 81 | stats (dict): a dictionary of statistical information to log. 82 | """ 83 | stats = { 84 | k: decimal.Decimal("{:.6f}".format(v)) if isinstance(v, float) else v 85 | for k, v in stats.items() 86 | } 87 | # json_stats = simplejson.dumps(stats, sort_keys=True, use_decimal=True) 88 | logstr = "; ".join(["{}: {}".format(k, v) for k, v in stats.items()]) 89 | if du.is_master_proc(): 90 | logger = get_logger(__name__) 91 | logger.info(logstr) 92 | # logger.info("json_stats: {:s}".format(json_stats)) 93 | -------------------------------------------------------------------------------- /slowfast/utils/bn_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """bn helper.""" 5 | 6 | import itertools 7 | import torch 8 | 9 | 10 | @torch.no_grad() 11 | def compute_and_update_bn_stats(model, data_loader, num_batches=200): 12 | """ 13 | Compute and update the batch norm stats to make it more precise. During 14 | training both bn stats and the weight are changing after every iteration, 15 | so the bn can not precisely reflect the latest stats of the current model. 16 | Here the bn stats is recomputed without change of weights, to make the 17 | running mean and running var more precise. 18 | Args: 19 | model (model): the model using to compute and update the bn stats. 20 | data_loader (dataloader): dataloader using to provide inputs. 21 | num_batches (int): running iterations using to compute the stats. 22 | """ 23 | 24 | # Prepares all the bn layers. 
25 | bn_layers = [ 26 | m 27 | for m in model.modules() 28 | if any( 29 | ( 30 | isinstance(m, bn_type) 31 | for bn_type in ( 32 | torch.nn.BatchNorm1d, 33 | torch.nn.BatchNorm2d, 34 | torch.nn.BatchNorm3d, 35 | ) 36 | ) 37 | ) 38 | ] 39 | 40 | # In order to make the running stats only reflect the current batch, the 41 | # momentum is disabled. 42 | # bn.running_mean = (1 - momentum) * bn.running_mean + momentum * batch_mean 43 | # Setting the momentum to 1.0 to compute the stats without momentum. 44 | momentum_actual = [bn.momentum for bn in bn_layers] 45 | for bn in bn_layers: 46 | bn.momentum = 1.0 47 | 48 | # Calculates the running iterations for precise stats computation. 49 | running_mean = [torch.zeros_like(bn.running_mean) for bn in bn_layers] 50 | running_square_mean = [torch.zeros_like(bn.running_var) for bn in bn_layers] 51 | 52 | for ind, (inputs, _, _) in enumerate( 53 | itertools.islice(data_loader, num_batches) 54 | ): 55 | # Forwards the model to update the bn stats. 56 | if isinstance(inputs, (list,)): 57 | for i in range(len(inputs)): 58 | inputs[i] = inputs[i].float().cuda(non_blocking=True) 59 | else: 60 | inputs = inputs.cuda(non_blocking=True) 61 | model(inputs) 62 | 63 | for i, bn in enumerate(bn_layers): 64 | # Accumulates the bn stats. 65 | running_mean[i] += (bn.running_mean - running_mean[i]) / (ind + 1) 66 | # $E(x^2) = Var(x) + E(x)^2$. 67 | cur_square_mean = bn.running_var + bn.running_mean ** 2 68 | running_square_mean[i] += ( 69 | cur_square_mean - running_square_mean[i] 70 | ) / (ind + 1) 71 | 72 | for i, bn in enumerate(bn_layers): 73 | bn.running_mean = running_mean[i] 74 | # Var(x) = $E(x^2) - E(x)^2$. 75 | bn.running_var = running_square_mean[i] - bn.running_mean ** 2 76 | # Sets the precise bn stats. 77 | bn.momentum = momentum_actual[i] 78 | -------------------------------------------------------------------------------- /slowfast/utils/lr_policy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Learning rate policy.""" 5 | 6 | import math 7 | 8 | 9 | def get_lr_at_epoch(cfg, cur_epoch): 10 | """ 11 | Retrieve the learning rate of the current epoch with the option to perform 12 | warm up in the beginning of the training stage. 13 | Args: 14 | cfg (CfgNode): configs. Details can be found in 15 | slowfast/config/defaults.py 16 | cur_epoch (float): the number of epoch of the current training stage. 17 | """ 18 | lr = get_lr_func(cfg.SOLVER.LR_POLICY)(cfg, cur_epoch) 19 | # Perform warm up. 20 | if cur_epoch < cfg.SOLVER.WARMUP_EPOCHS: 21 | lr_start = cfg.SOLVER.WARMUP_START_LR 22 | lr_end = get_lr_func(cfg.SOLVER.LR_POLICY)( 23 | cfg, cfg.SOLVER.WARMUP_EPOCHS 24 | ) 25 | alpha = (lr_end - lr_start) / cfg.SOLVER.WARMUP_EPOCHS 26 | lr = cur_epoch * alpha + lr_start 27 | return lr 28 | 29 | 30 | def lr_func_cosine(cfg, cur_epoch): 31 | """ 32 | Retrieve the learning rate to specified values at specified epoch with the 33 | cosine learning rate schedule. Details can be found in: 34 | Ilya Loshchilov, and Frank Hutter 35 | SGDR: Stochastic Gradient Descent With Warm Restarts. 36 | Args: 37 | cfg (CfgNode): configs. Details can be found in 38 | slowfast/config/defaults.py 39 | cur_epoch (float): the number of epoch of the current training stage. 
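Looking back at `compute_and_update_bn_stats` above: it keeps streaming averages of E[x] and E[x^2] over batches and recovers the variance via Var(x) = E[x^2] - E[x]^2. A small standalone sketch of that identity:

```python
import torch

# Streaming averages over batches, mirroring the update rule above.
batches = [torch.randn(1024) * 2.0 + 3.0 for _ in range(20)]
mean = torch.zeros(())
sq_mean = torch.zeros(())
for ind, x in enumerate(batches):
    mean += (x.mean() - mean) / (ind + 1)
    sq_mean += ((x ** 2).mean() - sq_mean) / (ind + 1)

var = sq_mean - mean ** 2
print(round(mean.item(), 2), round(var.item(), 2))  # approximately 3.0 and 4.0
```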
40 | """ 41 | return ( 42 | cfg.SOLVER.BASE_LR 43 | * (math.cos(math.pi * cur_epoch / cfg.SOLVER.MAX_EPOCH) + 1.0) 44 | * 0.5 45 | ) 46 | 47 | 48 | def lr_func_steps_with_relative_lrs(cfg, cur_epoch): 49 | """ 50 | Retrieve the learning rate to specified values at specified epoch with the 51 | steps with relative learning rate schedule. 52 | Args: 53 | cfg (CfgNode): configs. Details can be found in 54 | slowfast/config/defaults.py 55 | cur_epoch (float): the number of epoch of the current training stage. 56 | """ 57 | ind = get_step_index(cfg, cur_epoch) 58 | return cfg.SOLVER.LRS[ind] * cfg.SOLVER.BASE_LR 59 | 60 | 61 | def get_step_index(cfg, cur_epoch): 62 | """ 63 | Retrieves the lr step index for the given epoch. 64 | Args: 65 | cfg (CfgNode): configs. Details can be found in 66 | slowfast/config/defaults.py 67 | cur_epoch (float): the number of epoch of the current training stage. 68 | """ 69 | steps = cfg.SOLVER.STEPS + [cfg.SOLVER.MAX_EPOCH] 70 | for ind, step in enumerate(steps): # NoQA 71 | if cur_epoch < step: 72 | break 73 | return ind - 1 74 | 75 | 76 | def get_lr_func(lr_policy): 77 | """ 78 | Given the configs, retrieve the specified lr policy function. 79 | Args: 80 | lr_policy (string): the learning rate policy to use for the job. 81 | """ 82 | policy = "lr_func_" + lr_policy 83 | if policy not in globals(): 84 | raise NotImplementedError("Unknown LR policy: {}".format(lr_policy)) 85 | else: 86 | return globals()[policy] 87 | -------------------------------------------------------------------------------- /slowfast/utils/aia_model_loading.py: -------------------------------------------------------------------------------- 1 | """AIA to PyTorch checkpoint name converting utility.""" 2 | 3 | import re 4 | 5 | 6 | def get_name_convert_func(): 7 | """ 8 | Get the function to convert AIA layer names to SlowFast layer names. 9 | Returns: 10 | (func): function to convert parameter name from AIA format to PyTorch 11 | format. 12 | """ 13 | 14 | pairs = [ 15 | # fuse fast to slow 16 | # ----------------------------------------------------- 17 | # fast.Tconv1.conv.weight -> s1_fuse.conv_f2s.weight 18 | [r"^fast.Tconv([1-4]).conv.(.*)", r"s\1_fuse.conv_f2s.\2"], 19 | 20 | # pathway 21 | # ----------------------------------------------------- 22 | # slow -> pathway0, fast -> pathway1 23 | [r"^slow(.*)", r"pathway0_\1"], 24 | [r"^fast(.*)", r"pathway1_\1"], 25 | 26 | # stem 27 | # ---------------------------------------------------- 28 | # slow.conv1.weight -> s1.pathway0_stem.conv.weight 29 | [r"(.*).conv1.weight", r"s0.\1stem.conv.weight"], 30 | # slow.bn1.weight -> s1.pathway0_stem.bn.weight 31 | [r"(.*).bn1(.*)", r"s0.\1stem.bn\2"], 32 | 33 | # res stage 34 | # ----------------------------------------------------- 35 | # conv1 -> a 36 | [r"(.*).conv1.(.*)", r"\1.a.\2",], 37 | # conv2 -> b 38 | [r"(.*).conv2.(.*)", r"\1.b.\2",], 39 | # conv3 -> c 40 | [r"(.*).conv3.(.*)", r"\1.c.\2",], 41 | # btnk -> branch2 42 | [r"(.*).btnk.(.*)", r"\1.branch2.\2",], 43 | # shortcut -> branch1 44 | [r"(.*).shortcut.(.*)", r"\1.branch1.\2",], 45 | # conv.weight -> weight 46 | [r"(.*)([abc123]).conv.weight\Z", r"\1\2.weight"], 47 | # .bn. -> _bn. 
48 | [r"(.*)([abc123]).bn\.(.*)", r"\1\2_bn.\3"], 49 | 50 | # res_nl1 -> s1 51 | [r"(.*).res_nl([1-4])(.*)", r"s\2.\1\3"], 52 | # .res_0 -> _res0 53 | [r"(.*).res_([0-9]+)(.*)", r"\1res\2\3"], 54 | 55 | # stage number 56 | [r"^s4\.(.*)", r"s5.\1"], 57 | [r"^s3\.(.*)", r"s4.\1"], 58 | [r"^s2\.(.*)", r"s3.\1"], 59 | [r"^s1\.(.*)", r"s2.\1"], 60 | [r"^s0\.(.*)", r"s1.\1"], 61 | 62 | # head 63 | # ----------------------------------------------------- 64 | # cls_head.pred.weight -> head.projection.weight 65 | [r"cls_head.pred", r"head.projection"], 66 | ] 67 | 68 | def convert_aia_name_to_pytorch(aia_layer_name): 69 | """ 70 | Convert the aia_layer_name to slowfast format by apply the list of 71 | regular expressions. 72 | Args: 73 | aia_layer_name (str): aia layer name. 74 | Returns: 75 | (str): pytorch layer name. 76 | """ 77 | if aia_layer_name.startswith("module"): 78 | aia_layer_name = aia_layer_name.split("module.")[1] 79 | if aia_layer_name.startswith("backbone"): 80 | aia_layer_name = aia_layer_name.split("backbone.")[1] 81 | for source, dest in pairs: 82 | aia_layer_name = re.sub(source, dest, aia_layer_name) 83 | return aia_layer_name 84 | 85 | return convert_aia_name_to_pytorch 86 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import glob 4 | import os 5 | 6 | import numpy as np 7 | import torch 8 | from setuptools import setup, Extension, find_packages 9 | from torch.utils.cpp_extension import CUDA_HOME 10 | from torch.utils.cpp_extension import CppExtension 11 | from torch.utils.cpp_extension import CUDAExtension 12 | from Cython.Build import cythonize 13 | import platform 14 | 15 | requirements = ["torch", "torchvision"] 16 | 17 | 18 | def make_cython_ext(name, module, sources): 19 | extra_compile_args = None 20 | if platform.system() != 'Windows': 21 | extra_compile_args = { 22 | 'cxx': ['-Wno-unused-function', '-Wno-write-strings'] 23 | } 24 | 25 | extension = Extension( 26 | '{}.{}'.format(module, name), 27 | [os.path.join(*module.split('.'), p) for p in sources], 28 | include_dirs=[np.get_include()], 29 | language='c++', 30 | extra_compile_args=extra_compile_args) 31 | extension, = cythonize(extension) 32 | return extension 33 | 34 | 35 | def make_cuda_ext(name, module, sources): 36 | return CUDAExtension( 37 | name='{}.{}'.format(module, name), 38 | sources=[os.path.join(*module.split('.'), p) for p in sources], 39 | extra_compile_args={ 40 | 'cxx': [], 41 | 'nvcc': [ 42 | '-D__CUDA_NO_HALF_OPERATORS__', 43 | '-D__CUDA_NO_HALF_CONVERSIONS__', 44 | '-D__CUDA_NO_HALF2_OPERATORS__', 45 | ] 46 | }) 47 | 48 | 49 | def get_extensions(): 50 | this_dir = os.path.dirname(os.path.abspath(__file__)) 51 | extensions_dir = os.path.join(this_dir, "slowfast", "csrc") 52 | 53 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 54 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 55 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 56 | 57 | sources = main_file + source_cpu 58 | extension = CppExtension 59 | 60 | extra_compile_args = {"cxx": []} 61 | define_macros = [] 62 | 63 | if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1": 64 | extension = CUDAExtension 65 | sources += source_cuda 66 | define_macros += [("WITH_CUDA", None)] 67 | extra_compile_args["nvcc"] = [ 68 | "-DCUDA_HAS_FP16=1", 69 | "-D__CUDA_NO_HALF_OPERATORS__", 70 | 
"-D__CUDA_NO_HALF_CONVERSIONS__", 71 | "-D__CUDA_NO_HALF2_OPERATORS__", 72 | "--expt-relaxed-constexpr", 73 | ] 74 | 75 | sources = [os.path.join(extensions_dir, s) for s in sources] 76 | 77 | include_dirs = [extensions_dir] 78 | 79 | ext_modules = [ 80 | extension( 81 | "slowfast._C", 82 | sources, 83 | include_dirs=include_dirs, 84 | define_macros=define_macros, 85 | extra_compile_args=extra_compile_args, 86 | ), 87 | ] 88 | 89 | return ext_modules 90 | 91 | 92 | setup( 93 | name="slowfast", 94 | ext_modules=get_extensions(), 95 | packages=find_packages(".", exclude=[ 96 | "configs", "scripts", "logs", "tools", "data", 97 | ]), 98 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 99 | ) 100 | -------------------------------------------------------------------------------- /slowfast/models/optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Optimizer.""" 5 | 6 | import torch 7 | 8 | import slowfast.utils.lr_policy as lr_policy 9 | 10 | 11 | def construct_optimizer(model, cfg): 12 | """ 13 | Construct a stochastic gradient descent or ADAM optimizer with momentum. 14 | Details can be found in: 15 | Herbert Robbins, and Sutton Monro. "A stochastic approximation method." 16 | and 17 | Diederik P.Kingma, and Jimmy Ba. 18 | "Adam: A Method for Stochastic Optimization." 19 | 20 | Args: 21 | model (model): model to perform stochastic gradient descent 22 | optimization or ADAM optimization. 23 | cfg (config): configs of hyper-parameters of SGD or ADAM, includes base 24 | learning rate, momentum, weight_decay, dampening, and etc. 25 | """ 26 | # Batchnorm parameters. 27 | bn_params = [] 28 | # Non-batchnorm parameters. 29 | non_bn_parameters = [] 30 | for name, p in model.named_parameters(): 31 | if "bn" in name: 32 | bn_params.append(p) 33 | else: 34 | non_bn_parameters.append(p) 35 | # Apply different weight decay to Batchnorm and non-batchnorm parameters. 36 | # In Caffe2 classification codebase the weight decay for batchnorm is 0.0. 37 | # Having a different weight decay on batchnorm might cause a performance 38 | # drop. 39 | optim_params = [ 40 | {"params": bn_params, "weight_decay": cfg.BN.WEIGHT_DECAY}, 41 | {"params": non_bn_parameters, "weight_decay": cfg.SOLVER.WEIGHT_DECAY}, 42 | ] 43 | # Check all parameters will be passed into optimizer. 44 | assert len(list(model.parameters())) == len(non_bn_parameters) + len( 45 | bn_params 46 | ), "parameter size does not match: {} + {} != {}".format( 47 | len(non_bn_parameters), len(bn_params), len(list(model.parameters())) 48 | ) 49 | 50 | if cfg.SOLVER.OPTIMIZING_METHOD == "sgd": 51 | return torch.optim.SGD( 52 | optim_params, 53 | lr=cfg.SOLVER.BASE_LR, 54 | momentum=cfg.SOLVER.MOMENTUM, 55 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 56 | dampening=cfg.SOLVER.DAMPENING, 57 | nesterov=cfg.SOLVER.NESTEROV, 58 | ) 59 | elif cfg.SOLVER.OPTIMIZING_METHOD == "adam": 60 | return torch.optim.Adam( 61 | optim_params, 62 | lr=cfg.SOLVER.BASE_LR, 63 | betas=(0.9, 0.999), 64 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 65 | ) 66 | else: 67 | raise NotImplementedError( 68 | "Does not support {} optimizer".format(cfg.SOLVER.OPTIMIZING_METHOD) 69 | ) 70 | 71 | 72 | def get_epoch_lr(cur_epoch, cfg): 73 | """ 74 | Retrieves the lr for the given epoch (as specified by the lr policy). 
75 | Args: 76 | cfg (config): configs of hyper-parameters of ADAM, includes base 77 | learning rate, betas, and weight decays. 78 | cur_epoch (float): the number of epoch of the current training stage. 79 | """ 80 | return lr_policy.get_lr_at_epoch(cfg, cur_epoch) 81 | 82 | 83 | def set_lr(optimizer, new_lr): 84 | """ 85 | Sets the optimizer lr to the specified value. 86 | Args: 87 | optimizer (optim): the optimizer using to optimize the current network. 88 | new_lr (float): the new learning rate to set. 89 | """ 90 | for param_group in optimizer.param_groups: 91 | param_group["lr"] = new_lr 92 | -------------------------------------------------------------------------------- /slowfast/utils/benchmark.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Functions for benchmarks. 4 | """ 5 | 6 | import numpy as np 7 | import pprint 8 | import torch 9 | import tqdm 10 | from fvcore.common.timer import Timer 11 | 12 | import slowfast.utils.logging as logging 13 | import slowfast.utils.misc as misc 14 | from slowfast.datasets import loader 15 | from slowfast.utils.env import setup_environment 16 | 17 | logger = logging.get_logger(__name__) 18 | 19 | 20 | def benchmark_data_loading(cfg): 21 | """ 22 | Benchmark the speed of data loading in PySlowFast. 23 | Args: 24 | 25 | cfg (CfgNode): configs. Details can be found in 26 | slowfast/config/defaults.py 27 | """ 28 | # Set up environment. 29 | setup_environment() 30 | # Set random seed from configs. 31 | np.random.seed(cfg.RNG_SEED) 32 | torch.manual_seed(cfg.RNG_SEED) 33 | 34 | # Setup logging format. 35 | logging.setup_logger(cfg.LOGS.DIR) 36 | 37 | # Print config. 38 | logger.info("Benchmark data loading with config:") 39 | logger.info(pprint.pformat(cfg)) 40 | 41 | timer = Timer() 42 | dataloader = loader.construct_loader(cfg, "train") 43 | logger.info( 44 | "Initialize loader using {:.2f} seconds.".format(timer.seconds()) 45 | ) 46 | # Total batch size across different machines. 47 | batch_size = cfg.TRAIN.BATCH_SIZE * cfg.NUM_SHARDS 48 | log_period = cfg.BENCHMARK.LOG_PERIOD 49 | epoch_times = [] 50 | # Test for a few epochs. 51 | for cur_epoch in range(cfg.BENCHMARK.NUM_EPOCHS): 52 | timer = Timer() 53 | timer_epoch = Timer() 54 | iter_times = [] 55 | if cfg.BENCHMARK.SHUFFLE: 56 | loader.shuffle_dataset(dataloader, cur_epoch) 57 | for cur_iter, _ in enumerate(tqdm.tqdm(dataloader)): 58 | if cur_iter > 0 and cur_iter % log_period == 0: 59 | iter_times.append(timer.seconds()) 60 | ram_usage, ram_total = misc.cpu_mem_usage() 61 | logger.info( 62 | "Epoch {}: {} iters ({} videos) in {:.2f} seconds. " 63 | "RAM Usage: {:.2f}/{:.2f} GB.".format( 64 | cur_epoch, 65 | log_period, 66 | log_period * batch_size, 67 | iter_times[-1], 68 | ram_usage, 69 | ram_total, 70 | ) 71 | ) 72 | timer.reset() 73 | epoch_times.append(timer_epoch.seconds()) 74 | ram_usage, ram_total = misc.cpu_mem_usage() 75 | logger.info( 76 | "Epoch {}: in total {} iters ({} videos) in {:.2f} seconds. 
" 77 | "RAM Usage: {:.2f}/{:.2f} GB.".format( 78 | cur_epoch, 79 | len(dataloader), 80 | len(dataloader) * batch_size, 81 | epoch_times[-1], 82 | ram_usage, 83 | ram_total, 84 | ) 85 | ) 86 | logger.info( 87 | "Epoch {}: on average every {} iters ({} videos) take {:.2f}/{:.2f} " 88 | "(avg/std) seconds.".format( 89 | cur_epoch, 90 | log_period, 91 | log_period * batch_size, 92 | np.mean(iter_times), 93 | np.std(iter_times), 94 | ) 95 | ) 96 | logger.info( 97 | "On average every epoch ({} videos) takes {:.2f}/{:.2f} " 98 | "(avg/std) seconds.".format( 99 | len(dataloader) * batch_size, 100 | np.mean(epoch_times), 101 | np.std(epoch_times), 102 | ) 103 | ) 104 | -------------------------------------------------------------------------------- /slowfast/utils/parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Argument parser functions.""" 5 | 6 | import argparse 7 | import sys 8 | 9 | import slowfast.utils.checkpoint as cu 10 | from slowfast.config.defaults import get_cfg 11 | 12 | 13 | def parse_args(): 14 | """ 15 | Parse the following arguments for a default parser for PySlowFast users. 16 | Args: 17 | shard_id (int): shard id for the current machine. Starts from 0 to 18 | num_shards - 1. If single machine is used, then set shard id to 0. 19 | num_shards (int): number of shards using by the job. 20 | init_method (str): initialization method to launch the job with multiple 21 | devices. Options includes TCP or shared file-system for 22 | initialization. details can be find in 23 | https://pytorch.org/docs/stable/distributed.html#tcp-initialization 24 | cfg (str): path to the config file. 25 | opts (argument): provide addtional options from the command line, it 26 | overwrites the config loaded from file. 27 | """ 28 | parser = argparse.ArgumentParser( 29 | description="Provide SlowFast video training and testing pipeline." 30 | ) 31 | parser.add_argument( 32 | "--rank", 33 | help="The shard id of current node, Starts from 0 to num_shards - 1", 34 | default=0, 35 | type=int, 36 | ) 37 | parser.add_argument( 38 | "--world_size", 39 | help="Number of shards using by the job", 40 | default=1, 41 | type=int, 42 | ) 43 | parser.add_argument( 44 | "--init_method", 45 | help="Initialization method, includes TCP or shared file-system.", 46 | default="auto", 47 | type=str, 48 | ) 49 | parser.add_argument( 50 | "--cfg", 51 | dest="cfg_file", 52 | help="Path to the config file", 53 | default="configs/Kinetics/SLOWFAST_4x16_R50.yaml", 54 | type=str, 55 | ) 56 | if len(sys.argv) == 1: 57 | parser.print_help() 58 | return parser.parse_known_args() 59 | 60 | 61 | def load_config(args, opts=None): 62 | """ 63 | Given the arguemnts, load and initialize the configs. 64 | Args: 65 | args (argument): arguments includes `shard_id`, `num_shards`, 66 | `init_method`, `cfg_file`, and `opts`. 67 | """ 68 | # Setup cfg. 69 | cfg = get_cfg() 70 | # Load config from cfg. 71 | if args.cfg_file is not None: 72 | cfg.merge_from_file(args.cfg_file) 73 | # Load config from command line, overwrite config from opts. 
74 |     if opts is not None:
75 |         # remove unknown args
76 |         remove_lists = []
77 |         for i in range(len(opts)):
78 |             if opts[i].startswith("--"):
79 |                 remove_lists.append(opts[i])
80 |                 if "=" not in opts[i]:
81 |                     remove_lists.append(opts[i+1])
82 |             elif opts[i].startswith("-"):
83 |                 remove_lists.append(opts[i])
84 |         for opt in remove_lists:
85 |             print("Removing unknown arg {}".format(opt))
86 |             opts.remove(opt)
87 |         cfg.merge_from_list(opts)
88 | 
89 |     # Inherit parameters from args.
90 |     if hasattr(args, "world_size") and hasattr(args, "rank"):
91 |         cfg.NUM_SHARDS = args.world_size
92 |         cfg.SHARD_ID = args.rank
93 |     if hasattr(args, "rng_seed"):
94 |         cfg.RNG_SEED = args.rng_seed
95 |     if hasattr(args, "output_dir"):
96 |         cfg.LOGS.DIR = args.output_dir
97 | 
98 |     # Create the checkpoint dir.
99 |     cu.make_checkpoint_dir(cfg.LOGS.DIR)
100 |     cfg.freeze()
101 |     return cfg
102 | 
--------------------------------------------------------------------------------
/slowfast/utils/setup_moxing_env.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 | import os
3 | import filelock
4 | import logging
5 | import tempfile
6 | import six
7 | 
8 | import torch
9 | filelock.logger().setLevel(logging.WARNING)
10 | 
11 | 
12 | def safe_s3_cache(org_path, targ_path, copy_type):
13 |     import moxing as mox
14 |     mox.file.shift("os", "mox")
15 | 
16 |     safe_flag = targ_path + ".safe"
17 |     if os.path.exists(safe_flag):
18 |         return
19 |     lock = filelock.FileLock(targ_path + ".lock")
20 |     with lock:
21 |         if not os.path.exists(safe_flag) and os.path.exists(org_path):
22 |             if copy_type == "file":
23 |                 mox.file.copy(org_path, targ_path)
24 |             else:
25 |                 mox.file.copy_parallel(org_path, targ_path, is_processing=False)
26 |             open(safe_flag, "a").close()
27 | 
28 | 
29 | def wrap_input_path(module, func_name, tmp_dir="/cache/", copy_method="file"):
30 |     origin_func = getattr(module, func_name)
31 | 
32 |     def wrapped_func(input_path, *args, **kwargs):
33 |         if input_path.startswith("obs://"):
34 |             import moxing as mox
35 |             mox.file.shift("os", "mox")
36 | 
37 |             relative_path = os.path.join("obs/", input_path[6:])
38 |             local_path = os.path.join(tmp_dir, relative_path)
39 |             local_dir, _ = os.path.split(local_path)
40 |             os.makedirs(local_dir, exist_ok=True)
41 |             if copy_method == "file":
42 |                 safe_s3_cache(input_path, local_path, copy_method)
43 |             else:
44 |                 safe_s3_cache(os.path.split(input_path)[0], local_dir, copy_method)
45 |             return origin_func(local_path, *args, **kwargs)
46 |         else:
47 |             return origin_func(input_path, *args, **kwargs)
48 | 
49 |     setattr(module, func_name, wrapped_func)
50 | 
51 | 
52 | def wrap_output_path(module, func_name, tmp_dir="/cache/"):
53 |     origin_func = getattr(module, func_name)
54 | 
55 |     def wrapped_func(data, output_path, *args, **kwargs):
56 |         if isinstance(output_path, six.string_types) and output_path.startswith("obs://"):
57 |             import moxing as mox
58 |             mox.file.shift("os", "mox")
59 | 
60 |             with tempfile.NamedTemporaryFile(dir=tmp_dir) as f:
61 |                 temp_path = f.name
62 |                 origin_ret = origin_func(data, temp_path, *args, **kwargs)
63 |                 mox.file.copy(temp_path, output_path)
64 |         else:
65 |             origin_ret = origin_func(data, output_path, *args, **kwargs)
66 |         return origin_ret
67 | 
68 |     setattr(module, func_name, wrapped_func)
69 | 
70 | 
71 | def wrap_input_path2(input_path, tmp_dir="/cache/", copy_method="file"):
72 |     if input_path.startswith("obs://"):
73 |         import moxing as mox
74 |         mox.file.shift("os", "mox")
75 | 
76 |         relative_path =
os.path.join("obs/", input_path[6:]) 77 | local_path = os.path.join(tmp_dir, relative_path) 78 | local_dir, _ = os.path.split(local_path) 79 | os.makedirs(local_dir, exist_ok=True) 80 | if copy_method == "file": 81 | safe_s3_cache(input_path, local_path, copy_method) 82 | else: 83 | safe_s3_cache(os.path.split(input_path)[0], local_dir, copy_method) 84 | return local_path 85 | else: 86 | return input_path 87 | 88 | 89 | def wrap_output_path2(origin_func, data, output_path, *args, **kwargs): 90 | if isinstance(output_path, six.string_types) and output_path.startswith("obs://"): 91 | import moxing as mox 92 | mox.file.shift("os", "mox") 93 | 94 | with tempfile.NamedTemporaryFile(dir="/cache/") as f: 95 | temp_path = f.name 96 | origin_ret = origin_func(data, temp_path, *args, **kwargs) 97 | mox.file.copy(temp_path, output_path) 98 | else: 99 | origin_ret = origin_func(data, output_path, *args, **kwargs) 100 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/ava_action_list_v2.1_for_activitynet_2018.pbtxt.txt: -------------------------------------------------------------------------------- 1 | item { 2 | name: "bend/bow (at the waist)" 3 | id: 1 4 | } 5 | item { 6 | name: "crouch/kneel" 7 | id: 3 8 | } 9 | item { 10 | name: "dance" 11 | id: 4 12 | } 13 | item { 14 | name: "fall down" 15 | id: 5 16 | } 17 | item { 18 | name: "get up" 19 | id: 6 20 | } 21 | item { 22 | name: "jump/leap" 23 | id: 7 24 | } 25 | item { 26 | name: "lie/sleep" 27 | id: 8 28 | } 29 | item { 30 | name: "martial art" 31 | id: 9 32 | } 33 | item { 34 | name: "run/jog" 35 | id: 10 36 | } 37 | item { 38 | name: "sit" 39 | id: 11 40 | } 41 | item { 42 | name: "stand" 43 | id: 12 44 | } 45 | item { 46 | name: "swim" 47 | id: 13 48 | } 49 | item { 50 | name: "walk" 51 | id: 14 52 | } 53 | item { 54 | name: "answer phone" 55 | id: 15 56 | } 57 | item { 58 | name: "carry/hold (an object)" 59 | id: 17 60 | } 61 | item { 62 | name: "climb (e.g., a mountain)" 63 | id: 20 64 | } 65 | item { 66 | name: "close (e.g., a door, a box)" 67 | id: 22 68 | } 69 | item { 70 | name: "cut" 71 | id: 24 72 | } 73 | item { 74 | name: "dress/put on clothing" 75 | id: 26 76 | } 77 | item { 78 | name: "drink" 79 | id: 27 80 | } 81 | item { 82 | name: "drive (e.g., a car, a truck)" 83 | id: 28 84 | } 85 | item { 86 | name: "eat" 87 | id: 29 88 | } 89 | item { 90 | name: "enter" 91 | id: 30 92 | } 93 | item { 94 | name: "hit (an object)" 95 | id: 34 96 | } 97 | item { 98 | name: "lift/pick up" 99 | id: 36 100 | } 101 | item { 102 | name: "listen (e.g., to music)" 103 | id: 37 104 | } 105 | item { 106 | name: "open (e.g., a window, a car door)" 107 | id: 38 108 | } 109 | item { 110 | name: "play musical instrument" 111 | id: 41 112 | } 113 | item { 114 | name: "point to (an object)" 115 | id: 43 116 | } 117 | item { 118 | name: "pull (an object)" 119 | id: 45 120 | } 121 | item { 122 | name: "push (an object)" 123 | id: 46 124 | } 125 | item { 126 | name: "put down" 127 | id: 47 128 | } 129 | item { 130 | name: "read" 131 | id: 48 132 | } 133 | item { 134 | name: "ride (e.g., a bike, a car, a horse)" 135 | id: 49 136 | } 137 | item { 138 | name: "sail boat" 139 | id: 51 140 | } 141 | item { 142 | name: "shoot" 143 | id: 52 144 | } 145 | item { 146 | name: "smoke" 147 | id: 54 148 | } 149 | item { 150 | name: "take a photo" 151 | id: 56 152 | } 153 | item { 154 | name: "text on/look at a cellphone" 155 | id: 57 156 | } 157 | item { 158 | name: "throw" 159 | id: 58 160 | } 161 | item { 162 | 
name: "touch (an object)" 163 | id: 59 164 | } 165 | item { 166 | name: "turn (e.g., a screwdriver)" 167 | id: 60 168 | } 169 | item { 170 | name: "watch (e.g., TV)" 171 | id: 61 172 | } 173 | item { 174 | name: "work on a computer" 175 | id: 62 176 | } 177 | item { 178 | name: "write" 179 | id: 63 180 | } 181 | item { 182 | name: "fight/hit (a person)" 183 | id: 64 184 | } 185 | item { 186 | name: "give/serve (an object) to (a person)" 187 | id: 65 188 | } 189 | item { 190 | name: "grab (a person)" 191 | id: 66 192 | } 193 | item { 194 | name: "hand clap" 195 | id: 67 196 | } 197 | item { 198 | name: "hand shake" 199 | id: 68 200 | } 201 | item { 202 | name: "hand wave" 203 | id: 69 204 | } 205 | item { 206 | name: "hug (a person)" 207 | id: 70 208 | } 209 | item { 210 | name: "kiss (a person)" 211 | id: 72 212 | } 213 | item { 214 | name: "lift (a person)" 215 | id: 73 216 | } 217 | item { 218 | name: "listen to (a person)" 219 | id: 74 220 | } 221 | item { 222 | name: "push (another person)" 223 | id: 76 224 | } 225 | item { 226 | name: "sing to (e.g., self, a person, a group)" 227 | id: 77 228 | } 229 | item { 230 | name: "take (an object) from (a person)" 231 | id: 78 232 | } 233 | item { 234 | name: "talk to (e.g., self, a person, a group)" 235 | id: 79 236 | } 237 | item { 238 | name: "watch (a person)" 239 | id: 80 240 | } 241 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/np_box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, 4] numpy arrays representing bounding boxes. 17 | 18 | Example box operations that are supported: 19 | * Areas: compute bounding box areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | from __future__ import ( 23 | absolute_import, 24 | division, 25 | print_function, 26 | unicode_literals, 27 | ) 28 | import numpy as np 29 | 30 | 31 | def area(boxes): 32 | """Computes area of boxes. 33 | 34 | Args: 35 | boxes: Numpy array with shape [N, 4] holding N boxes 36 | 37 | Returns: 38 | a numpy array with shape [N*1] representing box areas 39 | """ 40 | return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) 41 | 42 | 43 | def intersection(boxes1, boxes2): 44 | """Compute pairwise intersection areas between boxes. 
45 | 46 | Args: 47 | boxes1: a numpy array with shape [N, 4] holding N boxes 48 | boxes2: a numpy array with shape [M, 4] holding M boxes 49 | 50 | Returns: 51 | a numpy array with shape [N*M] representing pairwise intersection area 52 | """ 53 | [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) 54 | [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) 55 | 56 | all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) 57 | all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) 58 | intersect_heights = np.maximum( 59 | np.zeros(all_pairs_max_ymin.shape), 60 | all_pairs_min_ymax - all_pairs_max_ymin, 61 | ) 62 | all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) 63 | all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) 64 | intersect_widths = np.maximum( 65 | np.zeros(all_pairs_max_xmin.shape), 66 | all_pairs_min_xmax - all_pairs_max_xmin, 67 | ) 68 | return intersect_heights * intersect_widths 69 | 70 | 71 | def iou(boxes1, boxes2): 72 | """Computes pairwise intersection-over-union between box collections. 73 | 74 | Args: 75 | boxes1: a numpy array with shape [N, 4] holding N boxes. 76 | boxes2: a numpy array with shape [M, 4] holding N boxes. 77 | 78 | Returns: 79 | a numpy array with shape [N, M] representing pairwise iou scores. 80 | """ 81 | intersect = intersection(boxes1, boxes2) 82 | area1 = area(boxes1) 83 | area2 = area(boxes2) 84 | union = ( 85 | np.expand_dims(area1, axis=1) 86 | + np.expand_dims(area2, axis=0) 87 | - intersect 88 | ) 89 | return intersect / union 90 | 91 | 92 | def ioa(boxes1, boxes2): 93 | """Computes pairwise intersection-over-area between box collections. 94 | 95 | Intersection-over-area (ioa) between two boxes box1 and box2 is defined as 96 | their intersection area over box2's area. Note that ioa is not symmetric, 97 | that is, IOA(box1, box2) != IOA(box2, box1). 98 | 99 | Args: 100 | boxes1: a numpy array with shape [N, 4] holding N boxes. 101 | boxes2: a numpy array with shape [M, 4] holding N boxes. 102 | 103 | Returns: 104 | a numpy array with shape [N, M] representing pairwise ioa scores. 105 | """ 106 | intersect = intersection(boxes1, boxes2) 107 | areas = np.expand_dims(area(boxes2), axis=0) 108 | return intersect / areas 109 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/np_mask_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, height, width] numpy arrays representing masks. 
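A toy check of the `area`/`intersection`/`iou` helpers above, assuming the module is importable as `slowfast.utils.ava_evaluation.np_box_ops` per the tree at the top:

```python
import numpy as np
from slowfast.utils.ava_evaluation import np_box_ops

boxes1 = np.array([[0.0, 0.0, 1.0, 1.0]], dtype=np.float32)
boxes2 = np.array(
    [[0.0, 0.0, 1.0, 1.0], [0.5, 0.5, 1.5, 1.5]], dtype=np.float32
)
# Identical boxes -> IoU 1.0; 0.25 overlap over union 1.75 -> ~0.1429.
print(np_box_ops.iou(boxes1, boxes2))  # approximately [[1.0, 0.1429]]
```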
17 | 18 | Example mask operations that are supported: 19 | * Areas: compute mask areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | from __future__ import ( 23 | absolute_import, 24 | division, 25 | print_function, 26 | unicode_literals, 27 | ) 28 | import numpy as np 29 | 30 | EPSILON = 1e-7 31 | 32 | 33 | def area(masks): 34 | """Computes area of masks. 35 | 36 | Args: 37 | masks: Numpy array with shape [N, height, width] holding N masks. Masks 38 | values are of type np.uint8 and values are in {0,1}. 39 | 40 | Returns: 41 | a numpy array with shape [N*1] representing mask areas. 42 | 43 | Raises: 44 | ValueError: If masks.dtype is not np.uint8 45 | """ 46 | if masks.dtype != np.uint8: 47 | raise ValueError("Masks type should be np.uint8") 48 | return np.sum(masks, axis=(1, 2), dtype=np.float32) 49 | 50 | 51 | def intersection(masks1, masks2): 52 | """Compute pairwise intersection areas between masks. 53 | 54 | Args: 55 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 56 | values are of type np.uint8 and values are in {0,1}. 57 | masks2: a numpy array with shape [M, height, width] holding M masks. Masks 58 | values are of type np.uint8 and values are in {0,1}. 59 | 60 | Returns: 61 | a numpy array with shape [N*M] representing pairwise intersection area. 62 | 63 | Raises: 64 | ValueError: If masks1 and masks2 are not of type np.uint8. 65 | """ 66 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 67 | raise ValueError("masks1 and masks2 should be of type np.uint8") 68 | n = masks1.shape[0] 69 | m = masks2.shape[0] 70 | answer = np.zeros([n, m], dtype=np.float32) 71 | for i in np.arange(n): 72 | for j in np.arange(m): 73 | answer[i, j] = np.sum( 74 | np.minimum(masks1[i], masks2[j]), dtype=np.float32 75 | ) 76 | return answer 77 | 78 | 79 | def iou(masks1, masks2): 80 | """Computes pairwise intersection-over-union between mask collections. 81 | 82 | Args: 83 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 84 | values are of type np.uint8 and values are in {0,1}. 85 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 86 | values are of type np.uint8 and values are in {0,1}. 87 | 88 | Returns: 89 | a numpy array with shape [N, M] representing pairwise iou scores. 90 | 91 | Raises: 92 | ValueError: If masks1 and masks2 are not of type np.uint8. 93 | """ 94 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 95 | raise ValueError("masks1 and masks2 should be of type np.uint8") 96 | intersect = intersection(masks1, masks2) 97 | area1 = area(masks1) 98 | area2 = area(masks2) 99 | union = ( 100 | np.expand_dims(area1, axis=1) 101 | + np.expand_dims(area2, axis=0) 102 | - intersect 103 | ) 104 | return intersect / np.maximum(union, EPSILON) 105 | 106 | 107 | def ioa(masks1, masks2): 108 | """Computes pairwise intersection-over-area between box collections. 109 | 110 | Intersection-over-area (ioa) between two masks, mask1 and mask2 is defined as 111 | their intersection area over mask2's area. Note that ioa is not symmetric, 112 | that is, IOA(mask1, mask2) != IOA(mask2, mask1). 113 | 114 | Args: 115 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 116 | values are of type np.uint8 and values are in {0,1}. 117 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 118 | values are of type np.uint8 and values are in {0,1}. 119 | 120 | Returns: 121 | a numpy array with shape [N, M] representing pairwise ioa scores. 
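A matching toy check for the mask variant above (uint8 masks with values in {0, 1}):

```python
import numpy as np
from slowfast.utils.ava_evaluation import np_mask_ops

m1 = np.zeros((1, 4, 4), dtype=np.uint8)
m1[0, :2, :] = 1  # top half, area 8
m2 = np.zeros((1, 4, 4), dtype=np.uint8)
m2[0, :, :2] = 1  # left half, area 8
# Intersection 4, union 8 + 8 - 4 = 12 -> IoU 1/3.
print(np_mask_ops.iou(m1, m2))  # approximately [[0.3333]]
```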
122 | 123 | Raises: 124 | ValueError: If masks1 and masks2 are not of type np.uint8. 125 | """ 126 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 127 | raise ValueError("masks1 and masks2 should be of type np.uint8") 128 | intersect = intersection(masks1, masks2) 129 | areas = np.expand_dims(area(masks2), axis=0) 130 | return intersect / (areas + EPSILON) 131 | -------------------------------------------------------------------------------- /slowfast/utils/c2_model_loading.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Caffe2 to PyTorch checkpoint name converting utility.""" 5 | 6 | import re 7 | 8 | 9 | def get_name_convert_func(): 10 | """ 11 | Get the function to convert Caffe2 layer names to PyTorch layer names. 12 | Returns: 13 | (func): function to convert parameter name from Caffe2 format to PyTorch 14 | format. 15 | """ 16 | pairs = [ 17 | # ------------------------------------------------------------ 18 | # 'nonlocal_conv3_1_theta_w' -> 's3.pathway0_nonlocal3.conv_g.weight' 19 | [ 20 | r"^nonlocal_conv([0-9]+)_([0-9]+)_(.*)", 21 | r"s\1.pathway0_nonlocal\2_\3", 22 | ], 23 | # 'theta' -> 'conv_theta' 24 | [r"^(.*)_nonlocal([0-9]+)_(theta)(.*)", r"\1_nonlocal\2.conv_\3\4"], 25 | # 'g' -> 'conv_g' 26 | [r"^(.*)_nonlocal([0-9]+)_(g)(.*)", r"\1_nonlocal\2.conv_\3\4"], 27 | # 'phi' -> 'conv_phi' 28 | [r"^(.*)_nonlocal([0-9]+)_(phi)(.*)", r"\1_nonlocal\2.conv_\3\4"], 29 | # 'out' -> 'conv_out' 30 | [r"^(.*)_nonlocal([0-9]+)_(out)(.*)", r"\1_nonlocal\2.conv_\3\4"], 31 | # 'nonlocal_conv4_5_bn_s' -> 's4.pathway0_nonlocal3.bn.weight' 32 | [r"^(.*)_nonlocal([0-9]+)_(bn)_(.*)", r"\1_nonlocal\2.\3.\4"], 33 | # ------------------------------------------------------------ 34 | # 't_pool1_subsample_bn' -> 's1_fuse.conv_f2s.bn.running_mean' 35 | [r"^t_pool1_subsample_bn_(.*)", r"s1_fuse.bn.\1"], 36 | # 't_pool1_subsample' -> 's1_fuse.conv_f2s' 37 | [r"^t_pool1_subsample_(.*)", r"s1_fuse.conv_f2s.\1"], 38 | # 't_res4_5_branch2c_bn_subsample_bn_rm' -> 's4_fuse.conv_f2s.bias' 39 | [ 40 | r"^t_res([0-9]+)_([0-9]+)_branch2c_bn_subsample_bn_(.*)", 41 | r"s\1_fuse.bn.\3", 42 | ], 43 | # 't_pool1_subsample' -> 's1_fuse.conv_f2s' 44 | [ 45 | r"^t_res([0-9]+)_([0-9]+)_branch2c_bn_subsample_(.*)", 46 | r"s\1_fuse.conv_f2s.\3", 47 | ], 48 | # ------------------------------------------------------------ 49 | # 'res4_4_branch_2c_bn_b' -> 's4.pathway0_res4.branch2.c_bn_b' 50 | [ 51 | r"^res([0-9]+)_([0-9]+)_branch([0-9]+)([a-z])_(.*)", 52 | r"s\1.pathway0_res\2.branch\3.\4_\5", 53 | ], 54 | # 'res_conv1_bn_' -> 's1.pathway0_stem.bn.' 55 | [r"^res_conv1_bn_(.*)", r"s1.pathway0_stem.bn.\1"], 56 | # 'conv1_w_momentum' -> 's1.pathway0_stem.conv.' 57 | [r"^conv1_(.*)", r"s1.pathway0_stem.conv.\1"], 58 | # 'res4_0_branch1_w' -> 'S4.pathway0_res0.branch1.weight' 59 | [ 60 | r"^res([0-9]+)_([0-9]+)_branch([0-9]+)_(.*)", 61 | r"s\1.pathway0_res\2.branch\3_\4", 62 | ], 63 | # 'res_conv1_' -> 's1.pathway0_stem.conv.' 64 | [r"^res_conv1_(.*)", r"s1.pathway0_stem.conv.\1"], 65 | # ------------------------------------------------------------ 66 | # 'res4_4_branch_2c_bn_b' -> 's4.pathway0_res4.branch2.c_bn_b' 67 | [ 68 | r"^t_res([0-9]+)_([0-9]+)_branch([0-9]+)([a-z])_(.*)", 69 | r"s\1.pathway1_res\2.branch\3.\4_\5", 70 | ], 71 | # 'res_conv1_bn_' -> 's1.pathway0_stem.bn.' 
72 | [r"^t_res_conv1_bn_(.*)", r"s1.pathway1_stem.bn.\1"], 73 | # 'conv1_w_momentum' -> 's1.pathway0_stem.conv.' 74 | [r"^t_conv1_(.*)", r"s1.pathway1_stem.conv.\1"], 75 | # 'res4_0_branch1_w' -> 'S4.pathway0_res0.branch1.weight' 76 | [ 77 | r"^t_res([0-9]+)_([0-9]+)_branch([0-9]+)_(.*)", 78 | r"s\1.pathway1_res\2.branch\3_\4", 79 | ], 80 | # 'res_conv1_' -> 's1.pathway0_stem.conv.' 81 | [r"^t_res_conv1_(.*)", r"s1.pathway1_stem.conv.\1"], 82 | # ------------------------------------------------------------ 83 | # pred_ -> head.projection. 84 | [r"pred_(.*)", r"head.projection.\1"], 85 | # '.bn_b' -> '.weight' 86 | [r"(.*)bn.b\Z", r"\1bn.bias"], 87 | # '.bn_s' -> '.weight' 88 | [r"(.*)bn.s\Z", r"\1bn.weight"], 89 | # '_bn_rm' -> '.running_mean' 90 | [r"(.*)bn.rm\Z", r"\1bn.running_mean"], 91 | # '_bn_riv' -> '.running_var' 92 | [r"(.*)bn.riv\Z", r"\1bn.running_var"], 93 | # '_b' -> '.bias' 94 | [r"(.*)[\._]b\Z", r"\1.bias"], 95 | # '_w' -> '.weight' 96 | [r"(.*)[\._]w\Z", r"\1.weight"], 97 | ] 98 | 99 | def convert_caffe2_name_to_pytorch(caffe2_layer_name): 100 | """ 101 | Convert the caffe2_layer_name to pytorch format by apply the list of 102 | regular expressions. 103 | Args: 104 | caffe2_layer_name (str): caffe2 layer name. 105 | Returns: 106 | (str): pytorch layer name. 107 | """ 108 | for source, dest in pairs: 109 | caffe2_layer_name = re.sub(source, dest, caffe2_layer_name) 110 | return caffe2_layer_name 111 | 112 | return convert_caffe2_name_to_pytorch 113 | -------------------------------------------------------------------------------- /slowfast/datasets/loader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Data loader.""" 5 | 6 | import itertools 7 | import numpy as np 8 | import torch 9 | from torch.utils.data._utils.collate import default_collate 10 | from torch.utils.data.distributed import DistributedSampler 11 | from torch.utils.data.sampler import RandomSampler 12 | 13 | from slowfast.datasets.multigrid_helper import ShortCycleBatchSampler 14 | 15 | from .build import build_dataset 16 | 17 | 18 | def detection_collate(batch): 19 | """ 20 | Collate function for detection task. Concatanate bboxes, labels and 21 | metadata from different samples in the first dimension instead of 22 | stacking them to have a batch-size dimension. 23 | Args: 24 | batch (tuple or list): data batch to collate. 25 | Returns: 26 | (tuple): collated detection data batch. 27 | """ 28 | inputs, labels, video_idx, extra_data = zip(*batch) 29 | inputs, video_idx = default_collate(inputs), default_collate(video_idx) 30 | labels = torch.tensor(np.concatenate(labels, axis=0)).float() 31 | 32 | collated_extra_data = {} 33 | for key in extra_data[0].keys(): 34 | data = [d[key] for d in extra_data] 35 | if key == "boxes" or key == "ori_boxes": 36 | # Append idx info to the bboxes before concatenating them. 
37 | bboxes = [ 38 | np.concatenate( 39 | [np.full((data[i].shape[0], 1), float(i)), data[i]], axis=1 40 | ) 41 | for i in range(len(data)) 42 | ] 43 | bboxes = np.concatenate(bboxes, axis=0) 44 | collated_extra_data[key] = torch.tensor(bboxes).float() 45 | elif key == "step_idxes": 46 | collated_extra_data[key] = torch.tensor(np.concatenate(data, axis=0)) 47 | elif key == "metadata": 48 | collated_extra_data[key] = torch.tensor( 49 | list(itertools.chain(*data)) 50 | ).view(-1, 3) 51 | else: 52 | collated_extra_data[key] = default_collate(data) 53 | 54 | return inputs, labels, video_idx, collated_extra_data 55 | 56 | 57 | def construct_loader(cfg, split, is_precise_bn=False): 58 | """ 59 | Constructs the data loader for the given dataset. 60 | Args: 61 | cfg (CfgNode): configs. Details can be found in 62 | slowfast/config/defaults.py 63 | split (str): the split of the data loader. Options include `train`, 64 | `val`, and `test`. 65 | """ 66 | assert split in ["train", "val", "test"] 67 | if split in ["train"]: 68 | dataset_name = cfg.TRAIN.DATASET 69 | batch_size = int(cfg.TRAIN.BATCH_SIZE / cfg.NUM_GPUS) 70 | shuffle = True 71 | drop_last = True 72 | elif split in ["val"]: 73 | dataset_name = cfg.TRAIN.DATASET 74 | batch_size = int(cfg.TEST.BATCH_SIZE / cfg.NUM_GPUS) 75 | shuffle = False 76 | drop_last = False 77 | elif split in ["test"]: 78 | dataset_name = cfg.TEST.DATASET 79 | batch_size = int(cfg.TEST.BATCH_SIZE / cfg.NUM_GPUS) 80 | shuffle = False 81 | drop_last = False 82 | 83 | # Construct the dataset 84 | dataset = build_dataset(dataset_name, cfg, split) 85 | 86 | if cfg.MULTIGRID.SHORT_CYCLE and split in ["train"] and not is_precise_bn: 87 | # Create a sampler for multi-process training 88 | sampler = ( 89 | DistributedSampler(dataset) 90 | if cfg.NUM_GPUS > 1 91 | else RandomSampler(dataset) 92 | ) 93 | batch_sampler = ShortCycleBatchSampler( 94 | sampler, batch_size=batch_size, drop_last=drop_last, cfg=cfg 95 | ) 96 | # Create a loader 97 | loader = torch.utils.data.DataLoader( 98 | dataset, 99 | batch_sampler=batch_sampler, 100 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 101 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 102 | ) 103 | else: 104 | # Create a sampler for multi-process training 105 | sampler = DistributedSampler(dataset) if cfg.NUM_GPUS > 1 else None 106 | # Create a loader 107 | loader = torch.utils.data.DataLoader( 108 | dataset, 109 | batch_size=batch_size, 110 | shuffle=(False if sampler else shuffle), 111 | sampler=sampler, 112 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 113 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 114 | drop_last=drop_last, 115 | collate_fn=detection_collate if cfg.DETECTION.ENABLE else None, 116 | ) 117 | return loader 118 | 119 | 120 | def shuffle_dataset(loader, cur_epoch): 121 | """" 122 | Shuffles the data. 123 | Args: 124 | loader (loader): data loader to perform shuffle. 125 | cur_epoch (int): number of the current epoch. 
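The box-index bookkeeping in `detection_collate` above prepends each sample's position in the batch as a first column, so boxes from variable-length samples can share one flat tensor. A standalone sketch of that step with toy data:

```python
import numpy as np

per_sample_boxes = [
    np.array([[10.0, 10.0, 50.0, 50.0]]),                      # sample 0: 1 box
    np.array([[0.0, 0.0, 20.0, 20.0], [5.0, 5.0, 9.0, 9.0]]),  # sample 1: 2 boxes
]
# Prepend the sample index as a column, then concatenate along dim 0.
rows = [
    np.concatenate([np.full((b.shape[0], 1), float(i)), b], axis=1)
    for i, b in enumerate(per_sample_boxes)
]
print(np.concatenate(rows, axis=0))
# [[ 0. 10. 10. 50. 50.]
#  [ 1.  0.  0. 20. 20.]
#  [ 1.  5.  5.  9.  9.]]
```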
126 | """ 127 | sampler = ( 128 | loader.batch_sampler.sampler 129 | if isinstance(loader.batch_sampler, ShortCycleBatchSampler) 130 | else loader.sampler 131 | ) 132 | assert isinstance( 133 | sampler, (RandomSampler, DistributedSampler) 134 | ), "Sampler type '{}' not supported".format(type(sampler)) 135 | # RandomSampler handles shuffling automatically 136 | if isinstance(sampler, DistributedSampler): 137 | # DistributedSampler shuffles data based on epoch 138 | sampler.set_epoch(cur_epoch) 139 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/np_box_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Numpy BoxList classes and functions.""" 17 | 18 | from __future__ import ( 19 | absolute_import, 20 | division, 21 | print_function, 22 | unicode_literals, 23 | ) 24 | import numpy as np 25 | 26 | 27 | class BoxList(object): 28 | """Box collection. 29 | 30 | BoxList represents a list of bounding boxes as numpy array, where each 31 | bounding box is represented as a row of 4 numbers, 32 | [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within a 33 | given list correspond to a single image. 34 | 35 | Optionally, users can add additional related fields (such as 36 | objectness/classification scores). 37 | """ 38 | 39 | def __init__(self, data): 40 | """Constructs box collection. 41 | 42 | Args: 43 | data: a numpy array of shape [N, 4] representing box coordinates 44 | 45 | Raises: 46 | ValueError: if bbox data is not a numpy array 47 | ValueError: if invalid dimensions for bbox data 48 | """ 49 | if not isinstance(data, np.ndarray): 50 | raise ValueError("data must be a numpy array.") 51 | if len(data.shape) != 2 or data.shape[1] != 4: 52 | raise ValueError("Invalid dimensions for box data.") 53 | if data.dtype != np.float32 and data.dtype != np.float64: 54 | raise ValueError( 55 | "Invalid data type for box data: float is required." 56 | ) 57 | if not self._is_valid_boxes(data): 58 | raise ValueError( 59 | "Invalid box data. data must be a numpy array of " 60 | "N*[y_min, x_min, y_max, x_max]" 61 | ) 62 | self.data = {"boxes": data} 63 | 64 | def num_boxes(self): 65 | """Return number of boxes held in collections.""" 66 | return self.data["boxes"].shape[0] 67 | 68 | def get_extra_fields(self): 69 | """Return all non-box fields.""" 70 | return [k for k in self.data.keys() if k != "boxes"] 71 | 72 | def has_field(self, field): 73 | return field in self.data 74 | 75 | def add_field(self, field, field_data): 76 | """Add data to a specified field. 77 | 78 | Args: 79 | field: a string parameter used to speficy a related field to be accessed. 80 | field_data: a numpy array of [N, ...] 
representing the data associated
81 |             with the field.
82 |         Raises:
83 |             ValueError: if the field already exists or the dimension of the
84 |                 field data does not match the number of boxes.
85 |         """
86 |         if self.has_field(field):
87 |             raise ValueError("Field " + field + " already exists")
88 |         if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes():
89 |             raise ValueError("Invalid dimensions for field data")
90 |         self.data[field] = field_data
91 | 
92 |     def get(self):
93 |         """Convenience function for accessing box coordinates.
94 | 
95 |         Returns:
96 |             a numpy array of shape [N, 4] representing box corners
97 |         """
98 |         return self.get_field("boxes")
99 | 
100 |     def get_field(self, field):
101 |         """Accesses data associated with the specified field in the box collection.
102 | 
103 |         Args:
104 |             field: a string parameter used to specify a related field to be accessed.
105 | 
106 |         Returns:
107 |             a numpy 1-d array representing data of an associated field
108 | 
109 |         Raises:
110 |             ValueError: if invalid field
111 |         """
112 |         if not self.has_field(field):
113 |             raise ValueError("field {} does not exist".format(field))
114 |         return self.data[field]
115 | 
116 |     def get_coordinates(self):
117 |         """Get corner coordinates of boxes.
118 | 
119 |         Returns:
120 |             a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max]
121 |         """
122 |         box_coordinates = self.get()
123 |         y_min = box_coordinates[:, 0]
124 |         x_min = box_coordinates[:, 1]
125 |         y_max = box_coordinates[:, 2]
126 |         x_max = box_coordinates[:, 3]
127 |         return [y_min, x_min, y_max, x_max]
128 | 
129 |     def _is_valid_boxes(self, data):
130 |         """Check whether data fulfills the format of N*[y_min, x_min, y_max, x_max].
131 | 
132 |         Args:
133 |             data: a numpy array of shape [N, 4] representing box coordinates
134 | 
135 |         Returns:
136 |             a boolean indicating whether all ymax of boxes are equal or greater than
137 |                 ymin, and all xmax of boxes are equal or greater than xmin.
138 |         """
139 |         if data.shape[0] > 0:
140 |             for i in range(data.shape[0]):
141 |                 if data[i, 0] > data[i, 2] or data[i, 1] > data[i, 3]:
142 |                     return False
143 |         return True
144 | 
--------------------------------------------------------------------------------
/slowfast/utils/ava_evaluation/metrics.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Functions for computing metrics like precision, recall, CorLoc, etc."""
17 | from __future__ import division
18 | import numpy as np
19 | 
20 | 
21 | def compute_precision_recall(scores, labels, num_gt):
22 |     """Compute precision and recall.
/slowfast/utils/ava_evaluation/metrics.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Functions for computing metrics like precision, recall, and CorLoc."""
17 | from __future__ import division
18 | import numpy as np
19 | 
20 | 
21 | def compute_precision_recall(scores, labels, num_gt):
22 |     """Compute precision and recall.
23 | 
24 |     Args:
25 |         scores: A float numpy array representing detection score
26 |         labels: A boolean numpy array representing true/false positive labels
27 |         num_gt: Number of ground truth instances
28 | 
29 |     Raises:
30 |         ValueError: if the input is not of the correct format
31 | 
32 |     Returns:
33 |         precision: Fraction of positive instances over detected ones. This value is
34 |             None if no ground truth labels are present.
35 |         recall: Fraction of detected positive instances over all positive instances.
36 |             This value is None if no ground truth labels are present.
37 | 
38 |     """
39 |     if (
40 |         not isinstance(labels, np.ndarray)
41 |         or labels.dtype != bool  # `np.bool` was removed in modern NumPy
42 |         or len(labels.shape) != 1
43 |     ):
44 |         raise ValueError("labels must be single dimension bool numpy array")
45 | 
46 |     if not isinstance(scores, np.ndarray) or len(scores.shape) != 1:
47 |         raise ValueError("scores must be single dimension numpy array")
48 | 
49 |     if num_gt < np.sum(labels):
50 |         raise ValueError(
51 |             "Number of true positives must not exceed num_gt."
52 |         )
53 | 
54 |     if len(scores) != len(labels):
55 |         raise ValueError("scores and labels must be of the same size.")
56 | 
57 |     if num_gt == 0:
58 |         return None, None
59 | 
60 |     sorted_indices = np.argsort(scores)
61 |     sorted_indices = sorted_indices[::-1]
62 |     labels = labels.astype(int)
63 |     true_positive_labels = labels[sorted_indices]
64 |     false_positive_labels = 1 - true_positive_labels
65 |     cum_true_positives = np.cumsum(true_positive_labels)
66 |     cum_false_positives = np.cumsum(false_positive_labels)
67 |     precision = cum_true_positives.astype(float) / (
68 |         cum_true_positives + cum_false_positives
69 |     )
70 |     recall = cum_true_positives.astype(float) / num_gt
71 |     return precision, recall
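# Editor's note: a worked example of compute_precision_recall() above and
# compute_average_precision() below, with hand-checked numbers:
#
#   scores = np.array([0.9, 0.8, 0.7, 0.6])
#   labels = np.array([True, False, True, True])
#   precision, recall = compute_precision_recall(scores, labels, num_gt=4)
#   # precision == [1.0, 0.5, 2/3, 0.75]; recall == [0.25, 0.25, 0.5, 0.75]
#   ap = compute_average_precision(precision, recall)
#   # == 0.25 * 1.0 + 0.25 * 0.75 + 0.25 * 0.75 + 0.25 * 0.0 == 0.625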
72 | 
73 | 
74 | def compute_average_precision(precision, recall):
75 |     """Compute Average Precision according to the definition in VOCdevkit.
76 | 
77 |     Precision is modified to ensure that it does not decrease as recall
78 |     decreases.
79 | 
80 |     Args:
81 |         precision: A float [N, 1] numpy array of precisions
82 |         recall: A float [N, 1] numpy array of recalls
83 | 
84 |     Raises:
85 |         ValueError: if the input is not of the correct format
86 | 
87 |     Returns:
88 |         average_precision: The area under the precision recall curve. NaN if
89 |             precision and recall are None.
90 | 
91 |     """
92 |     if precision is None:
93 |         if recall is not None:
94 |             raise ValueError("If precision is None, recall must also be None")
95 |         return np.nan  # `np.NAN` alias was removed in NumPy 2.0
96 | 
97 |     if not isinstance(precision, np.ndarray) or not isinstance(
98 |         recall, np.ndarray
99 |     ):
100 |         raise ValueError("precision and recall must be numpy array")
101 |     if precision.dtype != np.float64 or recall.dtype != np.float64:  # `np.float` alias removed in modern NumPy
102 |         raise ValueError("input must be float numpy array.")
103 |     if len(precision) != len(recall):
104 |         raise ValueError("precision and recall must be of the same size.")
105 |     if not precision.size:
106 |         return 0.0
107 |     if np.amin(precision) < 0 or np.amax(precision) > 1:
108 |         raise ValueError("Precision must be in the range of [0, 1].")
109 |     if np.amin(recall) < 0 or np.amax(recall) > 1:
110 |         raise ValueError("recall must be in the range of [0, 1].")
111 |     if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)):
112 |         raise ValueError("recall must be a non-decreasing array")
113 | 
114 |     recall = np.concatenate([[0], recall, [1]])
115 |     precision = np.concatenate([[0], precision, [0]])
116 | 
117 |     # Preprocess precision to be a non-decreasing array
118 |     for i in range(len(precision) - 2, -1, -1):
119 |         precision[i] = np.maximum(precision[i], precision[i + 1])
120 | 
121 |     indices = np.where(recall[1:] != recall[:-1])[0] + 1
122 |     average_precision = np.sum(
123 |         (recall[indices] - recall[indices - 1]) * precision[indices]
124 |     )
125 |     return average_precision
126 | 
127 | 
128 | def compute_cor_loc(
129 |     num_gt_imgs_per_class, num_images_correctly_detected_per_class
130 | ):
131 |     """Compute CorLoc according to the definition in the following paper.
132 | 
133 |     https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf
134 | 
135 |     Returns nans if there are no ground truth images for a class.
136 | 
137 |     Args:
138 |         num_gt_imgs_per_class: 1D array, representing number of images containing
139 |             at least one object instance of a particular class
140 |         num_images_correctly_detected_per_class: 1D array, representing number of
141 |             images in which at least one object instance of a particular class
142 |             was correctly detected
143 | 
144 |     Returns:
145 |         corloc_per_class: A float numpy array representing the CorLoc score of
146 |             each class
147 |     """
148 |     # Divide by zero expected for classes with no gt examples.
149 |     with np.errstate(divide="ignore", invalid="ignore"):
150 |         return np.where(
151 |             num_gt_imgs_per_class == 0,
152 |             np.nan,
153 |             num_images_correctly_detected_per_class / num_gt_imgs_per_class,
154 |         )
155 | 
--------------------------------------------------------------------------------
/slowfast/models/backbones/x3d.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import torch.nn as nn
4 | 
5 | import slowfast.utils.weight_init_helper as init_helper
6 | from slowfast.models.batchnorm_helper import get_norm
7 | 
8 | from .. import head_helper, resnet_helper, stem_helper
9 | from ..build import MODEL_REGISTRY
10 | from . import _MODEL_STAGE_DEPTH, _TEMPORAL_KERNEL_BASIS, _POOL1
11 | 
12 | 
13 | @MODEL_REGISTRY.register()
14 | class X3D(nn.Module):
15 |     """
16 |     X3D model builder. It builds a X3D network backbone, which is a ResNet.
17 |     Christoph Feichtenhofer.
18 |     "X3D: Expanding Architectures for Efficient Video Recognition."
19 |     https://arxiv.org/abs/2004.04730
20 |     """
21 | 
22 |     def __init__(self, cfg):
23 |         """
24 |         The `__init__` method of any subclass should also contain these
25 |         arguments.
26 | Args: 27 | cfg (CfgNode): model building configs, details are in the 28 | comments of the config file. 29 | """ 30 | super(X3D, self).__init__() 31 | self.norm_module = get_norm(cfg) 32 | self.enable_detection = cfg.DETECTION.ENABLE 33 | self.num_pathways = 1 34 | 35 | exp_stage = 2.0 36 | self.dim_c1 = cfg.X3D.DIM_C1 37 | 38 | self.dim_res2 = ( 39 | self._round_width(self.dim_c1, exp_stage, divisor=8) 40 | if cfg.X3D.SCALE_RES2 41 | else self.dim_c1 42 | ) 43 | self.dim_res3 = self._round_width(self.dim_res2, exp_stage, divisor=8) 44 | self.dim_res4 = self._round_width(self.dim_res3, exp_stage, divisor=8) 45 | self.dim_res5 = self._round_width(self.dim_res4, exp_stage, divisor=8) 46 | 47 | self.block_basis = [ 48 | # blocks, c, stride 49 | [1, self.dim_res2, 2], 50 | [2, self.dim_res3, 2], 51 | [5, self.dim_res4, 2], 52 | [3, self.dim_res5, 2], 53 | ] 54 | self._construct_network(cfg) 55 | init_helper.init_weights( 56 | self, cfg.MODEL.FC_INIT_STD, cfg.RESNET.ZERO_INIT_FINAL_BN 57 | ) 58 | 59 | def _round_width(self, width, multiplier, min_depth=8, divisor=8): 60 | """Round width of filters based on width multiplier.""" 61 | if not multiplier: 62 | return width 63 | 64 | width *= multiplier 65 | min_depth = min_depth or divisor 66 | new_filters = max( 67 | min_depth, int(width + divisor / 2) // divisor * divisor 68 | ) 69 | if new_filters < 0.9 * width: 70 | new_filters += divisor 71 | return int(new_filters) 72 | 73 | def _round_repeats(self, repeats, multiplier): 74 | """Round number of layers based on depth multiplier.""" 75 | multiplier = multiplier 76 | if not multiplier: 77 | return repeats 78 | return int(math.ceil(multiplier * repeats)) 79 | 80 | def _construct_network(self, cfg): 81 | """ 82 | Builds a single pathway X3D model. 83 | Args: 84 | cfg (CfgNode): model building configs, details are in the 85 | comments of the config file. 
86 | """ 87 | assert cfg.MODEL.ARCH in _POOL1.keys() 88 | assert cfg.RESNET.DEPTH in _MODEL_STAGE_DEPTH.keys() 89 | 90 | (d2, d3, d4, d5) = _MODEL_STAGE_DEPTH[cfg.RESNET.DEPTH] 91 | 92 | num_groups = cfg.RESNET.NUM_GROUPS 93 | width_per_group = cfg.RESNET.WIDTH_PER_GROUP 94 | dim_inner = num_groups * width_per_group 95 | 96 | w_mul = cfg.X3D.WIDTH_FACTOR 97 | d_mul = cfg.X3D.DEPTH_FACTOR 98 | dim_res1 = self._round_width(self.dim_c1, w_mul) 99 | 100 | temp_kernel = _TEMPORAL_KERNEL_BASIS[cfg.MODEL.ARCH] 101 | 102 | self.s1 = stem_helper.VideoModelStem( 103 | cfg=cfg, 104 | dim_in=cfg.DATA.INPUT_CHANNEL_NUM, 105 | dim_out=[dim_res1], 106 | kernel=[temp_kernel[0][0] + [3, 3]], 107 | stride=[[1, 2, 2]], 108 | padding=[[temp_kernel[0][0][0] // 2, 1, 1]], 109 | norm_module=self.norm_module, 110 | stem_func_name="x3d_stem", 111 | ) 112 | 113 | # blob_in = s1 114 | dim_in = dim_res1 115 | for stage, block in enumerate(self.block_basis): 116 | dim_out = self._round_width(block[1], w_mul) 117 | dim_inner = int(cfg.X3D.BOTTLENECK_FACTOR * dim_out) 118 | 119 | n_rep = self._round_repeats(block[0], d_mul) 120 | prefix = "s{}".format( 121 | stage + 2 122 | ) # start w res2 to follow convention 123 | 124 | s = resnet_helper.ResStage( 125 | cfg=cfg, 126 | dim_in=[dim_in], 127 | dim_out=[dim_out], 128 | dim_inner=[dim_inner], 129 | temp_kernel_sizes=temp_kernel[1], 130 | stride=[block[2]], 131 | num_blocks=[n_rep], 132 | num_groups=[dim_inner] 133 | if cfg.X3D.CHANNELWISE_3x3x3 134 | else [num_groups], 135 | num_block_temp_kernel=[n_rep], 136 | nonlocal_inds=cfg.NONLOCAL.LOCATION[0], 137 | nonlocal_group=cfg.NONLOCAL.GROUP[0], 138 | nonlocal_pool=cfg.NONLOCAL.POOL[0], 139 | nonlocal_progress=cfg.NONLOCAL.PROGRESS, 140 | nonlocal_use_bn=cfg.NONLOCAL.USE_BN, 141 | instantiation=cfg.NONLOCAL.INSTANTIATION, 142 | trans_func_name=cfg.RESNET.TRANS_FUNC, 143 | stride_1x1=cfg.RESNET.STRIDE_1X1, 144 | norm_module=self.norm_module, 145 | dilation=cfg.RESNET.SPATIAL_DILATIONS[stage], 146 | temp_progress=cfg.PGT.ENABLE, 147 | ) 148 | dim_in = dim_out 149 | self.add_module(prefix, s) 150 | 151 | if self.enable_detection: 152 | NotImplementedError 153 | else: 154 | spat_sz = int(math.ceil(cfg.DATA.TRAIN_CROP_SIZE / 32.0)) 155 | self.head = head_helper.X3DHead( 156 | cfg=cfg, 157 | dim_in=dim_out, 158 | dim_inner=dim_inner, 159 | dim_out=cfg.X3D.DIM_C5, 160 | num_classes=cfg.MODEL.NUM_CLASSES, 161 | pool_size=[cfg.DATA.NUM_FRAMES, spat_sz, spat_sz], 162 | pool_type=cfg.MODEL.FINAL_POOL, 163 | dropout_rate=cfg.MODEL.DROPOUT_RATE, 164 | act_func=cfg.MODEL.HEAD_ACT, 165 | bn_lin5_on=cfg.X3D.BN_LIN5, 166 | ) 167 | 168 | def forward(self, x, bboxes=None): 169 | for module in self.children(): 170 | x = module(x) 171 | return x -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/label_map_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/slowfast/utils/ava_evaluation/label_map_util.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Label map utility functions."""
16 | 
17 | from __future__ import (
18 |     absolute_import,
19 |     division,
20 |     print_function,
21 |     unicode_literals,
22 | )
23 | import logging
24 | 
25 | # from google.protobuf import text_format
26 | # from google3.third_party.tensorflow_models.object_detection.protos import string_int_label_map_pb2
# NOTE: the commented imports above are required by load_labelmap() and
# get_label_map_dict() further below; restore them (with paths valid for your
# environment) before calling those functions, or they will raise NameError.
27 | 
28 | 
29 | def _validate_label_map(label_map):
30 |     """Checks if a label map is valid.
31 | 
32 |     Args:
33 |         label_map: StringIntLabelMap to validate.
34 | 
35 |     Raises:
36 |         ValueError: if label map is invalid.
37 |     """
38 |     for item in label_map.item:
39 |         if item.id < 1:
40 |             raise ValueError("Label map ids should be >= 1.")
41 | 
42 | 
43 | def create_category_index(categories):
44 |     """Creates dictionary of COCO compatible categories keyed by category id.
45 | 
46 |     Args:
47 |         categories: a list of dicts, each of which has the following keys:
48 |             'id': (required) an integer id uniquely identifying this category.
49 |             'name': (required) string representing category name
50 |                 e.g., 'cat', 'dog', 'pizza'.
51 | 
52 |     Returns:
53 |         category_index: a dict containing the same entries as categories, but keyed
54 |             by the 'id' field of each category.
55 |     """
56 |     category_index = {}
57 |     for cat in categories:
58 |         category_index[cat["id"]] = cat
59 |     return category_index
60 | 
61 | 
62 | def get_max_label_map_index(label_map):
63 |     """Get maximum index in label map.
64 | 
65 |     Args:
66 |         label_map: a StringIntLabelMapProto
67 | 
68 |     Returns:
69 |         an integer
70 |     """
71 |     return max([item.id for item in label_map.item])
72 | 
73 | 
74 | def convert_label_map_to_categories(
75 |     label_map, max_num_classes, use_display_name=True
76 | ):
77 |     """Loads label map proto and returns categories list compatible with eval.
78 | 
79 |     This function loads a label map and returns a list of dicts, each of which
80 |     has the following keys:
81 |         'id': (required) an integer id uniquely identifying this category.
82 |         'name': (required) string representing category name
83 |             e.g., 'cat', 'dog', 'pizza'.
84 |     A class is only included in the list if its id minus label_id_offset is
85 |     between 0 (inclusive) and max_num_classes (exclusive).
86 |     If there are several items mapping to the same id in the label map,
87 |     we will only keep the first one in the categories list.
88 | 
89 |     Args:
90 |         label_map: a StringIntLabelMapProto or None. If None, a default categories
91 |             list is created with max_num_classes categories.
92 |         max_num_classes: maximum number of (consecutive) label indices to include.
93 |         use_display_name: (boolean) choose whether to load 'display_name' field
94 |             as category name. If False or if the display_name field does not exist,
95 |             uses 'name' field as category names instead.
96 |     Returns:
97 |         categories: a list of dictionaries representing all possible categories.
98 | """ 99 | categories = [] 100 | list_of_ids_already_added = [] 101 | if not label_map: 102 | label_id_offset = 1 103 | for class_id in range(max_num_classes): 104 | categories.append( 105 | { 106 | "id": class_id + label_id_offset, 107 | "name": "category_{}".format(class_id + label_id_offset), 108 | } 109 | ) 110 | return categories 111 | for item in label_map.item: 112 | if not 0 < item.id <= max_num_classes: 113 | logging.info( 114 | "Ignore item %d since it falls outside of requested " 115 | "label range.", 116 | item.id, 117 | ) 118 | continue 119 | if use_display_name and item.HasField("display_name"): 120 | name = item.display_name 121 | else: 122 | name = item.name 123 | if item.id not in list_of_ids_already_added: 124 | list_of_ids_already_added.append(item.id) 125 | categories.append({"id": item.id, "name": name}) 126 | return categories 127 | 128 | 129 | def load_labelmap(path): 130 | """Loads label map proto. 131 | 132 | Args: 133 | path: path to StringIntLabelMap proto text file. 134 | Returns: 135 | a StringIntLabelMapProto 136 | """ 137 | with open(path, "r") as fid: 138 | label_map_string = fid.read() 139 | label_map = string_int_label_map_pb2.StringIntLabelMap() 140 | try: 141 | text_format.Merge(label_map_string, label_map) 142 | except text_format.ParseError: 143 | label_map.ParseFromString(label_map_string) 144 | _validate_label_map(label_map) 145 | return label_map 146 | 147 | 148 | def get_label_map_dict(label_map_path, use_display_name=False): 149 | """Reads a label map and returns a dictionary of label names to id. 150 | 151 | Args: 152 | label_map_path: path to label_map. 153 | use_display_name: whether to use the label map items' display names as keys. 154 | 155 | Returns: 156 | A dictionary mapping label names to id. 157 | """ 158 | label_map = load_labelmap(label_map_path) 159 | label_map_dict = {} 160 | for item in label_map.item: 161 | if use_display_name: 162 | label_map_dict[item.display_name] = item.id 163 | else: 164 | label_map_dict[item.name] = item.id 165 | return label_map_dict 166 | 167 | 168 | def create_category_index_from_labelmap(label_map_path): 169 | """Reads a label map and returns a category index. 170 | 171 | Args: 172 | label_map_path: Path to `StringIntLabelMap` proto text file. 173 | 174 | Returns: 175 | A category index, which is a dictionary that maps integer ids to dicts 176 | containing categories, e.g. 177 | {1: {'id': 1, 'name': 'dog'}, 2: {'id': 2, 'name': 'cat'}, ...} 178 | """ 179 | label_map = load_labelmap(label_map_path) 180 | max_num_classes = max(item.id for item in label_map.item) 181 | categories = convert_label_map_to_categories(label_map, max_num_classes) 182 | return create_category_index(categories) 183 | 184 | 185 | def create_class_agnostic_category_index(): 186 | """Creates a category index with a single `object` class.""" 187 | return {1: {"id": 1, "name": "object"}} 188 | -------------------------------------------------------------------------------- /tools/test_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
3 | 
4 | """Multi-view test a video classification model."""
5 | 
6 | import numpy as np
7 | import torch
8 | from tqdm import tqdm
9 | 
10 | import slowfast.utils.checkpoint as cu
11 | import slowfast.utils.distributed as du
12 | import slowfast.utils.logging as logging
13 | import slowfast.utils.misc as misc
14 | from slowfast.datasets import loader
15 | from slowfast.models import build_model
16 | from slowfast.utils.meters import AVAMeter, TestMeter
17 | from slowfast.models.progress_helper import ProgressTrainer
# Editor's note: `tb` is referenced in test() below (the cfg.TENSORBOARD.ENABLE
# path) but was never imported. The module path below is assumed from upstream
# SlowFast; it is guarded so testing still works when the module is absent and
# TENSORBOARD is disabled.
try:
    import slowfast.visualization.tensorboard_vis as tb
except ImportError:
    tb = None
18 | 
19 | logger = logging.get_logger(__name__)
20 | 
21 | 
22 | @torch.no_grad()
23 | def perform_test(test_loader, model, test_meter, cfg, writer=None, cur_epoch=None):
24 |     """
25 |     For classification:
26 |     Perform multi-view testing that uniformly samples N clips from a video along
27 |     its temporal axis. For each clip, it takes 3 crops to cover the spatial
28 |     dimension, followed by averaging the softmax scores across all Nx3 views to
29 |     form a video-level prediction. All video predictions are compared to
30 |     ground-truth labels and the final testing performance is logged.
31 |     For detection:
32 |     Perform fully-convolutional testing on the full frames without crop.
33 |     Args:
34 |         test_loader (loader): video testing loader.
35 |         model (model): the pretrained video model to test.
36 |         test_meter (TestMeter): testing meters to log and ensemble the testing
37 |             results.
38 |         cfg (CfgNode): configs. Details can be found in
39 |             slowfast/config/defaults.py
40 |         writer (TensorboardWriter object, optional): TensorboardWriter object
41 |             to write Tensorboard logs.
        cur_epoch (int, optional): current epoch; used by progressive (PGT)
            evaluation and metric finalization.
42 |     """
43 |     # Enable eval mode.
44 |     model.eval()
45 |     test_meter.iter_tic()
46 | 
47 |     if cfg.PGT.ENABLE:
48 |         pg_trainer = ProgressTrainer(model, cfg, cur_epoch)
49 | 
50 |     if du.get_world_size() == 1:
51 |         extra_args = {}
52 |     else:
53 |         rank = du.get_rank()
54 |         extra_args = dict(desc="rank {}".format(rank))
55 | 
56 |     for _, (inputs, labels, video_idx, meta) in enumerate(tqdm(test_loader, **extra_args)):
57 |         # Transfer the data to the current GPU device.
58 |         if isinstance(inputs, (list,)):
59 |             for i in range(len(inputs)):
60 |                 inputs[i] = inputs[i].cuda(non_blocking=True)
61 |         else:
62 |             inputs = inputs.cuda(non_blocking=True)
63 | 
64 |         # Transfer the labels and metadata to the current GPU device.
65 |         labels = labels.cuda()
66 |         video_idx = video_idx.cuda()
67 |         for key, val in meta.items():
68 |             if isinstance(val, (list,)):
69 |                 for i in range(len(val)):
70 |                     val[i] = val[i].cuda(non_blocking=True)
71 |             else:
72 |                 meta[key] = val.cuda(non_blocking=True)
73 | 
74 |         if cfg.DETECTION.ENABLE:
75 |             # Compute the predictions.
76 | if not cfg.PGT.ENABLE: 77 | preds = model(inputs, meta["boxes"]) 78 | else: 79 | # Take the meta of last step 80 | if not cfg.PGT.ALL_STEP_TEST: 81 | step_idx = pg_trainer.steps - 1 82 | meta["boxes"] = meta["boxes"][meta["step_idxes"] == step_idx] 83 | meta["ori_boxes"] = meta["ori_boxes"][meta["step_idxes"] == step_idx] 84 | meta["metadata"] = meta["metadata"][meta["step_idxes"] == step_idx] 85 | preds = pg_trainer.step_eval(inputs, meta["boxes"]) 86 | else: 87 | preds = pg_trainer.step_eval(inputs, meta["boxes"], meta["step_idxes"]) 88 | preds = preds.cpu() 89 | ori_boxes = meta["ori_boxes"].cpu() 90 | metadata = meta["metadata"].cpu() 91 | 92 | if cfg.NUM_GPUS > 1: 93 | preds = torch.cat(du.all_gather_unaligned(preds), dim=0) 94 | ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes), dim=0) 95 | metadata = torch.cat(du.all_gather_unaligned(metadata), dim=0) 96 | 97 | test_meter.iter_toc() 98 | # Update and log stats. 99 | test_meter.update_stats( 100 | preds.detach().cpu(), 101 | ori_boxes.detach().cpu(), 102 | metadata.detach().cpu(), 103 | ) 104 | else: 105 | # Perform the forward pass. 106 | if not cfg.PGT.ENABLE: 107 | preds = model(inputs) 108 | else: 109 | preds = pg_trainer.step_eval(inputs) 110 | 111 | # Gather all the predictions across all the devices to perform ensemble. 112 | if cfg.NUM_GPUS > 1: 113 | preds, labels, video_idx = du.all_gather( 114 | [preds, labels, video_idx] 115 | ) 116 | 117 | test_meter.iter_toc() 118 | # Update and log stats. 119 | test_meter.update_stats( 120 | preds.detach().cpu(), 121 | labels.detach().cpu(), 122 | video_idx.detach().cpu(), 123 | ) 124 | 125 | test_meter.iter_tic() 126 | # Log epoch stats and print the final testing results. 127 | if writer is not None: 128 | all_preds_cpu = [ 129 | pred.clone().detach().cpu() for pred in test_meter.video_preds 130 | ] 131 | all_labels_cpu = [ 132 | label.clone().detach().cpu() for label in test_meter.video_labels 133 | ] 134 | writer.plot_eval(preds=all_preds_cpu, labels=all_labels_cpu) 135 | 136 | test_meter.finalize_metrics(cur_epoch=cur_epoch) 137 | test_meter.reset() 138 | 139 | 140 | def test(cfg): 141 | """ 142 | Perform multi-view testing on the pretrained video model. 143 | Args: 144 | cfg (CfgNode): configs. Details can be found in 145 | slowfast/config/defaults.py 146 | """ 147 | # Set up environment. 148 | filename = "test" if cfg.LOGS.FILE_NAME == "" else cfg.LOGS.FILE_NAME 149 | logging.setup_logger(cfg, filename) 150 | du.init_distributed_training(cfg) 151 | # Set random seed from configs. 152 | np.random.seed(cfg.RNG_SEED) 153 | torch.manual_seed(cfg.RNG_SEED) 154 | 155 | # Print config. 156 | logger.info("Test with config:") 157 | logger.info(cfg) 158 | 159 | # Build the video model and print model statistics. 160 | model = build_model(cfg) 161 | if du.is_master_proc() and cfg.LOGS.LOG_MODEL: 162 | misc.log_model_info(model, cfg, use_train_input=False) 163 | 164 | cu.load_test_checkpoint(cfg, model) 165 | 166 | # Create video testing loaders. 
167 | test_loader = loader.construct_loader(cfg, "test") 168 | logger.info("Testing model for {} iterations".format(len(test_loader))) 169 | logger.info("Testing contains {} videos".format(len(test_loader.dataset))) 170 | 171 | if cfg.DETECTION.ENABLE: 172 | assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE 173 | test_meter = AVAMeter(len(test_loader), cfg, mode="test") 174 | else: 175 | assert ( 176 | len(test_loader.dataset) 177 | % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS) 178 | == 0 179 | ) 180 | # Create meters for multi-view testing. 181 | test_meter = TestMeter( 182 | len(test_loader.dataset) 183 | // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS), 184 | cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS, 185 | cfg.MODEL.NUM_CLASSES, 186 | len(test_loader), 187 | cfg.DATA.MULTI_LABEL, 188 | cfg.DATA.ENSEMBLE_METHOD, 189 | ) 190 | 191 | # Set up writer for logging to Tensorboard format. 192 | if cfg.TENSORBOARD.ENABLE and du.is_master_proc( 193 | cfg.NUM_GPUS * cfg.NUM_SHARDS 194 | ): 195 | writer = tb.TensorboardWriter(cfg) 196 | else: 197 | writer = None 198 | 199 | # # Perform multi-view test on the entire dataset. 200 | perform_test(test_loader, model, test_meter, cfg, writer) 201 | if writer is not None: 202 | writer.close() 203 | 204 | logger.info(f"Testing completed. Log directory: {cfg.LOGS.DIR}") 205 | -------------------------------------------------------------------------------- /slowfast/models/nonlocal_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Non-local helper""" 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class Nonlocal(nn.Module): 11 | """ 12 | Builds Non-local Neural Networks as a generic family of building 13 | blocks for capturing long-range dependencies. Non-local Network 14 | computes the response at a position as a weighted sum of the 15 | features at all positions. This building block can be plugged into 16 | many computer vision architectures. 17 | More details in the paper: https://arxiv.org/pdf/1711.07971.pdf 18 | """ 19 | 20 | def __init__( 21 | self, 22 | dim, 23 | dim_inner, 24 | pool_size=None, 25 | instantiation="softmax", 26 | zero_init_final_conv=False, 27 | zero_init_final_norm=True, 28 | use_bn=True, 29 | norm_eps=1e-5, 30 | norm_momentum=0.1, 31 | norm_module=nn.BatchNorm3d, 32 | ): 33 | """ 34 | Args: 35 | dim (int): number of dimension for the input. 36 | dim_inner (int): number of dimension inside of the Non-local block. 37 | pool_size (list): the kernel size of spatial temporal pooling, 38 | temporal pool kernel size, spatial pool kernel size, spatial 39 | pool kernel size in order. By default pool_size is None, 40 | then there would be no pooling used. 41 | instantiation (string): supports two different instantiation method: 42 | "dot_product": normalizing correlation matrix with L2. 43 | "softmax": normalizing correlation matrix with Softmax. 44 | zero_init_final_conv (bool): If true, zero initializing the final 45 | convolution of the Non-local block. 46 | zero_init_final_norm (bool): 47 | If true, zero initializing the final batch norm of the Non-local 48 | block. 49 | norm_module (nn.Module): nn.Module for the normalization layer. The 50 | default is nn.BatchNorm3d. 
51 | """ 52 | super(Nonlocal, self).__init__() 53 | self.dim = dim 54 | self.dim_inner = dim_inner 55 | self.pool_size = pool_size 56 | self.instantiation = instantiation 57 | self.use_pool = ( 58 | False 59 | if pool_size is None 60 | else any((size > 1 for size in pool_size)) 61 | ) 62 | self.use_bn = use_bn 63 | self.norm_eps = norm_eps 64 | self.norm_momentum = norm_momentum 65 | self._construct_nonlocal( 66 | zero_init_final_conv, zero_init_final_norm, norm_module 67 | ) 68 | 69 | def _construct_nonlocal( 70 | self, zero_init_final_conv, zero_init_final_norm, norm_module 71 | ): 72 | # Three convolution heads: theta, phi, and g. 73 | self.conv_theta = nn.Conv3d( 74 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 75 | ) 76 | self.conv_phi = nn.Conv3d( 77 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 78 | ) 79 | self.conv_g = nn.Conv3d( 80 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 81 | ) 82 | 83 | # Final convolution output. 84 | self.conv_out = nn.Conv3d( 85 | self.dim_inner, self.dim, kernel_size=1, stride=1, padding=0 86 | ) 87 | # Zero initializing the final convolution output. 88 | self.conv_out.zero_init = zero_init_final_conv 89 | 90 | # TODO: change the name to `norm` 91 | if self.use_bn: 92 | self.bn = norm_module( 93 | num_features=self.dim, 94 | eps=self.norm_eps, 95 | momentum=self.norm_momentum, 96 | ) 97 | # Zero initializing the final bn. 98 | self.bn.transform_final_bn = zero_init_final_norm 99 | 100 | # Optional to add the spatial-temporal pooling. 101 | if self.use_pool: 102 | self.pool = nn.MaxPool3d( 103 | kernel_size=self.pool_size, 104 | stride=self.pool_size, 105 | padding=[0, 0, 0], 106 | ) 107 | 108 | def forward(self, x): 109 | x_identity = x 110 | N, C, T, H, W = x.size() 111 | 112 | theta = self.conv_theta(x) 113 | 114 | # Perform temporal-spatial pooling to reduce the computation. 115 | if self.use_pool: 116 | x = self.pool(x) 117 | 118 | phi = self.conv_phi(x) 119 | g = self.conv_g(x) 120 | 121 | theta = theta.view(N, self.dim_inner, -1) 122 | phi = phi.view(N, self.dim_inner, -1) 123 | g = g.view(N, self.dim_inner, -1) 124 | 125 | # (N, C, TxHxW) * (N, C, TxHxW) => (N, TxHxW, TxHxW). 126 | theta_phi = torch.einsum("nct,ncp->ntp", (theta, phi)) 127 | # For original Non-local paper, there are two main ways to normalize 128 | # the affinity tensor: 129 | # 1) Softmax normalization (norm on exp). 130 | # 2) dot_product normalization. 131 | if self.instantiation == "softmax": 132 | # Normalizing the affinity tensor theta_phi before softmax. 133 | theta_phi = theta_phi * (self.dim_inner ** -0.5) 134 | theta_phi = nn.functional.softmax(theta_phi, dim=2) 135 | elif self.instantiation == "dot_product": 136 | spatial_temporal_dim = theta_phi.shape[2] 137 | theta_phi = theta_phi / spatial_temporal_dim 138 | else: 139 | raise NotImplementedError( 140 | "Unknown norm type {}".format(self.instantiation) 141 | ) 142 | 143 | # (N, TxHxW, TxHxW) * (N, C, TxHxW) => (N, C, TxHxW). 144 | theta_phi_g = torch.einsum("ntg,ncg->nct", (theta_phi, g)) 145 | 146 | # (N, C, TxHxW) => (N, C, T, H, W). 
147 |         theta_phi_g = theta_phi_g.view(N, self.dim_inner, T, H, W)
148 | 
149 |         p = self.conv_out(theta_phi_g)
150 |         if self.use_bn:
151 |             p = self.bn(p)
152 |         return x_identity + p
153 | 
154 | 
155 | class Featurebank(Nonlocal):
156 |     """Feature Bank Operator"""
157 |     def __init__(
158 |         self,
159 |         dim,
160 |         dim_inner,
161 |         pool_size=None,
162 |     ):
163 |         super(Featurebank, self).__init__(
164 |             dim,
165 |             dim_inner,
166 |             pool_size,
167 |             instantiation="softmax",
168 |             zero_init_final_conv=True,
169 |             use_bn=True,
170 |         )
171 | 
172 |     def _construct_nonlocal(
173 |         self, zero_init_final_conv, zero_init_final_norm, norm_module
174 |     ):
175 |         # The override must accept the same arguments as the parent, which
176 |         # calls this method from Nonlocal.__init__; the original signature
177 |         # took no arguments and referenced undefined names.
178 |         super(Featurebank, self)._construct_nonlocal(
179 |             zero_init_final_conv,
180 |             zero_init_final_norm,
181 |             norm_module,
182 |         )
183 |         # GroupNorm with group = 1 is equivalent to LayerNorm.
184 |         # Set affine to False to match with caffe2.
185 |         self.bn = nn.GroupNorm(1, self.dim, eps=self.norm_eps, affine=False)
186 | 
187 |     def forward(self, x):
188 |         x_identity = x
189 |         N, C, T, H, W = x.size()
190 | 
191 |         theta = self.conv_theta(x)
192 | 
193 |         # Perform temporal-spatial pooling to reduce the computation.
194 |         # Pool the input for phi/g when pooling is enabled; otherwise use x
195 |         # directly (this resolves the original FIXME about `y` being undefined).
196 |         y = self.pool(x) if self.use_pool else x
197 | 
198 |         phi = self.conv_phi(y)
199 |         g = self.conv_g(y)
200 | 
201 |         theta = theta.view(N, self.dim_inner, -1)
202 |         phi = phi.view(N, self.dim_inner, -1)
203 |         g = g.view(N, self.dim_inner, -1)
204 | 
205 |         # (N, C, TxHxW) * (N, C, TxHxW) => (N, TxHxW, TxHxW).
206 |         theta_phi = torch.einsum("nct,ncp->ntp", (theta, phi))
207 | 
208 |         # Normalizing the affinity tensor theta_phi before softmax.
209 |         theta_phi = theta_phi * (self.dim_inner ** -0.5)
210 |         theta_phi = nn.functional.softmax(theta_phi, dim=2)
211 | 
212 |         # (N, TxHxW, TxHxW) * (N, C, TxHxW) => (N, C, TxHxW).
213 |         theta_phi_g = torch.einsum("ntg,ncg->nct", (theta_phi, g))
214 | 
215 |         # (N, C, TxHxW) => (N, C, T, H, W).
216 |         theta_phi_g = theta_phi_g.view(N, self.dim_inner, T, H, W)
217 | 
218 |         p = self.conv_out(theta_phi_g)
219 |         p = self.bn(p)
220 |         return x_identity + p
--------------------------------------------------------------------------------
/slowfast/datasets/ava_helper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 | 
4 | import os
5 | from collections import defaultdict
6 | from fvcore.common.file_io import PathManager
7 | 
8 | import slowfast.utils.logging as logging
9 | from slowfast.utils.setup_moxing_env import wrap_input_path2
10 | 
11 | logger = logging.get_logger(__name__)
12 | 
13 | FPS = 30
14 | AVA_VALID_FRAMES = range(902, 1799)
15 | 
16 | 
17 | def load_image_lists(cfg, is_train):
18 |     """
19 |     Loading image paths from corresponding files.
20 | 
21 |     Args:
22 |         cfg (CfgNode): config.
23 |         is_train (bool): if it is training dataset or not.
24 | 
25 |     Returns:
26 |         image_paths (list[list]): a list of items. Each item (also a list)
27 |             corresponds to one video and contains the paths of images for
28 |             this video.
29 |         video_idx_to_name (list): a list which stores video names.
30 | """ 31 | list_filenames = [ 32 | os.path.join(cfg.AVA.FRAME_LIST_DIR, filename) 33 | for filename in ( 34 | cfg.AVA.TRAIN_LISTS if is_train else cfg.AVA.TEST_LISTS 35 | ) 36 | ] 37 | image_paths = defaultdict(list) 38 | video_name_to_idx = {} 39 | video_idx_to_name = [] 40 | for list_filename in list_filenames: 41 | list_filename = wrap_input_path2(list_filename) 42 | with open(list_filename, "r") as f: 43 | f.readline() 44 | for i, line in enumerate(f): 45 | if cfg.DEBUG and i > 10000: 46 | break 47 | row = line.split() 48 | # The format of each row should follow: 49 | # original_vido_id video_id frame_id path labels. 50 | assert len(row) == 5 51 | video_name = row[0] 52 | 53 | if video_name not in video_name_to_idx: 54 | idx = len(video_name_to_idx) 55 | video_name_to_idx[video_name] = idx 56 | video_idx_to_name.append(video_name) 57 | 58 | data_key = video_name_to_idx[video_name] 59 | 60 | image_paths[data_key].append( 61 | os.path.join(cfg.AVA.FRAME_DIR, row[3]) 62 | ) 63 | 64 | image_paths = [image_paths[i] for i in range(len(image_paths))] 65 | 66 | logger.info( 67 | "Finished loading image paths from: %s" % ", ".join(list_filenames) 68 | ) 69 | 70 | return image_paths, video_idx_to_name 71 | 72 | 73 | def load_boxes_and_labels(cfg, mode): 74 | """ 75 | Loading boxes and labels from csv files. 76 | 77 | Args: 78 | cfg (CfgNode): config. 79 | mode (str): 'train', 'val', or 'test' mode. 80 | Returns: 81 | all_boxes (dict): a dict which maps from `video_name` and 82 | `frame_sec` to a list of `box`. Each `box` is a 83 | [`box_coord`, `box_labels`] where `box_coord` is the 84 | coordinates of box and 'box_labels` are the corresponding 85 | labels for the box. 86 | """ 87 | gt_lists = cfg.AVA.TRAIN_GT_BOX_LISTS if mode == "train" else [] 88 | pred_lists = ( 89 | cfg.AVA.TRAIN_PREDICT_BOX_LISTS 90 | if mode == "train" 91 | else cfg.AVA.TEST_PREDICT_BOX_LISTS 92 | ) 93 | ann_filenames = [ 94 | os.path.join(cfg.AVA.ANNOTATION_DIR, filename) 95 | for filename in gt_lists + pred_lists 96 | ] 97 | ann_is_gt_box = [True] * len(gt_lists) + [False] * len(pred_lists) 98 | 99 | detect_thresh = cfg.AVA.DETECTION_SCORE_THRESH 100 | all_boxes = {} 101 | count = 0 102 | unique_box_count = 0 103 | for filename, is_gt_box in zip(ann_filenames, ann_is_gt_box): 104 | filename = wrap_input_path2(filename) 105 | with open(filename, "r") as f: 106 | for i, line in enumerate(f): 107 | if cfg.DEBUG and i > 10000: 108 | break 109 | row = line.strip().split(",") 110 | # When we use predicted boxes to train/eval, we need to 111 | # ignore the boxes whose scores are below the threshold. 112 | if not is_gt_box: 113 | score = float(row[7]) 114 | if score < detect_thresh: 115 | continue 116 | 117 | video_name, frame_sec = row[0], int(row[1]) 118 | 119 | # Only select frame_sec % 4 = 0 samples for validation if not 120 | # set FULL_TEST_ON_VAL. 121 | if ( 122 | mode == "val" 123 | and not cfg.AVA.FULL_TEST_ON_VAL 124 | and frame_sec % 4 != 0 125 | ): 126 | continue 127 | 128 | # Box with format [x1, y1, x2, y2] with a range of [0, 1] as float. 
129 | box_key = ",".join(row[2:6]) 130 | box = list(map(float, row[2:6])) 131 | label = -1 if row[6] == "" else int(row[6]) 132 | 133 | if video_name not in all_boxes: 134 | all_boxes[video_name] = {} 135 | for sec in AVA_VALID_FRAMES: 136 | all_boxes[video_name][sec] = {} 137 | 138 | if box_key not in all_boxes[video_name][frame_sec]: 139 | all_boxes[video_name][frame_sec][box_key] = [box, []] 140 | unique_box_count += 1 141 | 142 | all_boxes[video_name][frame_sec][box_key][1].append(label) 143 | if label != -1: 144 | count += 1 145 | 146 | for video_name in all_boxes.keys(): 147 | for frame_sec in all_boxes[video_name].keys(): 148 | # Save in format of a list of [box_i, box_i_labels]. 149 | all_boxes[video_name][frame_sec] = list( 150 | all_boxes[video_name][frame_sec].values() 151 | ) 152 | 153 | logger.info( 154 | "Finished loading annotations from: %s" % ", ".join(ann_filenames) 155 | ) 156 | logger.info("Detection threshold: {}".format(detect_thresh)) 157 | logger.info("Number of unique boxes: %d" % unique_box_count) 158 | logger.info("Number of annotations: %d" % count) 159 | 160 | return all_boxes 161 | 162 | 163 | def get_keyframe_data(boxes_and_labels): 164 | """ 165 | Getting keyframe indices, boxes and labels in the dataset. 166 | 167 | Args: 168 | boxes_and_labels (list[dict]): a list which maps from video_idx to a dict. 169 | Each dict `frame_sec` to a list of boxes and corresponding labels. 170 | 171 | Returns: 172 | keyframe_indices (list): a list of indices of the keyframes. 173 | keyframe_boxes_and_labels (list[list[list]]): a list of list which maps from 174 | video_idx and sec_idx to a list of boxes and corresponding labels. 175 | """ 176 | 177 | def sec_to_frame(sec): 178 | """ 179 | Convert time index (in second) to frame index. 180 | 0: 900 181 | 30: 901 182 | """ 183 | return (sec - 900) * FPS 184 | 185 | keyframe_indices = [] 186 | keyframe_boxes_and_labels = [] 187 | count = 0 188 | for video_idx in range(len(boxes_and_labels)): 189 | sec_idx = 0 190 | keyframe_boxes_and_labels.append([]) 191 | for sec in boxes_and_labels[video_idx].keys(): 192 | if sec not in AVA_VALID_FRAMES: 193 | continue 194 | 195 | if len(boxes_and_labels[video_idx][sec]) > 0: 196 | keyframe_indices.append( 197 | (video_idx, sec_idx, sec, sec_to_frame(sec)) 198 | ) 199 | keyframe_boxes_and_labels[video_idx].append( 200 | boxes_and_labels[video_idx][sec] 201 | ) 202 | sec_idx += 1 203 | count += 1 204 | logger.info("%d keyframes used." % count) 205 | 206 | return keyframe_indices, keyframe_boxes_and_labels 207 | 208 | 209 | def get_num_boxes_used(keyframe_indices, keyframe_boxes_and_labels): 210 | """ 211 | Get total number of used boxes. 212 | 213 | Args: 214 | keyframe_indices (list): a list of indices of the keyframes. 215 | keyframe_boxes_and_labels (list[list[list]]): a list of list which maps from 216 | video_idx and sec_idx to a list of boxes and corresponding labels. 217 | 218 | Returns: 219 | count (int): total number of used boxes. 
220 | """ 221 | 222 | count = 0 223 | for video_idx, sec_idx, _, _ in keyframe_indices: 224 | count += len(keyframe_boxes_and_labels[video_idx][sec_idx]) 225 | return count 226 | -------------------------------------------------------------------------------- /slowfast/models/backbones/regnet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | 5 | from ..build import MODEL_REGISTRY 6 | import slowfast.utils.weight_init_helper as init_helper 7 | from slowfast.models.batchnorm_helper import get_norm 8 | 9 | 10 | _CFG = { 11 | "400M": { 12 | 'd': [1, 2, 7, 12], 13 | 'wi': [32, 64, 160, 384], 14 | 'g': 16, 15 | 'b': 1, 16 | 'w0': 24 17 | }, 18 | "4G": { 19 | 'd': [2, 5, 14, 2], 20 | 'wi': [80, 240, 560, 1360], 21 | 'g': 24, 22 | 'b': 1, 23 | 'w0': 48 24 | } 25 | } 26 | 27 | 28 | def conv3x3(in_planes, out_planes, stride=1, groups=1, temporal_k=1, temporal_p=0): 29 | """3x3 convolution with padding""" 30 | return nn.Conv3d(in_planes, out_planes, kernel_size=(temporal_k, 3, 3), stride=(1, stride, stride), 31 | padding=(temporal_p, 1, 1), groups=groups, bias=False, dilation=1) 32 | 33 | 34 | def conv1x1(in_planes, out_planes, stride=1, temporal_k=1, temporal_p=0): 35 | """1x1 convolution""" 36 | return nn.Conv3d(in_planes, out_planes, kernel_size=(temporal_k, 1, 1), stride=(1, stride, stride), 37 | padding=(temporal_p, 0, 0), bias=False) 38 | 39 | 40 | class Bottleneck(nn.Module): 41 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, bottle_ratio=1, temporal_k=1, temporal_p=0): 42 | super(Bottleneck, self).__init__() 43 | norm_layer = nn.BatchNorm3d 44 | intra_plane = planes // bottle_ratio 45 | 46 | self.conv1 = conv1x1(inplanes, intra_plane, 47 | temporal_k=temporal_k, temporal_p=temporal_p) 48 | self.bn1 = norm_layer(intra_plane) 49 | self.conv2 = conv3x3(intra_plane, intra_plane, stride, groups) 50 | self.bn2 = norm_layer(intra_plane) 51 | self.conv3 = conv1x1(intra_plane, planes) 52 | self.bn3 = norm_layer(planes) 53 | self.relu = nn.ReLU(inplace=True) 54 | self.relu_final = nn.ReLU(inplace=True) 55 | self.downsample = downsample 56 | self.stride = stride 57 | 58 | def forward(self, x): 59 | identity = x 60 | 61 | out = self.conv1(x) 62 | 63 | out = self.bn1(out) 64 | out = self.relu(out) 65 | 66 | out = self.conv2(out) 67 | out = self.bn2(out) 68 | out = self.relu(out) 69 | 70 | out = self.conv3(out) 71 | out = self.bn3(out) 72 | 73 | if self.downsample is not None: 74 | identity = self.downsample(x) 75 | 76 | if identity.shape != out.shape: 77 | identity = identity[:, :, 1:-1] 78 | out += identity 79 | 80 | out = self.relu_final(out) 81 | 82 | return out 83 | 84 | 85 | @MODEL_REGISTRY.register() 86 | class RegNet(nn.Module): 87 | def __init__(self, cfg, zero_init_residual=True): 88 | super(RegNet, self).__init__() 89 | 90 | self.cfg = cfg 91 | self.model_cfg = _CFG[self.cfg.REGNET.DEPTH] 92 | self.model_cfg['sa'] = [0, 0, 0, 0] # FIXME 93 | if self.cfg.PGT.ENABLE: 94 | temporal_p = 0 95 | else: 96 | temporal_p = 1 97 | self.conv1 = conv3x3(3, self.model_cfg['w0'], stride=2) 98 | self.bn1 = nn.BatchNorm3d(self.model_cfg['w0']) 99 | self.relu = nn.ReLU(inplace=True) 100 | 101 | self.layer1 = self._make_layer( 102 | self.model_cfg['w0'], self.model_cfg['wi'][0], self.model_cfg['d'][0], self.model_cfg['sa'][0]) 103 | self.layer2 = self._make_layer( 104 | self.model_cfg['wi'][0], self.model_cfg['wi'][1], self.model_cfg['d'][1], self.model_cfg['sa'][1]) 105 | 106 | self.layer3 
= self._make_layer(self.model_cfg['wi'][1], self.model_cfg['wi'][2], self.model_cfg['d'][2], self.model_cfg['sa'][2], 107 | temporal_k=3, temporal_p=temporal_p) 108 | self.layer4 = self._make_layer(self.model_cfg['wi'][2], self.model_cfg['wi'][3], self.model_cfg['d'][3], self.model_cfg['sa'][3], 109 | temporal_k=3, temporal_p=temporal_p) 110 | 111 | # input shape of each temporal layer, [channel, spatial stride] 112 | self.padding_shape = [*[[self.model_cfg['wi'][1], 8]] * 1, 113 | *[[self.model_cfg['wi'][2], 16]] * self.model_cfg['d'][2], 114 | *[[self.model_cfg['wi'][3], 32]] * (self.model_cfg['d'][3] - 1), ] 115 | # self.selfAtt_padding_shape = [*[[self.model_cfg['wi'][0], 4]] * self.model_cfg['sa'][0], 116 | # *[[self.model_cfg['wi'][1], 8]] * self.model_cfg['sa'][1], 117 | # *[[self.model_cfg['wi'][2], 16]] * self.model_cfg['sa'][2], 118 | # *[[self.model_cfg['wi'][3], 32]] * self.model_cfg['sa'][3], 119 | # ] 120 | 121 | self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1)) 122 | if self.cfg.MODEL.DROPOUT_RATE > 0: 123 | self.dropout = nn.Dropout(self.cfg.MODEL.DROPOUT_RATE) 124 | self.fc = nn.Linear(self.model_cfg['wi'][3], 125 | self.cfg.MODEL.NUM_CLASSES, bias=True) 126 | 127 | self.act = nn.Softmax(dim=-1) 128 | 129 | for m in self.modules(): 130 | if isinstance(m, nn.Conv3d): 131 | nn.init.kaiming_normal_( 132 | m.weight, mode='fan_out', nonlinearity='relu') 133 | if m.bias is not None: 134 | nn.init.constant_(m.bias, 0) 135 | elif isinstance(m, (nn.BatchNorm3d, nn.GroupNorm)): 136 | if hasattr(m, "transform_final_bn") and m.transform_final_bn: 137 | batchnorm_weight = 0.0 138 | else: 139 | batchnorm_weight = 1.0 140 | m.weight.data.fill_(batchnorm_weight) 141 | m.bias.data.zero_() 142 | elif isinstance(m, nn.Linear): 143 | nn.init.normal_(m.weight, 0, 0.01) 144 | nn.init.constant_(m.bias, 0) 145 | 146 | # Zero-initialize the last BN in each residual branch, 147 | # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
148 |         # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
149 |         if zero_init_residual:
150 |             for m in self.modules():
151 |                 if isinstance(m, Bottleneck):
152 |                     nn.init.constant_(m.bn3.weight, 0)
153 | 
154 |     def _make_layer(self, inplanes, planes, n_blocks, n_sa, temporal_k=1, temporal_p=0):
155 | 
156 |         downsample = nn.Sequential(
157 |             conv1x1(inplanes, planes, 2),
158 |             nn.BatchNorm3d(planes),
159 |         )
160 | 
161 |         layers = []
162 |         layers.append(Bottleneck(inplanes, planes, 2, downsample, self.model_cfg['g'],
163 |                                  self.model_cfg['b'], temporal_k=temporal_k, temporal_p=temporal_p))
        # NOTE: PrgSelfAtt below is never imported in this file. With the
        # hard-coded sa = [0, 0, 0, 0] above these branches are dead code;
        # import the operator before enabling self-attention stages.
164 |         if n_sa == n_blocks:
165 |             layers.append(PrgSelfAtt(dim=planes,
166 |                                      dim_inner=planes // 2,
167 |                                      pool_size=[None, 4, 4]))
168 |         for i in range(1, n_blocks):
169 |             layers.append(Bottleneck(planes, planes, 1, None, self.model_cfg['g'],
170 |                                      self.model_cfg['b'], temporal_k=temporal_k, temporal_p=temporal_p))
171 |             if i > (n_blocks - n_sa - 1):
172 |                 layers.append(PrgSelfAtt(dim=planes,
173 |                                          dim_inner=planes // 2,
174 |                                          pool_size=[None, 4, 4]))
175 | 
176 |         return nn.Sequential(*layers)
177 | 
178 |     def forward(self, x):
179 |         x = x[0]
180 |         x = self.conv1(x)
181 |         x = self.bn1(x)
182 |         x = self.relu(x)
183 | 
184 |         x = self.layer1(x)
185 |         x = self.layer2(x)
186 |         x = self.layer3(x)
187 |         x = self.layer4(x)
188 | 
189 |         x = self.avgpool(x)
190 | 
191 |         if x.shape[2] > 1:
192 |             need_mean = True
193 |             x = x.permute(0, 2, 1, 3, 4)
194 |             x = x.view(x.size(0), x.size(1), -1)
195 |         else:
196 |             x = x.view(x.size(0), -1)
197 |             need_mean = False
198 | 
199 |         # head
200 |         if hasattr(self, 'dropout'):
201 |             x = self.dropout(x)
202 |         x = self.fc(x)
203 | 
204 |         if not self.training:
205 |             x = self.act(x)
206 | 
207 |         if need_mean:
208 |             x = x.mean(dim=1)
209 | 
210 |         return x
211 | 
--------------------------------------------------------------------------------
/slowfast/datasets/ssv1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 | 
4 | import os
5 | import random
6 | import torch
7 | import numpy as np
8 | import torch.utils.data
9 | from itertools import chain as chain
10 | from fvcore.common.file_io import PathManager
11 | from PIL import Image
12 | 
13 | import slowfast.utils.logging as logging
14 | from . import transform as transform
15 | from . import utils as utils
16 | from .decoder import get_start_end_idx
17 | from .build import DATASET_REGISTRY
18 | from .utils import retry_load_images
19 | 
20 | logger = logging.get_logger(__name__)
21 | 
22 | 
23 | @DATASET_REGISTRY.register()
24 | class Ssv1(torch.utils.data.Dataset):
25 |     """
26 |     Something-Something v1 (SSV1) video loader. Construct the SSV1 video loader,
27 |     then sample clips from the videos. For training and validation, a single
28 |     clip is randomly sampled from every video with random cropping, scaling, and
29 |     flipping. For testing, multiple clips are uniformly sampled from every
30 |     video with uniform cropping. For uniform cropping, we take the left, center,
31 |     and right crop if the width is larger than height, or take top, center, and
32 |     bottom crop if the height is larger than the width.
33 |     """
34 | 
35 |     def __init__(self, cfg, mode, num_retries=10):
36 |         """
37 |         Construct the Ssv1 video loader with a given csv file. The format of
38 |         the csv file is:
39 |         ```
40 |         path_to_video_1 video_len_1 label_1
41 |         path_to_video_2 video_len_2 label_2
42 |         ...
43 |         path_to_video_N video_len_N label_N
44 |         ```
45 |         Args:
46 |             cfg (CfgNode): configs.
47 |             mode (string): Options include `train`, `val`, or `test` mode.
48 |                 For the train and val mode, the data loader will take data
49 |                 from the train or val set, and sample one clip per video.
50 |                 For the test mode, the data loader will take data from test set,
51 |                 and sample multiple clips per video.
52 |             num_retries (int): number of retries.
53 |         """
54 |         # Only support train, val, and test mode.
55 |         assert mode in [
56 |             "train",
57 |             "val",
58 |             "test",
59 |         ], "Split '{}' not supported for Ssv1".format(mode)
60 |         self.mode = mode
61 |         self.cfg = cfg
62 | 
63 |         self._video_meta = {}
64 |         self._num_retries = num_retries
65 |         # For training or validation mode, one single clip is sampled from every
66 |         # video. For testing, NUM_ENSEMBLE_VIEWS clips are sampled from every
67 |         # video. For every clip, NUM_SPATIAL_CROPS is cropped spatially from
68 |         # the frames.
69 |         if self.mode in ["train", "val"]:
70 |             self._num_clips = 1
71 |         elif self.mode in ["test"]:
72 |             self._num_clips = (
73 |                 cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS
74 |             )
75 | 
76 |         logger.info("Constructing Something-Something v1 {}...".format(mode))
77 |         self._construct_loader()
78 | 
79 |     def _construct_loader(self):
80 |         """
81 |         Construct the video loader.
82 |         """
83 |         path_to_file = os.path.join(
84 |             self.cfg.DATA.PATH_TO_DATA_DIR, "{}.csv".format(self.mode)
85 |         )
86 |         assert PathManager.exists(path_to_file), "{} dir not found".format(
87 |             path_to_file
88 |         )
89 | 
90 |         self._path_to_videos = []
91 |         self._video_len = []
92 |         self._labels = []
93 |         self._spatial_temporal_idx = []
94 |         with open(path_to_file, "r") as f:
95 |             for clip_idx, path_vlen_label in enumerate(f.read().splitlines()):
96 |                 assert len(path_vlen_label.split()) == 3
97 |                 path, vlen, label = path_vlen_label.split()
98 |                 for idx in range(self._num_clips):
99 |                     self._path_to_videos.append(
100 |                         os.path.join(self.cfg.DATA.PATH_PREFIX, path)
101 |                     )
102 |                     self._video_len.append(int(vlen))
103 |                     self._labels.append(int(label))
104 |                     self._spatial_temporal_idx.append(idx)
105 |                     self._video_meta[clip_idx * self._num_clips + idx] = {}
106 |         assert (
107 |             len(self._path_to_videos) > 0
108 |         ), "Failed to load Something-Something v1 split {} from {}".format(
109 |             self.mode, path_to_file  # `self._split_idx` was never defined; use the mode
110 |         )
111 |         logger.info(
112 |             "Something-Something v1 dataloader constructed (size: {}) from {}".format(
113 |                 len(self._path_to_videos), path_to_file
114 |             )
115 |         )
116 | 
117 |     def __getitem__(self, index):
118 |         """
119 |         Given the video index, return the list of frames, label, and video
120 |         index if the video frames can be fetched.
121 |         Args:
122 |             index (int): the video index provided by the pytorch sampler.
123 |         Returns:
124 |             frames (tensor): the frames sampled from the video. The dimension
125 |                 is `channel` x `num frames` x `height` x `width`.
126 |             label (int): the label of the current video.
127 |             index (int): the index of the video.
128 | 
129 |         """
130 |         if self.mode in ["train", "val"]:
131 |             # -1 indicates random sampling.
132 |             temporal_sample_index = -1
133 |             spatial_sample_index = -1
134 |             min_scale = self.cfg.DATA.TRAIN_JITTER_SCALES[0]
135 |             max_scale = self.cfg.DATA.TRAIN_JITTER_SCALES[1]
136 |             crop_size = self.cfg.DATA.TRAIN_CROP_SIZE
137 |         elif self.mode in ["test"]:
138 |             temporal_sample_index = (
139 |                 self._spatial_temporal_idx[index]
140 |                 // self.cfg.TEST.NUM_SPATIAL_CROPS
141 |             )
142 |             # spatial_sample_index is in [0, 1, 2]. Corresponding to left,
143 |             # center, or right if width is larger than height, and top, middle,
144 |             # or bottom if height is larger than width.
145 |             spatial_sample_index = (
146 |                 self._spatial_temporal_idx[index]
147 |                 % self.cfg.TEST.NUM_SPATIAL_CROPS
148 |             )
149 |             min_scale, max_scale, crop_size = [
150 |                 self.cfg.DATA.TEST_CROP_SIZE] * 3
151 |             # The testing is deterministic and no jitter should be performed.
152 |             # min_scale, max_scale, and crop_size are expected to be the same.
153 |             assert len({min_scale, max_scale, crop_size}) == 1
154 |         else:
155 |             raise NotImplementedError(
156 |                 "Does not support {} mode".format(self.mode)
157 |             )
158 |         label = self._labels[index]
159 | 
160 |         num_frames = self.cfg.DATA.NUM_FRAMES
161 |         video_length = self._video_len[index]
162 | 
163 |         seg_size = float(video_length - 1) / num_frames
164 |         seq = []
165 |         for i in range(num_frames):
166 |             start = int(np.round(seg_size * i))
167 |             end = int(np.round(seg_size * (i + 1)))
168 |             if self.mode == "train":
169 |                 seq.append(random.randint(start, end))
170 |             else:
171 |                 seq.append((start + end) // 2)
172 | 
173 |         frames = torch.as_tensor(
174 |             utils.retry_load_images(
175 |                 [os.path.join(self._path_to_videos[index], '%05d.jpg' % (frame + 1)) for frame in seq],
176 |                 self._num_retries,
177 |             )
178 |         )
179 | 
180 |         # Perform color normalization.
181 |         frames = utils.tensor_normalize(
182 |             frames, self.cfg.DATA.MEAN, self.cfg.DATA.STD
183 |         )
184 | 
185 |         # T H W C -> C T H W.
186 |         frames = frames.permute(3, 0, 1, 2)
187 |         # Perform data augmentation.
188 |         frames = utils.spatial_sampling(
189 |             frames,
190 |             spatial_idx=spatial_sample_index,
191 |             min_scale=min_scale,
192 |             max_scale=max_scale,
193 |             crop_size=crop_size,
194 |             random_horizontal_flip=self.cfg.DATA.RANDOM_FLIP,
195 |             inverse_uniform_sampling=self.cfg.DATA.INV_UNIFORM_SAMPLE,
196 |         )
197 |         frames = utils.pack_pathway_output(self.cfg, frames)
198 |         return frames, label, index, {}
199 | 
200 |     def __len__(self):
201 |         """
202 |         Returns:
203 |             (int): the number of videos in the dataset.
204 |         """
205 |         return len(self._path_to_videos)
--------------------------------------------------------------------------------
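Editor's note: the temporal sampling in Ssv1.__getitem__ above divides a video
into NUM_FRAMES equal segments and picks one frame per segment (uniformly random
within the segment for training, the segment midpoint otherwise). A standalone
sketch of that index arithmetic with hand-checked output:

import random
import numpy as np

def sample_frame_indices(video_length, num_frames, train):
    # Mirrors the seg_size / seq loop in Ssv1.__getitem__.
    seg_size = float(video_length - 1) / num_frames
    seq = []
    for i in range(num_frames):
        start = int(np.round(seg_size * i))
        end = int(np.round(seg_size * (i + 1)))
        seq.append(random.randint(start, end) if train else (start + end) // 2)
    return seq

print(sample_frame_indices(video_length=17, num_frames=4, train=False))
# [2, 6, 10, 14]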
/slowfast/csrc/cpu/ROIAlign_cpu.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 | #include "cpu/vision.h"
3 | 
4 | // implementation taken from Caffe2
5 | template <typename T>
6 | struct PreCalc {
7 |   int pos1;
8 |   int pos2;
9 |   int pos3;
10 |   int pos4;
11 |   T w1;
12 |   T w2;
13 |   T w3;
14 |   T w4;
15 | };
16 | 
17 | template <typename T>
18 | void pre_calc_for_bilinear_interpolate(
19 |     const int height,
20 |     const int width,
21 |     const int pooled_height,
22 |     const int pooled_width,
23 |     const int iy_upper,
24 |     const int ix_upper,
25 |     T roi_start_h,
26 |     T roi_start_w,
27 |     T bin_size_h,
28 |     T bin_size_w,
29 |     int roi_bin_grid_h,
30 |     int roi_bin_grid_w,
31 |     std::vector<PreCalc<T>>& pre_calc) {
32 |   int pre_calc_index = 0;
33 |   for (int ph = 0; ph < pooled_height; ph++) {
34 |     for (int pw = 0; pw < pooled_width; pw++) {
35 |       for (int iy = 0; iy < iy_upper; iy++) {
36 |         const T yy = roi_start_h + ph * bin_size_h +
37 |             static_cast<T>(iy + .5f) * bin_size_h /
38 |                 static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
39 |         for (int ix = 0; ix < ix_upper; ix++) {
40 |           const T xx = roi_start_w + pw * bin_size_w +
41 |               static_cast<T>(ix + .5f) * bin_size_w /
42 |                   static_cast<T>(roi_bin_grid_w);
43 | 
44 |           T x = xx;
45 |           T y = yy;
46 |           // deal with: inverse elements are out of feature map boundary
47 |           if (y < -1.0 || y > height || x < -1.0 || x > width) {
48 |             // empty
49 |             PreCalc<T> pc;
50 |             pc.pos1 = 0;
51 |             pc.pos2 = 0;
52 |             pc.pos3 = 0;
53 |             pc.pos4 = 0;
54 |             pc.w1 = 0;
55 |             pc.w2 = 0;
56 |             pc.w3 = 0;
57 |             pc.w4 = 0;
58 |             pre_calc[pre_calc_index] = pc;
59 |             pre_calc_index += 1;
60 |             continue;
61 |           }
62 | 
63 |           if (y <= 0) {
64 |             y = 0;
65 |           }
66 |           if (x <= 0) {
67 |             x = 0;
68 |           }
69 | 
70 |           int y_low = (int)y;
71 |           int x_low = (int)x;
72 |           int y_high;
73 |           int x_high;
74 | 
75 |           if (y_low >= height - 1) {
76 |             y_high = y_low = height - 1;
77 |             y = (T)y_low;
78 |           } else {
79 |             y_high = y_low + 1;
80 |           }
81 | 
82 |           if (x_low >= width - 1) {
83 |             x_high = x_low = width - 1;
84 |             x = (T)x_low;
85 |           } else {
86 |             x_high = x_low + 1;
87 |           }
88 | 
89 |           T ly = y - y_low;
90 |           T lx = x - x_low;
91 |           T hy = 1. - ly, hx = 1. - lx;
92 |           T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
93 | 
94 |           // save weights and indices
95 |           PreCalc<T> pc;
96 |           pc.pos1 = y_low * width + x_low;
97 |           pc.pos2 = y_low * width + x_high;
98 |           pc.pos3 = y_high * width + x_low;
99 |           pc.pos4 = y_high * width + x_high;
100 |           pc.w1 = w1;
101 |           pc.w2 = w2;
102 |           pc.w3 = w3;
103 |           pc.w4 = w4;
104 |           pre_calc[pre_calc_index] = pc;
105 | 
106 |           pre_calc_index += 1;
107 |         }
108 |       }
109 |     }
110 |   }
111 | }
112 | 
113 | template <typename T>
114 | void ROIAlignForward_cpu_kernel(
115 |     const int nthreads,
116 |     const T* bottom_data,
117 |     const T& spatial_scale,
118 |     const int channels,
119 |     const int height,
120 |     const int width,
121 |     const int pooled_height,
122 |     const int pooled_width,
123 |     const int sampling_ratio,
124 |     const T* bottom_rois,
125 |     //int roi_cols,
126 |     T* top_data) {
127 |   //AT_ASSERT(roi_cols == 4 || roi_cols == 5);
128 |   int roi_cols = 5;
129 | 
130 |   int n_rois = nthreads / channels / pooled_width / pooled_height;
131 |   // (n, c, ph, pw) is an element in the pooled output
132 |   // can be parallelized using omp
133 |   // #pragma omp parallel for num_threads(32)
134 |   for (int n = 0; n < n_rois; n++) {
135 |     int index_n = n * channels * pooled_width * pooled_height;
136 | 
137 |     // roi could have 4 or 5 columns
138 |     const T* offset_bottom_rois = bottom_rois + n * roi_cols;
139 |     int roi_batch_ind = 0;
140 |     if (roi_cols == 5) {
141 |       roi_batch_ind = offset_bottom_rois[0];
142 |       offset_bottom_rois++;
143 |     }
144 | 
145 |     // Do not use rounding; this implementation detail is critical
146 |     T roi_start_w = offset_bottom_rois[0] * spatial_scale;
147 |     T roi_start_h = offset_bottom_rois[1] * spatial_scale;
148 |     T roi_end_w = offset_bottom_rois[2] * spatial_scale;
149 |     T roi_end_h = offset_bottom_rois[3] * spatial_scale;
150 |     // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
151 |     // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
152 |     // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
153 |     // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
154 | 
155 |     // Force malformed ROIs to be 1x1
156 |     T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
157 |     T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
158 |     T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
159 |     T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
160 | 
161 |     // We use roi_bin_grid to sample the grid and mimic integral
162 |     int roi_bin_grid_h = (sampling_ratio > 0)
163 |         ? sampling_ratio
164 |         : ceil(roi_height / pooled_height); // e.g., = 2
165 |     int roi_bin_grid_w =
166 |         (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
167 | 
168 |     // We do average (integral) pooling inside a bin
169 |     const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
170 | 171 | // we want to precalculate indices and weights shared by all channels; 172 | // this is the key point of the optimization 173 | std::vector<PreCalc<T>> pre_calc( 174 | roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); 175 | pre_calc_for_bilinear_interpolate( 176 | height, 177 | width, 178 | pooled_height, 179 | pooled_width, 180 | roi_bin_grid_h, 181 | roi_bin_grid_w, 182 | roi_start_h, 183 | roi_start_w, 184 | bin_size_h, 185 | bin_size_w, 186 | roi_bin_grid_h, 187 | roi_bin_grid_w, 188 | pre_calc); 189 | 190 | for (int c = 0; c < channels; c++) { 191 | int index_n_c = index_n + c * pooled_width * pooled_height; 192 | const T* offset_bottom_data = 193 | bottom_data + (roi_batch_ind * channels + c) * height * width; 194 | int pre_calc_index = 0; 195 | 196 | for (int ph = 0; ph < pooled_height; ph++) { 197 | for (int pw = 0; pw < pooled_width; pw++) { 198 | int index = index_n_c + ph * pooled_width + pw; 199 | 200 | T output_val = 0.; 201 | for (int iy = 0; iy < roi_bin_grid_h; iy++) { 202 | for (int ix = 0; ix < roi_bin_grid_w; ix++) { 203 | PreCalc<T> pc = pre_calc[pre_calc_index]; 204 | output_val += pc.w1 * offset_bottom_data[pc.pos1] + 205 | pc.w2 * offset_bottom_data[pc.pos2] + 206 | pc.w3 * offset_bottom_data[pc.pos3] + 207 | pc.w4 * offset_bottom_data[pc.pos4]; 208 | 209 | pre_calc_index += 1; 210 | } 211 | } 212 | output_val /= count; 213 | 214 | top_data[index] = output_val; 215 | } // for pw 216 | } // for ph 217 | } // for c 218 | } // for n 219 | } 220 | 221 | at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, 222 | const at::Tensor& rois, 223 | const float spatial_scale, 224 | const int pooled_height, 225 | const int pooled_width, 226 | const int sampling_ratio) { 227 | AT_ASSERTM(!input.type().is_cuda(), "input must be a CPU tensor"); 228 | AT_ASSERTM(!rois.type().is_cuda(), "rois must be a CPU tensor"); 229 | 230 | auto num_rois = rois.size(0); 231 | auto channels = input.size(1); 232 | auto height = input.size(2); 233 | auto width = input.size(3); 234 | 235 | auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); 236 | auto output_size = num_rois * pooled_height * pooled_width * channels; 237 | 238 | if (output.numel() == 0) { 239 | return output; 240 | } 241 | 242 | AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { 243 | ROIAlignForward_cpu_kernel<scalar_t>( 244 | output_size, 245 | input.data<scalar_t>(), 246 | spatial_scale, 247 | channels, 248 | height, 249 | width, 250 | pooled_height, 251 | pooled_width, 252 | sampling_ratio, 253 | rois.data<scalar_t>(), 254 | output.data<scalar_t>()); 255 | }); 256 | return output; 257 | } -------------------------------------------------------------------------------- /slowfast/models/backbones/resnet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | 5 | import slowfast.utils.weight_init_helper as init_helper 6 | from slowfast.models.batchnorm_helper import get_norm 7 | 8 | from .. import head_helper, resnet_helper, stem_helper 9 | from ..build import MODEL_REGISTRY 10 | from . import _MODEL_STAGE_DEPTH, _TEMPORAL_KERNEL_BASIS, _POOL1 11 | 12 | 13 | @MODEL_REGISTRY.register() 14 | class ResNet(nn.Module): 15 | """ 16 | ResNet model builder. It builds a ResNet-like network backbone without 17 | lateral connections (C2D, I3D, Slow). 18 | 19 | Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He. 20 | "SlowFast networks for video recognition."
21 | https://arxiv.org/pdf/1812.03982.pdf 22 | 23 | Xiaolong Wang, Ross Girshick, Abhinav Gupta, and Kaiming He. 24 | "Non-local neural networks." 25 | https://arxiv.org/pdf/1711.07971.pdf 26 | """ 27 | 28 | def __init__(self, cfg): 29 | """ 30 | The `__init__` method of any subclass should also contain these 31 | arguments. 32 | 33 | Args: 34 | cfg (CfgNode): model building configs, details are in the 35 | comments of the config file. 36 | """ 37 | super(ResNet, self).__init__() 38 | self.norm_module = get_norm(cfg) 39 | self.enable_detection = cfg.DETECTION.ENABLE 40 | self.num_pathways = 1 41 | self._cfg = cfg 42 | self._construct_network(cfg) 43 | init_helper.init_weights( 44 | self, cfg.MODEL.FC_INIT_STD, cfg.RESNET.ZERO_INIT_FINAL_BN 45 | ) 46 | 47 | def _construct_network(self, cfg): 48 | """ 49 | Builds a single pathway ResNet model. 50 | 51 | Args: 52 | cfg (CfgNode): model building configs, details are in the 53 | comments of the config file. 54 | """ 55 | assert cfg.MODEL.ARCH in _POOL1.keys() 56 | pool_size = _POOL1[cfg.MODEL.ARCH] 57 | assert len({len(pool_size), self.num_pathways}) == 1 58 | assert cfg.RESNET.DEPTH in _MODEL_STAGE_DEPTH.keys() 59 | 60 | (d2, d3, d4, d5) = _MODEL_STAGE_DEPTH[cfg.RESNET.DEPTH] 61 | 62 | num_groups = cfg.RESNET.NUM_GROUPS 63 | width_per_group = cfg.RESNET.WIDTH_PER_GROUP 64 | dim_inner = num_groups * width_per_group 65 | 66 | temp_kernel = _TEMPORAL_KERNEL_BASIS[cfg.MODEL.ARCH] 67 | 68 | self.s1 = stem_helper.VideoModelStem( 69 | cfg=cfg, 70 | dim_in=cfg.DATA.INPUT_CHANNEL_NUM, 71 | dim_out=[width_per_group], 72 | kernel=[temp_kernel[0][0] + [7, 7]], 73 | stride=[[1, 2, 2]], # [2,2,2] for non-sparse 74 | padding=[[temp_kernel[0][0][0] // 2, 3, 3]], 75 | stem_func_name=cfg.RESNET.STEM_FUNC, 76 | norm_module=self.norm_module, 77 | ) 78 | 79 | self.s2 = resnet_helper.ResStage( 80 | cfg=cfg, 81 | dim_in=[width_per_group], 82 | dim_out=[width_per_group * 4], 83 | dim_inner=[dim_inner], 84 | temp_kernel_sizes=temp_kernel[1], 85 | stride=cfg.RESNET.SPATIAL_STRIDES[0], 86 | num_blocks=[d2], 87 | num_groups=[num_groups], 88 | num_block_temp_kernel=cfg.RESNET.NUM_BLOCK_TEMP_KERNEL[0], 89 | nonlocal_inds=cfg.NONLOCAL.LOCATION[0], 90 | nonlocal_group=cfg.NONLOCAL.GROUP[0], 91 | nonlocal_pool=cfg.NONLOCAL.POOL[0], 92 | nonlocal_use_bn=cfg.NONLOCAL.USE_BN, 93 | nonlocal_progress=cfg.NONLOCAL.PROGRESS, 94 | instantiation=cfg.NONLOCAL.INSTANTIATION, 95 | trans_func_name=cfg.RESNET.TRANS_FUNC, 96 | stride_1x1=cfg.RESNET.STRIDE_1X1, 97 | inplace_relu=cfg.RESNET.INPLACE_RELU, 98 | dilation=cfg.RESNET.SPATIAL_DILATIONS[0], 99 | norm_module=self.norm_module, 100 | temp_progress=cfg.PGT.ENABLE, 101 | ) 102 | 103 | for pathway in range(self.num_pathways): 104 | pool = nn.MaxPool3d( 105 | kernel_size=pool_size[pathway], 106 | stride=pool_size[pathway], 107 | padding=[0, 0, 0], 108 | ) 109 | self.add_module("pathway{}_pool".format(pathway), pool) 110 | 111 | self.s3 = resnet_helper.ResStage( 112 | cfg=cfg, 113 | dim_in=[width_per_group * 4], 114 | dim_out=[width_per_group * 8], 115 | dim_inner=[dim_inner * 2], 116 | temp_kernel_sizes=temp_kernel[2], 117 | stride=cfg.RESNET.SPATIAL_STRIDES[1], 118 | num_blocks=[d3], 119 | num_groups=[num_groups], 120 | num_block_temp_kernel=cfg.RESNET.NUM_BLOCK_TEMP_KERNEL[1], 121 | nonlocal_inds=cfg.NONLOCAL.LOCATION[1], 122 | nonlocal_group=cfg.NONLOCAL.GROUP[1], 123 | nonlocal_pool=cfg.NONLOCAL.POOL[1], 124 | nonlocal_use_bn=cfg.NONLOCAL.USE_BN, 125 | nonlocal_progress=cfg.NONLOCAL.PROGRESS, 126 | 
instantiation=cfg.NONLOCAL.INSTANTIATION, 127 | trans_func_name=cfg.RESNET.TRANS_FUNC, 128 | stride_1x1=cfg.RESNET.STRIDE_1X1, 129 | inplace_relu=cfg.RESNET.INPLACE_RELU, 130 | dilation=cfg.RESNET.SPATIAL_DILATIONS[1], 131 | norm_module=self.norm_module, 132 | temp_progress=cfg.PGT.ENABLE, 133 | ) 134 | 135 | self.s4 = resnet_helper.ResStage( 136 | cfg=cfg, 137 | dim_in=[width_per_group * 8], 138 | dim_out=[width_per_group * 16], 139 | dim_inner=[dim_inner * 4], 140 | temp_kernel_sizes=temp_kernel[3], 141 | stride=cfg.RESNET.SPATIAL_STRIDES[2], 142 | num_blocks=[d4], 143 | num_groups=[num_groups], 144 | num_block_temp_kernel=cfg.RESNET.NUM_BLOCK_TEMP_KERNEL[2], 145 | nonlocal_inds=cfg.NONLOCAL.LOCATION[2], 146 | nonlocal_group=cfg.NONLOCAL.GROUP[2], 147 | nonlocal_pool=cfg.NONLOCAL.POOL[2], 148 | nonlocal_use_bn=cfg.NONLOCAL.USE_BN, 149 | nonlocal_progress=cfg.NONLOCAL.PROGRESS, 150 | instantiation=cfg.NONLOCAL.INSTANTIATION, 151 | trans_func_name=cfg.RESNET.TRANS_FUNC, 152 | stride_1x1=cfg.RESNET.STRIDE_1X1, 153 | inplace_relu=cfg.RESNET.INPLACE_RELU, 154 | dilation=cfg.RESNET.SPATIAL_DILATIONS[2], 155 | norm_module=self.norm_module, 156 | temp_progress=cfg.PGT.ENABLE, 157 | ) 158 | 159 | self.s5 = resnet_helper.ResStage( 160 | cfg=cfg, 161 | dim_in=[width_per_group * 16], 162 | dim_out=[width_per_group * 32], 163 | dim_inner=[dim_inner * 8], 164 | temp_kernel_sizes=temp_kernel[4], 165 | stride=cfg.RESNET.SPATIAL_STRIDES[3], 166 | num_blocks=[d5], 167 | num_groups=[num_groups], 168 | num_block_temp_kernel=cfg.RESNET.NUM_BLOCK_TEMP_KERNEL[3], 169 | nonlocal_inds=cfg.NONLOCAL.LOCATION[3], 170 | nonlocal_group=cfg.NONLOCAL.GROUP[3], 171 | nonlocal_pool=cfg.NONLOCAL.POOL[3], 172 | nonlocal_use_bn=cfg.NONLOCAL.USE_BN, 173 | nonlocal_progress=cfg.NONLOCAL.PROGRESS, 174 | instantiation=cfg.NONLOCAL.INSTANTIATION, 175 | trans_func_name=cfg.RESNET.TRANS_FUNC, 176 | stride_1x1=cfg.RESNET.STRIDE_1X1, 177 | inplace_relu=cfg.RESNET.INPLACE_RELU, 178 | dilation=cfg.RESNET.SPATIAL_DILATIONS[3], 179 | norm_module=self.norm_module, 180 | temp_progress=cfg.PGT.ENABLE, 181 | ) 182 | 183 | if self.enable_detection: 184 | self.head = head_helper.ResNetRoIHead( 185 | cfg=cfg, 186 | dim_in=[width_per_group * 32], 187 | num_classes=cfg.MODEL.NUM_CLASSES, 188 | pool_size=[[cfg.DATA.NUM_FRAMES // pool_size[0][0], 1, 1]], 189 | pool_type=cfg.MODEL.FINAL_POOL[1], 190 | resolution=[[cfg.DETECTION.ROI_XFORM_RESOLUTION] * 2], 191 | scale_factor=[cfg.DETECTION.SPATIAL_SCALE_FACTOR], 192 | dropout_rate=cfg.MODEL.DROPOUT_RATE, 193 | act_func=cfg.MODEL.HEAD_ACT, 194 | aligned=cfg.DETECTION.ALIGNED, 195 | ) 196 | else: 197 | self.head = head_helper.ResNetBasicHead( 198 | cfg=cfg, 199 | dim_in=[width_per_group * 32], 200 | num_classes=cfg.MODEL.NUM_CLASSES, 201 | pool_size=[[ 202 | cfg.DATA.NUM_FRAMES // pool_size[0][0], 203 | cfg.DATA.CROP_SIZE // 32 // pool_size[0][1], 204 | cfg.DATA.CROP_SIZE // 32 // pool_size[0][2], 205 | ]], 206 | pool_type=cfg.MODEL.FINAL_POOL, 207 | dropout_rate=cfg.MODEL.DROPOUT_RATE, 208 | act_func=cfg.MODEL.HEAD_ACT, 209 | ) 210 | 211 | def forward(self, x, bboxes=None, slices=None): 212 | x = self.s1(x) 213 | x = self.s2(x) 214 | for pathway in range(self.num_pathways): 215 | pool = getattr(self, "pathway{}_pool".format(pathway)) 216 | x[pathway] = pool(x[pathway]) 217 | x = self.s3(x) 218 | x = self.s4(x) 219 | x = self.s5(x) 220 | if self.enable_detection: 221 | x = self.head(x, bboxes, slices) 222 | else: 223 | x = self.head(x) 224 | return x 
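A quick sanity check on the head pooling arithmetic in _construct_network above: the backbone applies a total spatial stride of 32, so the head sees a CROP_SIZE // 32 feature map per side, further divided by the _POOL1 entry. A minimal sketch with assumed SLOW_8x8-style placeholder values (8 frames, 224-pixel crops, a [1, 1, 1] _POOL1 entry; these numbers are illustrative, not read from a real config):

num_frames, crop_size = 8, 224   # assumed DATA.NUM_FRAMES / DATA.CROP_SIZE
pool_t, pool_s = 1, 1            # assumed _POOL1 entry for a Slow-style arch
head_pool = [
    num_frames // pool_t,        # temporal extent reaching the head
    crop_size // 32 // pool_s,   # spatial extent after the 32x backbone stride
    crop_size // 32 // pool_s,
]
print(head_pool)                 # [8, 7, 7]: one global average over the s5 output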
-------------------------------------------------------------------------------- /slowfast/utils/multigrid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Helper functions for multigrid training.""" 5 | 6 | import numpy as np 7 | 8 | import slowfast.utils.logging as logging 9 | 10 | logger = logging.get_logger(__name__) 11 | 12 | 13 | class MultigridSchedule(object): 14 | """ 15 | This class defines the multigrid training schedule and updates cfg accordingly. 16 | """ 17 | 18 | def init_multigrid(self, cfg): 19 | """ 20 | Update cfg based on multigrid settings. 21 | Args: 22 | cfg (configs): configs that contain training and multigrid specific 23 | hyperparameters. Details can be seen in 24 | slowfast/config/defaults.py. 25 | Returns: 26 | cfg (configs): the updated cfg. 27 | """ 28 | self.schedule = None 29 | # We may modify cfg.TRAIN.BATCH_SIZE, cfg.DATA.NUM_FRAMES, and 30 | # cfg.DATA.TRAIN_CROP_SIZE during training, so we store their original 31 | # values in cfg and use them as global variables. 32 | cfg.MULTIGRID.DEFAULT_B = cfg.TRAIN.BATCH_SIZE 33 | cfg.MULTIGRID.DEFAULT_T = cfg.DATA.NUM_FRAMES 34 | cfg.MULTIGRID.DEFAULT_S = cfg.DATA.TRAIN_CROP_SIZE 35 | 36 | if cfg.MULTIGRID.LONG_CYCLE: 37 | self.schedule = self.get_long_cycle_schedule(cfg) 38 | cfg.SOLVER.STEPS = [0] + [s[-1] for s in self.schedule] 39 | # Fine-tuning phase. 40 | cfg.SOLVER.STEPS[-1] = ( 41 | cfg.SOLVER.STEPS[-2] + cfg.SOLVER.STEPS[-1] 42 | ) // 2 43 | cfg.SOLVER.LRS = [ 44 | cfg.SOLVER.GAMMA ** s[0] * s[1][0] for s in self.schedule 45 | ] 46 | # Fine-tuning phase. 47 | cfg.SOLVER.LRS = cfg.SOLVER.LRS[:-1] + [ 48 | cfg.SOLVER.LRS[-2], 49 | cfg.SOLVER.LRS[-1], 50 | ] 51 | 52 | cfg.SOLVER.MAX_EPOCH = self.schedule[-1][-1] 53 | 54 | elif cfg.MULTIGRID.SHORT_CYCLE: 55 | cfg.SOLVER.STEPS = [ 56 | int(s * cfg.MULTIGRID.EPOCH_FACTOR) for s in cfg.SOLVER.STEPS 57 | ] 58 | cfg.SOLVER.MAX_EPOCH = int( 59 | cfg.SOLVER.MAX_EPOCH * cfg.MULTIGRID.EPOCH_FACTOR 60 | ) 61 | return cfg 62 | 63 | def update_long_cycle(self, cfg, cur_epoch): 64 | """ 65 | Before every epoch, check if the long cycle shape should change. If it 66 | should, update cfg accordingly. 67 | Args: 68 | cfg (configs): configs that contain training and multigrid specific 69 | hyperparameters. Details can be seen in 70 | slowfast/config/defaults.py. 71 | cur_epoch (int): current epoch index. 72 | Returns: 73 | cfg (configs): the updated cfg. 74 | changed (bool): whether the long cycle shape changed at this epoch.
75 | """ 76 | base_b, base_t, base_s = get_current_long_cycle_shape( 77 | self.schedule, cur_epoch 78 | ) 79 | if base_s != cfg.DATA.TRAIN_CROP_SIZE or base_t != cfg.DATA.NUM_FRAMES: 80 | 81 | cfg.DATA.NUM_FRAMES = base_t 82 | cfg.DATA.TRAIN_CROP_SIZE = base_s 83 | cfg.TRAIN.BATCH_SIZE = base_b * cfg.MULTIGRID.DEFAULT_B 84 | 85 | bs_factor = ( 86 | float(cfg.TRAIN.BATCH_SIZE / cfg.NUM_GPUS) 87 | / cfg.MULTIGRID.BN_BASE_SIZE 88 | ) 89 | 90 | if bs_factor < 1: 91 | cfg.BN.NORM_TYPE = "sync_batchnorm" 92 | cfg.BN.NUM_SYNC_DEVICES = int(1.0 / bs_factor) 93 | elif bs_factor > 1: 94 | cfg.BN.NORM_TYPE = "sub_batchnorm" 95 | cfg.BN.NUM_SPLITS = int(bs_factor) 96 | else: 97 | cfg.BN.NORM_TYPE = "batchnorm" 98 | 99 | cfg.MULTIGRID.LONG_CYCLE_SAMPLING_RATE = cfg.DATA.SAMPLING_RATE * ( 100 | cfg.MULTIGRID.DEFAULT_T // cfg.DATA.NUM_FRAMES 101 | ) 102 | logger.info("Long cycle updates:") 103 | logger.info("\tBN.NORM_TYPE: {}".format(cfg.BN.NORM_TYPE)) 104 | if cfg.BN.NORM_TYPE == "sync_batchnorm": 105 | logger.info( 106 | "\tBN.NUM_SYNC_DEVICES: {}".format(cfg.BN.NUM_SYNC_DEVICES) 107 | ) 108 | elif cfg.BN.NORM_TYPE == "sub_batchnorm": 109 | logger.info("\tBN.NUM_SPLITS: {}".format(cfg.BN.NUM_SPLITS)) 110 | logger.info("\tTRAIN.BATCH_SIZE: {}".format(cfg.TRAIN.BATCH_SIZE)) 111 | logger.info( 112 | "\tDATA.NUM_FRAMES x LONG_CYCLE_SAMPLING_RATE: {}x{}".format( 113 | cfg.DATA.NUM_FRAMES, cfg.MULTIGRID.LONG_CYCLE_SAMPLING_RATE 114 | ) 115 | ) 116 | logger.info( 117 | "\tDATA.TRAIN_CROP_SIZE: {}".format(cfg.DATA.TRAIN_CROP_SIZE) 118 | ) 119 | return cfg, True 120 | else: 121 | return cfg, False 122 | 123 | def get_long_cycle_schedule(self, cfg): 124 | """ 125 | Based on multigrid hyperparameters, define the schedule of a long cycle. 126 | Args: 127 | cfg (configs): configs that contains training and multigrid specific 128 | hyperparameters. Details can be seen in 129 | slowfast/config/defaults.py. 130 | Returns: 131 | schedule (list): Specifies a list long cycle base shapes and their 132 | corresponding training epochs. 133 | """ 134 | 135 | steps = cfg.SOLVER.STEPS 136 | 137 | default_size = float( 138 | cfg.DATA.NUM_FRAMES * cfg.DATA.TRAIN_CROP_SIZE ** 2 139 | ) 140 | default_iters = steps[-1] 141 | 142 | # Get shapes and average batch size for each long cycle shape. 143 | avg_bs = [] 144 | all_shapes = [] 145 | for t_factor, s_factor in cfg.MULTIGRID.LONG_CYCLE_FACTORS: 146 | base_t = int(round(cfg.DATA.NUM_FRAMES * t_factor)) 147 | base_s = int(round(cfg.DATA.TRAIN_CROP_SIZE * s_factor)) 148 | if cfg.MULTIGRID.SHORT_CYCLE: 149 | shapes = [ 150 | [ 151 | base_t, 152 | cfg.MULTIGRID.DEFAULT_S 153 | * cfg.MULTIGRID.SHORT_CYCLE_FACTORS[0], 154 | ], 155 | [ 156 | base_t, 157 | cfg.MULTIGRID.DEFAULT_S 158 | * cfg.MULTIGRID.SHORT_CYCLE_FACTORS[1], 159 | ], 160 | [base_t, base_s], 161 | ] 162 | else: 163 | shapes = [[base_t, base_s]] 164 | 165 | # (T, S) -> (B, T, S) 166 | shapes = [ 167 | [int(round(default_size / (s[0] * s[1] * s[1]))), s[0], s[1]] 168 | for s in shapes 169 | ] 170 | avg_bs.append(np.mean([s[0] for s in shapes])) 171 | all_shapes.append(shapes) 172 | 173 | # Get schedule regardless of cfg.MULTIGRID.EPOCH_FACTOR. 
174 | total_iters = 0 175 | schedule = [] 176 | for step_index in range(len(steps) - 1): 177 | step_epochs = steps[step_index + 1] - steps[step_index] 178 | 179 | for long_cycle_index, shapes in enumerate(all_shapes): 180 | cur_epochs = ( 181 | step_epochs * avg_bs[long_cycle_index] / sum(avg_bs) 182 | ) 183 | 184 | cur_iters = cur_epochs / avg_bs[long_cycle_index] 185 | total_iters += cur_iters 186 | schedule.append((step_index, shapes[-1], cur_epochs)) 187 | 188 | iter_saving = default_iters / total_iters 189 | 190 | final_step_epochs = cfg.SOLVER.MAX_EPOCH - steps[-1] 191 | 192 | # We define the fine-tuning phase to have the same amount of iteration 193 | # saving as the rest of the training. 194 | ft_epochs = final_step_epochs / iter_saving * avg_bs[-1] 195 | 196 | schedule.append((step_index + 1, all_shapes[-1][2], ft_epochs)) 197 | 198 | # Obtain the final schedule given the desired cfg.MULTIGRID.EPOCH_FACTOR. 199 | x = ( 200 | cfg.SOLVER.MAX_EPOCH 201 | * cfg.MULTIGRID.EPOCH_FACTOR 202 | / sum(s[-1] for s in schedule) 203 | ) 204 | 205 | final_schedule = [] 206 | total_epochs = 0 207 | for s in schedule: 208 | epochs = s[2] * x 209 | total_epochs += epochs 210 | final_schedule.append((s[0], s[1], int(round(total_epochs)))) 211 | print_schedule(final_schedule) 212 | return final_schedule 213 | 214 | 215 | def print_schedule(schedule): 216 | """ 217 | Log schedule. 218 | """ 219 | logger.info("Long cycle index\tBase shape\tEpochs") 220 | for s in schedule: 221 | logger.info("{}\t{}\t{}".format(s[0], s[1], s[2])) 222 | 223 | 224 | def get_current_long_cycle_shape(schedule, epoch): 225 | """ 226 | Given a schedule and epoch index, return the long cycle base shape. 227 | Args: 228 | schedule (list): the long cycle schedule, i.e. a list of 229 | (long cycle index, base shape, epochs) tuples, as returned by 230 | get_long_cycle_schedule. 231 | epoch (int): current epoch index. 232 | Returns: 233 | shapes (list): A list describing the base shape in a long cycle: 234 | [batch size relative to default, 235 | number of frames, spatial dimension]. 236 | """ 237 | for s in schedule: 238 | if epoch < s[-1]: 239 | return s[1] 240 | return schedule[-1][1] 241 | -------------------------------------------------------------------------------- /slowfast/utils/distributed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Distributed helpers.""" 5 | 6 | import functools 7 | import logging 8 | import pickle 9 | import torch 10 | import torch.distributed as dist 11 | 12 | _LOCAL_PROCESS_GROUP = None 13 | 14 | 15 | def all_gather(tensors): 16 | """ 17 | All-gathers the provided tensors from all processes across machines. 18 | Args: 19 | tensors (list): tensors to perform all gather across all processes in 20 | all machines. 21 | """ 22 | 23 | gather_list = [] 24 | output_tensor = [] 25 | world_size = dist.get_world_size() 26 | for tensor in tensors: 27 | tensor_placeholder = [ 28 | torch.ones_like(tensor) for _ in range(world_size) 29 | ] 30 | dist.all_gather(tensor_placeholder, tensor, async_op=False) 31 | gather_list.append(tensor_placeholder) 32 | for gathered_tensor in gather_list: 33 | output_tensor.append(torch.cat(gathered_tensor, dim=0)) 34 | return output_tensor 35 | 36 | 37 | def all_reduce(tensors, average=True): 38 | """ 39 | All-reduces the provided tensors from all processes across machines.
40 | Args: 41 | tensors (list): tensors to perform all reduce across all processes in 42 | all machines. 43 | average (bool): if True, divides the reduced tensors by the overall 44 | number of processes across all machines. 45 | """ 46 | 47 | for tensor in tensors: 48 | dist.all_reduce(tensor, async_op=False) 49 | if average: 50 | world_size = dist.get_world_size() 51 | for tensor in tensors: 52 | tensor.mul_(1.0 / world_size) 53 | return tensors 54 | 55 | 56 | def init_process_group( 57 | local_rank, 58 | local_world_size, 59 | shard_id, 60 | num_shards, 61 | init_method, 62 | dist_backend="nccl", 63 | ): 64 | """ 65 | Initializes the default process group. 66 | Args: 67 | local_rank (int): the rank on the current local machine. 68 | local_world_size (int): the world size (number of processes running) on 69 | the current local machine. 70 | shard_id (int): the shard index (machine rank) of the current machine. 71 | num_shards (int): number of shards for distributed training. 72 | init_method (string): supporting two different methods for 73 | initializing process groups: 74 | "file": use shared file system to initialize the groups across 75 | different processes. 76 | "tcp": use tcp address to initialize the groups across different processes. 77 | dist_backend (string): backend to use for distributed training. Options 78 | include gloo, mpi and nccl; the details can be found here: 79 | https://pytorch.org/docs/stable/distributed.html 80 | """ 81 | # Sets the GPU to use. 82 | torch.cuda.set_device(local_rank) 83 | # Initialize the process group. 84 | proc_rank = local_rank + shard_id * local_world_size 85 | world_size = local_world_size * num_shards 86 | dist.init_process_group( 87 | backend=dist_backend, 88 | init_method=init_method, 89 | world_size=world_size, 90 | rank=proc_rank, 91 | ) 92 | 93 | 94 | def is_master_proc(num_gpus=8): 95 | """ 96 | Determines if the current process is the master process. 97 | """ 98 | if torch.distributed.is_initialized(): 99 | return dist.get_rank() % num_gpus == 0 100 | else: 101 | return True 102 | 103 | 104 | def get_world_size(): 105 | """ 106 | Get the size of the world. 107 | """ 108 | if not dist.is_available(): 109 | return 1 110 | if not dist.is_initialized(): 111 | return 1 112 | return dist.get_world_size() 113 | 114 | 115 | def get_rank(): 116 | """ 117 | Get the rank of the current process. 118 | """ 119 | if not dist.is_available(): 120 | return 0 121 | if not dist.is_initialized(): 122 | return 0 123 | return dist.get_rank() 124 | 125 | 126 | def synchronize(): 127 | """ 128 | Helper function to synchronize (barrier) among all processes when 129 | using distributed training. 130 | """ 131 | if not dist.is_available(): 132 | return 133 | if not dist.is_initialized(): 134 | return 135 | world_size = dist.get_world_size() 136 | if world_size == 1: 137 | return 138 | dist.barrier() 139 | 140 | 141 | @functools.lru_cache() 142 | def _get_global_gloo_group(): 143 | """ 144 | Return a process group based on gloo backend, containing all the ranks. 145 | The result is cached. 146 | Returns: 147 | (group): pytorch dist group. 148 | """ 149 | if dist.get_backend() == "nccl": 150 | return dist.new_group(backend="gloo") 151 | else: 152 | return dist.group.WORLD 153 | 154 | 155 | def _serialize_to_tensor(data, group): 156 | """ 157 | Serialize arbitrary picklable data to a ByteTensor. Note that only the 158 | `gloo` and `nccl` backends are supported. 159 | Args: 160 | data (data): data to be serialized. 161 | group (group): pytorch dist group.
162 | Returns: 163 | tensor (ByteTensor): the serialized tensor. 164 | """ 165 | 166 | backend = dist.get_backend(group) 167 | assert backend in ["gloo", "nccl"] 168 | device = torch.device("cpu" if backend == "gloo" else "cuda") 169 | 170 | buffer = pickle.dumps(data) 171 | if len(buffer) > 1024 ** 3: 172 | logger = logging.getLogger(__name__) 173 | logger.warning( 174 | "Rank {} trying to all-gather {:.2f} GB of data on device {}".format( 175 | get_rank(), len(buffer) / (1024 ** 3), device 176 | ) 177 | ) 178 | storage = torch.ByteStorage.from_buffer(buffer) 179 | tensor = torch.ByteTensor(storage).to(device=device) 180 | return tensor 181 | 182 | 183 | def _pad_to_largest_tensor(tensor, group): 184 | """ 185 | Pad all the tensors from different GPUs to the size of the largest one. 186 | Args: 187 | tensor (tensor): tensor to pad. 188 | group (group): pytorch dist group. 189 | Returns: 190 | list[int]: size of the tensor, on each rank 191 | Tensor: padded tensor that has the max size 192 | """ 193 | world_size = dist.get_world_size(group=group) 194 | assert ( 195 | world_size >= 1 196 | ), "comm.gather/all_gather must be called from ranks within the given group!" 197 | local_size = torch.tensor( 198 | [tensor.numel()], dtype=torch.int64, device=tensor.device 199 | ) 200 | size_list = [ 201 | torch.zeros([1], dtype=torch.int64, device=tensor.device) 202 | for _ in range(world_size) 203 | ] 204 | dist.all_gather(size_list, local_size, group=group) 205 | size_list = [int(size.item()) for size in size_list] 206 | 207 | max_size = max(size_list) 208 | 209 | # we pad the tensor because torch all_gather does not support 210 | # gathering tensors of different shapes 211 | if local_size != max_size: 212 | padding = torch.zeros( 213 | (max_size - local_size,), dtype=torch.uint8, device=tensor.device 214 | ) 215 | tensor = torch.cat((tensor, padding), dim=0) 216 | return size_list, tensor 217 | 218 | 219 | def all_gather_unaligned(data, group=None): 220 | """ 221 | Run all_gather on arbitrary picklable data (not necessarily tensors). 222 | 223 | Args: 224 | data: any picklable object 225 | group: a torch process group. By default, will use a group which 226 | contains all ranks on gloo backend. 227 | 228 | Returns: 229 | list[data]: list of data gathered from each rank 230 | """ 231 | if get_world_size() == 1: 232 | return [data] 233 | if group is None: 234 | group = _get_global_gloo_group() 235 | if dist.get_world_size(group) == 1: 236 | return [data] 237 | 238 | tensor = _serialize_to_tensor(data, group) 239 | 240 | size_list, tensor = _pad_to_largest_tensor(tensor, group) 241 | max_size = max(size_list) 242 | 243 | # receiving Tensor from all ranks 244 | tensor_list = [ 245 | torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) 246 | for _ in size_list 247 | ] 248 | dist.all_gather(tensor_list, tensor, group=group) 249 | 250 | data_list = [] 251 | for size, tensor in zip(size_list, tensor_list): 252 | buffer = tensor.cpu().numpy().tobytes()[:size] 253 | data_list.append(pickle.loads(buffer)) 254 | 255 | return data_list 256 | 257 | 258 | def init_distributed_training(cfg): 259 | """ 260 | Initialize variables needed for distributed training.
261 | """ 262 | if cfg.NUM_GPUS == 1: 263 | return 264 | num_gpus_per_machine = cfg.NUM_GPUS 265 | num_machines = dist.get_world_size() // num_gpus_per_machine 266 | for i in range(num_machines): 267 | ranks_on_i = list( 268 | range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine) 269 | ) 270 | pg = dist.new_group(ranks_on_i) 271 | if i == cfg.SHARD_ID: 272 | global _LOCAL_PROCESS_GROUP 273 | _LOCAL_PROCESS_GROUP = pg 274 | 275 | 276 | def get_local_size() -> int: 277 | """ 278 | Returns: 279 | The size of the per-machine process group, 280 | i.e. the number of processes per machine. 281 | """ 282 | if not dist.is_available(): 283 | return 1 284 | if not dist.is_initialized(): 285 | return 1 286 | return dist.get_world_size(group=_LOCAL_PROCESS_GROUP) 287 | 288 | 289 | def get_local_rank() -> int: 290 | """ 291 | Returns: 292 | The rank of the current process within the local (per-machine) process group. 293 | """ 294 | if not dist.is_available(): 295 | return 0 296 | if not dist.is_initialized(): 297 | return 0 298 | assert _LOCAL_PROCESS_GROUP is not None 299 | return dist.get_rank(group=_LOCAL_PROCESS_GROUP) 300 | --------------------------------------------------------------------------------