├── slowfast ├── __init__.py ├── utils │ ├── ava_evaluation │ │ ├── __init__.py │ │ ├── README.md │ │ ├── np_box_mask_list.py │ │ ├── ava_action_list_v2.1_for_activitynet_2018.pbtxt.txt │ │ ├── np_box_ops.py │ │ ├── np_mask_ops.py │ │ ├── np_box_list.py │ │ ├── metrics.py │ │ └── label_map_util.py │ ├── __init__.py │ ├── env.py │ ├── weight_init_helper.py │ ├── multiprocessing.py │ ├── metrics.py │ ├── logging.py │ ├── bn_helper.py │ ├── lr_policy.py │ ├── aia_model_loading.py │ ├── benchmark.py │ ├── parser.py │ ├── setup_moxing_env.py │ ├── c2_model_loading.py │ ├── multigrid.py │ └── distributed.py ├── config │ └── __init__.py ├── visualization │ └── __init__.py ├── csrc │ ├── vision.cpp │ ├── cpu │ │ ├── vision.h │ │ └── ROIAlign_cpu.cpp │ ├── cuda │ │ └── vision.h │ └── ROIAlign.h ├── models │ ├── __init__.py │ ├── losses.py │ ├── build.py │ ├── detection_helper.py │ ├── backbones │ │ ├── __init__.py │ │ ├── x3d.py │ │ ├── regnet.py │ │ └── resnet.py │ ├── operators.py │ ├── optimizer.py │ └── nonlocal_helper.py └── datasets │ ├── __init__.py │ ├── build.py │ ├── video_container.py │ ├── multigrid_helper.py │ ├── loader.py │ ├── ava_helper.py │ └── ssv1.py ├── .gitignore ├── configs ├── Kinetics │ ├── SLOW_8x8_R101_50.yaml │ ├── SLOW_PROG_36x8_R101_50.yaml │ ├── SLOW_PROG_36x8_R50.yaml │ ├── SLOWFAST_PROG_36x8_R50.yaml │ ├── SLOW_8x8_R50.yaml │ ├── SLOW_8x8_R101.yaml │ └── SLOWFAST_8x8_R50.yaml └── Charades │ ├── SLOWFAST_PROG_76x8_R50_K400.yaml │ ├── SLOWFAST_PROG_76x8_R101_K400.yaml │ ├── SLOWFAST_PROG_76x8_R101_K600.yaml │ ├── SLOW_PROG_76x8_R50_K400.yaml │ ├── SLOW_PROG_76x8_R101_K400.yaml │ ├── SLOW_16x8_R50_K400.yaml │ ├── SLOW_16x8_R101_K400.yaml │ ├── SLOWFAST_16x8_R50_K400.yaml │ ├── SLOWFAST_16x8_R101_K400.yaml │ └── SLOWFAST_16x8_R101_K600.yaml ├── requirements.txt ├── README.md ├── MODEL_ZOO.md ├── tools ├── run_net.py ├── eval_ava.py ├── visualize_log.py └── test_net.py └── setup.py /slowfast/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /slowfast/config/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /slowfast/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /slowfast/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .idea/ 3 | __pycache__/ 4 | 5 | *.so 6 | build/ 7 | data/ 8 | scripts/ 9 | logs 10 | slowfast.egg-info/ 11 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/README.md: -------------------------------------------------------------------------------- 1 | The code under this folder is from the official [ActivityNet repo](https://github.com/activitynet/ActivityNet). 2 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOW_8x8_R101_50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOW_8x8_R101.yaml 2 | 3 | RESNET: 4 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 5 | 6 | LOGS: 7 | DIR: logs/Kinetics/SLOW_8x8_R101_50 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | av 2 | filelock 3 | fvcore 4 | moviepy 5 | opencv-python 6 | pandas 7 | psutil 8 | pycocotools 9 | simplejson 10 | scikit-learn 11 | tensorboardX 12 | torch 13 | torchvision 14 | tqdm -------------------------------------------------------------------------------- /slowfast/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | #include "ROIAlign.h" 2 | 3 | 4 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 5 | m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); 6 | m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); 7 | } -------------------------------------------------------------------------------- /slowfast/models/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from .build import MODEL_REGISTRY, build_model # noqa 5 | from .backbones.resnet import ResNet # noqa 6 | from .backbones.regnet import RegNet # noqa 7 | from .backbones.slowfast import SlowFast # noqa 8 | from .backbones.x3d import X3D # noqa -------------------------------------------------------------------------------- /slowfast/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from .ava_dataset import Ava # noqa 5 | from .build import DATASET_REGISTRY, build_dataset # noqa 6 | from .charades import Charades # noqa 7 | from .kinetics import Kinetics # noqa 8 | from .ssv1 import Ssv1 # noqa 9 | from .ssv2 import Ssv2 # noqa -------------------------------------------------------------------------------- /slowfast/utils/env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 | 4 | """Set up Environment.""" 5 | 6 | import slowfast.utils.logging as logging 7 | 8 | _ENV_SETUP_DONE = False 9 | 10 | 11 | def setup_environment(): 12 | global _ENV_SETUP_DONE 13 | if _ENV_SETUP_DONE: 14 | return 15 | _ENV_SETUP_DONE = True 16 | -------------------------------------------------------------------------------- /slowfast/csrc/cpu/vision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | 5 | at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, 6 | const at::Tensor& rois, 7 | const float spatial_scale, 8 | const int pooled_height, 9 | const int pooled_width, 10 | const int sampling_ratio); -------------------------------------------------------------------------------- /configs/Kinetics/SLOW_PROG_36x8_R101_50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOW_8x8_R101_50.yaml 2 | 3 | DATA: 4 | NUM_FRAMES: 36 5 | 6 | PGT: 7 | ENABLE: True 8 | STEP_LEN: [8] 9 | STEPS: 5 10 | OVERLAP: [1] 11 | PG_EVAL: False 12 | TRAIN_TOGETHER: True 13 | 14 | SOLVER: 15 | MAX_EPOCH: 98 16 | WARMUP_EPOCHS: 17.0 17 | WEIGHT_DECAY: 2e-4 18 | 19 | TEST: 20 | BATCH_SIZE: 8 21 | NUM_ENSEMBLE_VIEWS: 2 22 | NUM_SPATIAL_CROPS: 3 23 | 24 | LOGS: 25 | DIR: logs/Kinetics/SLOW_PROG_36x8_R101_50 -------------------------------------------------------------------------------- /configs/Kinetics/SLOW_PROG_36x8_R50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOW_8x8_R50.yaml 2 | 3 | DATA: 4 | NUM_FRAMES: 36 5 | 6 | PGT: 7 | ENABLE: True 8 | STEP_LEN: [8] 9 | STEPS: 5 10 | OVERLAP: [1] 11 | CACHE: last 12 | CACHE_MOMENTUM: 0.25 13 | PG_EVAL: False 14 | TRAIN_TOGETHER: True 15 | 16 | SOLVER: 17 | MAX_EPOCH: 98 18 | WARMUP_EPOCHS: 17.0 19 | WEIGHT_DECAY: 2e-4 20 | 21 | TEST: 22 | BATCH_SIZE: 8 23 | NUM_ENSEMBLE_VIEWS: 2 24 | NUM_SPATIAL_CROPS: 3 25 | 26 | LOGS: 27 | DIR: logs/Kinetics/SLOW_PROG_36x8_R50 -------------------------------------------------------------------------------- /configs/Kinetics/SLOWFAST_PROG_36x8_R50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOWFAST_8x8_R50.yaml 2 | 3 | TRAIN: 4 | EVAL_PERIOD: 1 5 | 6 | DATA: 7 | NUM_FRAMES: 144 8 | 9 | PGT: 10 | ENABLE: True 11 | STEP_LEN: [8, 32] 12 | STEPS: 5 13 | OVERLAP: [1, 4] 14 | TRAIN_TOGETHER: True 15 | PG_EVAL: False 16 | TPOOL_SIZE: [4, 4] 17 | 18 | SOLVER: 19 | MAX_EPOCH: 98 20 | WARMUP_EPOCHS: 17.0 21 | WEIGHT_DECAY: 2e-4 22 | 23 | TEST: 24 | BATCH_SIZE: 8 25 | NUM_ENSEMBLE_VIEWS: 2 26 | NUM_SPATIAL_CROPS: 3 27 | 28 | LOGS: 29 | DIR: logs/Kinetics/SLOWFAST_PROG_36x8_R50 -------------------------------------------------------------------------------- /configs/Charades/SLOWFAST_PROG_76x8_R50_K400.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOWFAST_16x8_R50_K400.yaml 2 | 3 | DATA: 4 | NUM_FRAMES: 304 5 | 6 | PGT: 7 | ENABLE: True 8 | STEP_LEN: [16, 64] 9 | STEPS: 5 10 | OVERLAP: [1, 1] 11 | CACHE: max 12 | CACHE_MOMENTUM: 0.25 13 | PG_EVAL: True 14 | TRAIN_TOGETHER: True 15 | ENSEMBLE_METHOD: max 16 | 17 | SOLVER: 18 | BASE_LR: 0.0125 19 | STEPS: [0, 20] 20 | MAX_EPOCH: 25 21 | WARMUP_EPOCHS: 4.0 22 | 23 | TEST: 24 | BATCH_SIZE: 16 25 | NUM_ENSEMBLE_VIEWS: 2 26 | NUM_SPATIAL_CROPS: 3 27 | 28 | LOGS: 29 | DIR: logs/Charades/SLOWFAST_PROG_76x8_R50_K400/ -------------------------------------------------------------------------------- 
/configs/Charades/SLOWFAST_PROG_76x8_R101_K400.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOWFAST_16x8_R101_K400.yaml 2 | 3 | DATA: 4 | NUM_FRAMES: 304 5 | 6 | PGT: 7 | ENABLE: True 8 | STEP_LEN: [16, 64] 9 | STEPS: 5 10 | OVERLAP: [1, 1] 11 | CACHE: max 12 | CACHE_MOMENTUM: 0.25 13 | PG_EVAL: True 14 | TRAIN_TOGETHER: True 15 | ENSEMBLE_METHOD: max 16 | 17 | SOLVER: 18 | BASE_LR: 0.0125 19 | STEPS: [0, 20] 20 | MAX_EPOCH: 25 21 | WARMUP_EPOCHS: 4.0 22 | 23 | TEST: 24 | BATCH_SIZE: 16 25 | NUM_ENSEMBLE_VIEWS: 2 26 | NUM_SPATIAL_CROPS: 3 27 | 28 | LOGS: 29 | DIR: logs/Charades/SLOWFAST_PROG_76x8_R101_K400/ -------------------------------------------------------------------------------- /configs/Charades/SLOWFAST_PROG_76x8_R101_K600.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOWFAST_16x8_R101_K600.yaml 2 | 3 | DATA: 4 | NUM_FRAMES: 304 5 | 6 | PGT: 7 | ENABLE: True 8 | STEP_LEN: [16, 64] 9 | STEPS: 5 10 | OVERLAP: [1, 1] 11 | CACHE: max 12 | CACHE_MOMENTUM: 0.25 13 | PG_EVAL: True 14 | TRAIN_TOGETHER: True 15 | ENSEMBLE_METHOD: max 16 | 17 | SOLVER: 18 | BASE_LR: 0.0125 19 | STEPS: [0, 20] 20 | MAX_EPOCH: 25 21 | WARMUP_EPOCHS: 4.0 22 | 23 | TEST: 24 | BATCH_SIZE: 16 25 | NUM_ENSEMBLE_VIEWS: 2 26 | NUM_SPATIAL_CROPS: 3 27 | 28 | LOGS: 29 | DIR: logs/Charades/SLOWFAST_PROG_76x8_R101_K600/ -------------------------------------------------------------------------------- /configs/Charades/SLOW_PROG_76x8_R50_K400.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOW_16x8_R50_K400.yaml 2 | 3 | DATA: 4 | NUM_FRAMES: 76 5 | 6 | PGT: 7 | ENABLE: True 8 | STEP_LEN: [16] 9 | STEPS: 5 10 | OVERLAP: [1] 11 | CACHE: max 12 | CACHE_MOMENTUM: 0.25 13 | PG_EVAL: True 14 | TRAIN_TOGETHER: True 15 | ENSEMBLE_METHOD: max 16 | 17 | SOLVER: 18 | BASE_LR: 0.025 19 | STEPS: [0, 20] 20 | MAX_EPOCH: 20 21 | WARMUP_EPOCHS: 1.0 22 | 23 | TEST: 24 | BATCH_SIZE: 16 25 | NUM_ENSEMBLE_VIEWS: 2 26 | NUM_SPATIAL_CROPS: 3 27 | 28 | LOGS: 29 | DIR: logs/Charades/SLOW_PROG_76x8_R50_K400/ 30 | 31 | NUM_GPUS: 4 32 | NUM_SHARDS: 1 33 | RNG_SEED: 0 34 | -------------------------------------------------------------------------------- /slowfast/models/losses.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Loss functions.""" 5 | 6 | import torch.nn as nn 7 | 8 | _LOSSES = { 9 | "cross_entropy": nn.CrossEntropyLoss, 10 | "bce": nn.BCELoss, 11 | "bce_logit": nn.BCEWithLogitsLoss, 12 | } 13 | 14 | 15 | def get_loss_func(loss_name): 16 | """ 17 | Retrieve the loss given the loss name. 18 | Args: 19 | loss_name (str): the name of the loss to use.
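    Returns:
        The loss class itself (not an instance); call it to construct the
        loss module.
    Example (illustrative):
        loss_fun = get_loss_func("bce_logit")(reduction="mean")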
20 | """ 21 | if loss_name not in _LOSSES.keys(): 22 | raise NotImplementedError("Loss {} is not supported".format(loss_name)) 23 | return _LOSSES[loss_name] 24 | -------------------------------------------------------------------------------- /configs/Charades/SLOW_PROG_76x8_R101_K400.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: SLOW_16x8_R101_K400.yaml 2 | 3 | TRAIN: 4 | BATCH_SIZE: 32 5 | 6 | DATA: 7 | NUM_FRAMES: 76 8 | 9 | PGT: 10 | ENABLE: True 11 | STEP_LEN: [16] 12 | STEPS: 5 13 | OVERLAP: [1] 14 | CACHE: max 15 | CACHE_MOMENTUM: 0.25 16 | PG_EVAL: True 17 | TRAIN_TOGETHER: True 18 | ENSEMBLE_METHOD: max 19 | 20 | SOLVER: 21 | BASE_LR: 0.025 22 | STEPS: [0, 20] 23 | MAX_EPOCH: 20 24 | WARMUP_EPOCHS: 1.0 25 | 26 | TEST: 27 | BATCH_SIZE: 32 28 | NUM_ENSEMBLE_VIEWS: 2 29 | NUM_SPATIAL_CROPS: 3 30 | 31 | LOGS: 32 | DIR: logs/Charades/SLOW_PROG_76x8_R101_K400/ 33 | 34 | NUM_GPUS: 4 35 | NUM_SHARDS: 1 36 | RNG_SEED: 0 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PGT 2 | 3 | Code for paper [PGT: A Progressive Method for Training Models on Long Videos](https://arxiv.org/abs/2103.11313). 4 | 5 | ## Install 6 | 7 | 1. Run `pip install -r requirements.txt`. 8 | 2. Run `python setup.py build develop` to compile RoIAlign python wrapper. 9 | 10 | ## Model zoo 11 | 12 | Please refer to [MODEL_ZOO.md](./MODEL_ZOO.md) 13 | 14 | ## Acknowledgement 15 | 16 | This repository is built on [SlowFast](https://github.com/facebookresearch/SlowFast). 17 | 18 | ## Citing PGT 19 | 20 | ``` 21 | @article{pang2021pgt, 22 | title={PGT: A Progressive Method for Training Models on Long Videos}, 23 | author={Pang, Bo and Peng, Gao and Li, Yizhuo and Lu, Cewu}, 24 | journal={arXiv preprint arXiv:2103.11313}, 25 | year={2021} 26 | } 27 | ``` -------------------------------------------------------------------------------- /MODEL_ZOO.md: -------------------------------------------------------------------------------- 1 | # Model zoo 2 | 3 | ## Kinetics 4 | 5 | | Method | Backbone | Pretrain | Config | top-1 | top-5 | Checkpoint | Log | 6 | | --- | --- | --- | --- | --- | --- | --- | --- | 7 | | Slow 36x8 + PGT | R50 | from scratch | Kinetics/SLOW_PROG_36x8_R50.yaml | 75.6 | 92.3 | | 8 | | Slow 36x8 + PGT | R101 | from scratch | Kinetics/SLOW_PROG_36x8_R101_50.yaml | 76.9 | 92.8 | | 9 | | SlowFast 36x8 + PGT | R50 | from scratch | Kinetics/SLOWFAST_PROG_76x8_R50.yaml | 76.6 | 92.5 | | 10 | 11 | ## Charades 12 | 13 | | Method | Backbone | Pretrain | Config | mAP | Checkpoint | Log | 14 | | --- | --- | --- | --- | --- | --- | --- | 15 | | Slow 76x8 + PGT | R50 | K400 | Charades/SLOW_16x8_R50_K400.yaml | 40.2 | | 16 | | SlowFast + PGT 76x8 | R50 | K400 | Charades/SLOWFAST_PROG_76x8_R50_K400.yaml | 43.8 | | | 17 | | Slow + PGT 76x8 | R101 | K400 | Charades/SLOW_PROG_76x8_R101_K400.yaml | 42.7 | | 18 | | SlowFast + PGT 76x8 | R101 | K400 | Charades/SLOWFAST_PROG_76x8_R101_K400.yaml | 44.3 | | | -------------------------------------------------------------------------------- /slowfast/csrc/cuda/vision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, 5 | const at::Tensor& rois, 6 | const float spatial_scale, 7 | const int pooled_height, 8 | const int pooled_width, 9 | const int sampling_ratio); 10 | 11 | 
at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width, 16 | const int batch_size, 17 | const int channels, 18 | const int height, 19 | const int width, 20 | const int sampling_ratio); 21 | -------------------------------------------------------------------------------- /tools/run_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Wrapper to train and test a video classification model.""" 5 | from slowfast.utils.misc import launch_job 6 | from slowfast.utils.parser import load_config, parse_args 7 | 8 | from test_net import test 9 | from train_net import train 10 | 11 | 12 | def main(): 13 | """ 14 | Main function to spawn the train and test process. 15 | """ 16 | args, opts = parse_args() 17 | cfg = load_config(args, opts) 18 | 19 | # Perform training. 20 | if cfg.TRAIN.ENABLE: 21 | launch_job(cfg=cfg, init_method=args.init_method, func=train) 22 | 23 | # Perform multi-clip testing. 24 | if cfg.TEST.ENABLE: 25 | launch_job(cfg=cfg, init_method=args.init_method, func=test) 26 | 27 | 28 | if __name__ == "__main__": 29 | main() 30 | -------------------------------------------------------------------------------- /slowfast/datasets/build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from fvcore.common.registry import Registry 5 | 6 | DATASET_REGISTRY = Registry("DATASET") 7 | DATASET_REGISTRY.__doc__ = """ 8 | Registry for dataset. 9 | 10 | The registered object will be called with `obj(cfg, split)`. 11 | The call should return a `torch.utils.data.Dataset` object. 12 | """ 13 | 14 | 15 | def build_dataset(dataset_name, cfg, split): 16 | """ 17 | Build a dataset, defined by `dataset_name`. 18 | Args: 19 | dataset_name (str): the name of the dataset to be constructed. 20 | cfg (CfgNode): configs. Details can be found in 21 | slowfast/config/defaults.py 22 | split (str): the split of the data loader. Options include `train`, 23 | `val`, and `test`. 24 | Returns: 25 | Dataset: a constructed dataset specified by dataset_name. 26 | """ 27 | # Capitalize the first letter of the dataset_name since the dataset_name 28 | # in configs may be in lowercase but the name of dataset class should always 29 | # start with an uppercase letter. 30 | name = dataset_name.capitalize() 31 | return DATASET_REGISTRY.get(name)(cfg, split) 32 | -------------------------------------------------------------------------------- /slowfast/datasets/video_container.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | import av 5 | from slowfast.utils.setup_moxing_env import wrap_input_path2 6 | 7 | 8 | def get_video_container(path_to_vid, multi_thread_decode=False, backend="pyav"): 9 | """ 10 | Given the path to the video, return the pyav video container. 11 | Args: 12 | path_to_vid (str): path to the video.
13 | multi_thread_decode (bool): if True, perform multi-thread decoding. 14 | backend (str): decoder backend, options include `pyav` and 15 | `torchvision`, default is `pyav`. 16 | Returns: 17 | container (container): video container. 18 | """ 19 | path_to_vid = wrap_input_path2(path_to_vid) 20 | if backend == "torchvision": 21 | with open(path_to_vid, "rb") as fp: 22 | container = fp.read() 23 | return container 24 | elif backend == "pyav": 25 | container = av.open(path_to_vid) 26 | if multi_thread_decode: 27 | # Enable multiple threads for decoding. 28 | container.streams.video[0].thread_type = "AUTO" 29 | return container 30 | else: 31 | raise NotImplementedError("Unknown backend {}".format(backend)) 32 | -------------------------------------------------------------------------------- /tools/eval_ava.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | 5 | import slowfast.utils.logging as logging 6 | from slowfast.utils.parser import load_config 7 | from slowfast.utils.ava_eval_helper import evaluate_ava_from_files 8 | 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser(description="AVA evaluator.") 12 | parser.add_argument( 13 | "--cfg", 14 | dest="cfg_file", 15 | help="Path to the config file", 16 | default="configs/Kinetics/SLOWFAST_4x16_R50.yaml", 17 | type=str, 18 | ) 19 | parser.add_argument( 20 | "opts", 21 | help="See slowfast/config/defaults.py for all options", 22 | default=None, 23 | nargs=argparse.REMAINDER, 24 | ) 25 | if len(sys.argv) == 1: 26 | parser.print_help() 27 | return parser.parse_args() 28 | 29 | 30 | def main(): 31 | args = parse_args() 32 | cfg = load_config(args) 33 | 34 | logging.setup_logger(cfg, 'test') 35 | evaluate_ava_from_files( 36 | os.path.join(cfg.AVA.ANNOTATION_DIR, cfg.AVA.LABEL_MAP_FILE), 37 | os.path.join(cfg.AVA.ANNOTATION_DIR, cfg.AVA.GROUNDTRUTH_FILE), 38 | os.path.join(cfg.LOGS.DIR, "detections_latest.csv"), 39 | os.path.join(cfg.AVA.ANNOTATION_DIR, cfg.AVA.EXCLUSION_FILE) 40 | ) 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOW_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 1 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | 9 | DATA: 10 | NUM_FRAMES: 8 11 | SAMPLING_RATE: 8 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 256 15 | INPUT_CHANNEL_NUM: [3] 16 | PATH_TO_DATA_DIR: /home/pg/data/Kinetics_400/ 17 | PATH_PREFIX: /home/pg/data/Kinetics_400/ 18 | 19 | RESNET: 20 | ZERO_INIT_FINAL_BN: True 21 | WIDTH_PER_GROUP: 64 22 | NUM_GROUPS: 1 23 | DEPTH: 50 24 | TRANS_FUNC: bottleneck_transform 25 | STRIDE_1X1: False 26 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 27 | 28 | NONLOCAL: 29 | LOCATION: [[[]], [[]], [[]], [[]]] 30 | GROUP: [[1], [1], [1], [1]] 31 | INSTANTIATION: dot_product 32 | 33 | BN: 34 | USE_PRECISE_STATS: False 35 | NUM_BATCHES_PRECISE: 200 36 | 37 | SOLVER: 38 | BASE_LR: 0.1 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 196 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 1e-4 43 | WARMUP_EPOCHS: 34.0 44 | WARMUP_START_LR: 0.01 45 | OPTIMIZING_METHOD: sgd 46 | 47 | MODEL: 48 | NUM_CLASSES: 400 49 | ARCH: slow 50 | MODEL_NAME: ResNet 51 | LOSS_FUNC: cross_entropy 52 | DROPOUT_RATE: 0.5 53 | 54 | TEST: 55 | ENABLE: True 56 | DATASET: kinetics 57 | BATCH_SIZE: 64 58 | 59 
| DATA_LOADER: 60 | NUM_WORKERS: 8 61 | PIN_MEMORY: True 62 | 63 | LOGS: 64 | DIR: logs/Kinetics/SLOW_8x8_R50/ 65 | 66 | NUM_GPUS: 8 67 | NUM_SHARDS: 1 68 | RNG_SEED: 0 69 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOW_8x8_R101.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | 9 | DATA: 10 | NUM_FRAMES: 8 11 | SAMPLING_RATE: 8 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 256 15 | INPUT_CHANNEL_NUM: [3] 16 | PATH_TO_DATA_DIR: /home/pg/data/Kinetics_400/ 17 | PATH_PREFIX: /home/pg/data/Kinetics_400/ 18 | 19 | RESNET: 20 | ZERO_INIT_FINAL_BN: True 21 | WIDTH_PER_GROUP: 64 22 | NUM_GROUPS: 1 23 | DEPTH: 101 24 | TRANS_FUNC: bottleneck_transform 25 | STRIDE_1X1: False 26 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [23], [3]] 27 | 28 | NONLOCAL: 29 | LOCATION: [[[]], [[]], [[]], [[]]] 30 | GROUP: [[1], [1], [1], [1]] 31 | INSTANTIATION: dot_product 32 | 33 | BN: 34 | USE_PRECISE_STATS: False 35 | NUM_BATCHES_PRECISE: 200 36 | 37 | SOLVER: 38 | BASE_LR: 0.1 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 196 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 1e-4 43 | WARMUP_EPOCHS: 34.0 44 | WARMUP_START_LR: 0.01 45 | OPTIMIZING_METHOD: sgd 46 | 47 | MODEL: 48 | NUM_CLASSES: 400 49 | ARCH: slow 50 | MODEL_NAME: ResNet 51 | LOSS_FUNC: cross_entropy 52 | DROPOUT_RATE: 0.5 53 | 54 | TEST: 55 | ENABLE: True 56 | DATASET: kinetics 57 | BATCH_SIZE: 64 58 | 59 | DATA_LOADER: 60 | NUM_WORKERS: 8 61 | PIN_MEMORY: True 62 | 63 | LOGS: 64 | DIR: logs/Kinetics/SLOW_8x8_R101/ 65 | 66 | NUM_GPUS: 8 67 | NUM_SHARDS: 1 68 | RNG_SEED: 0 69 | -------------------------------------------------------------------------------- /slowfast/models/build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Model construction functions.""" 5 | 6 | import torch 7 | from fvcore.common.registry import Registry 8 | 9 | MODEL_REGISTRY = Registry("MODEL") 10 | MODEL_REGISTRY.__doc__ = """ 11 | Registry for video model. 12 | 13 | The registered object will be called with `obj(cfg)`. 14 | The call should return a `torch.nn.Module` object. 15 | """ 16 | 17 | 18 | def build_model(cfg): 19 | """ 20 | Builds the video model. 21 | Args: 22 | cfg (configs): configs that contains the hyper-parameters to build the 23 | backbone. Details can be seen in slowfast/config/defaults.py. 
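        Example (illustrative; the model name must be registered in
        MODEL_REGISTRY, as ResNet and SlowFast are via
        slowfast/models/__init__.py):
            cfg.MODEL.MODEL_NAME = "ResNet"
            model = build_model(cfg)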
24 | """ 25 | assert ( 26 | cfg.NUM_GPUS <= torch.cuda.device_count() 27 | ), "Cannot use more GPU devices than available" 28 | 29 | # Construct the model 30 | name = cfg.MODEL.MODEL_NAME 31 | model = MODEL_REGISTRY.get(name)(cfg) 32 | # Determine the GPU used by the current process 33 | cur_device = torch.cuda.current_device() 34 | # Transfer the model to the current GPU device 35 | model = model.cuda(device=cur_device) 36 | # Use multi-process data parallel model in the multi-gpu setting 37 | if cfg.NUM_GPUS > 1: 38 | # Make model replica operate on the current device 39 | model = torch.nn.parallel.DistributedDataParallel( 40 | module=model, device_ids=[cur_device], output_device=cur_device 41 | ) 42 | return model 43 | -------------------------------------------------------------------------------- /slowfast/utils/weight_init_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Utility function for weight initialization""" 5 | 6 | import torch.nn as nn 7 | from fvcore.nn.weight_init import c2_msra_fill 8 | 9 | 10 | def init_weights(model, fc_init_std=0.01, zero_init_final_bn=True): 11 | """ 12 | Performs ResNet style weight initialization. 13 | Args: 14 | fc_init_std (float): the expected standard deviation for fc layer. 15 | zero_init_final_bn (bool): if True, zero initialize the final bn for 16 | every bottleneck. 17 | """ 18 | for m in model.modules(): 19 | if isinstance(m, nn.Conv3d): 20 | """ 21 | Follow the initialization method proposed in: 22 | {He, Kaiming, et al. 23 | "Delving deep into rectifiers: Surpassing human-level 24 | performance on imagenet classification." 25 | arXiv preprint arXiv:1502.01852 (2015)} 26 | """ 27 | c2_msra_fill(m) 28 | elif isinstance(m, nn.BatchNorm3d): 29 | if ( 30 | hasattr(m, "transform_final_bn") 31 | and m.transform_final_bn 32 | and zero_init_final_bn 33 | ): 34 | batchnorm_weight = 0.0 35 | else: 36 | batchnorm_weight = 1.0 37 | if m.weight is not None: 38 | m.weight.data.fill_(batchnorm_weight) 39 | if m.bias is not None: 40 | m.bias.data.zero_() 41 | if isinstance(m, nn.Linear): 42 | m.weight.data.normal_(mean=0.0, std=fc_init_std) 43 | m.bias.data.zero_() 44 | -------------------------------------------------------------------------------- /slowfast/csrc/ROIAlign.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | // Interface for Python 10 | at::Tensor ROIAlign_forward(const at::Tensor& input, 11 | const at::Tensor& rois, 12 | const float spatial_scale, 13 | const int pooled_height, 14 | const int pooled_width, 15 | const int sampling_ratio) { 16 | if (input.type().is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 24 | } 25 | 26 | at::Tensor ROIAlign_backward(const at::Tensor& grad, 27 | const at::Tensor& rois, 28 | const float spatial_scale, 29 | const int pooled_height, 30 | const int pooled_width, 31 | const int batch_size, 32 | const int channels, 33 | const int height, 34 | const int width, 35 | const int sampling_ratio) { 36 | if (grad.type().is_cuda()) { 37 | 
#ifdef WITH_CUDA 38 | return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); 39 | #else 40 | AT_ERROR("Not compiled with GPU support"); 41 | #endif 42 | } 43 | AT_ERROR("Not implemented on the CPU"); 44 | } 45 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOWFAST_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | 9 | DATA: 10 | NUM_FRAMES: 32 11 | SAMPLING_RATE: 2 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 256 15 | INPUT_CHANNEL_NUM: [3, 3] 16 | PATH_TO_DATA_DIR: /home/pg/data/Kinetics_400/ 17 | PATH_PREFIX: /home/pg/data/Kinetics_400/ 18 | 19 | SLOWFAST: 20 | ALPHA: 4 21 | BETA_INV: 8 22 | FUSION_CONV_CHANNEL_RATIO: 2 23 | FUSION_KERNEL_SZ: 5 24 | 25 | RESNET: 26 | ZERO_INIT_FINAL_BN: True 27 | WIDTH_PER_GROUP: 64 28 | NUM_GROUPS: 1 29 | DEPTH: 50 30 | TRANS_FUNC: bottleneck_transform 31 | STRIDE_1X1: False 32 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 33 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 34 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 35 | 36 | NONLOCAL: 37 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 38 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 39 | INSTANTIATION: dot_product 40 | 41 | BN: 42 | USE_PRECISE_STATS: False 43 | NUM_BATCHES_PRECISE: 200 44 | 45 | SOLVER: 46 | BASE_LR: 0.1 47 | LR_POLICY: cosine 48 | MAX_EPOCH: 196 49 | MOMENTUM: 0.9 50 | WEIGHT_DECAY: 1e-4 51 | WARMUP_EPOCHS: 34.0 52 | WARMUP_START_LR: 0.01 53 | OPTIMIZING_METHOD: sgd 54 | 55 | MODEL: 56 | NUM_CLASSES: 400 57 | ARCH: slowfast 58 | MODEL_NAME: SlowFast 59 | LOSS_FUNC: cross_entropy 60 | DROPOUT_RATE: 0.5 61 | 62 | TEST: 63 | ENABLE: True 64 | DATASET: kinetics 65 | BATCH_SIZE: 64 66 | 67 | DATA_LOADER: 68 | NUM_WORKERS: 8 69 | PIN_MEMORY: True 70 | 71 | LOGS: 72 | DIR: logs/Kinetics/SLOWFAST_8x8_R50/ 73 | 74 | NUM_GPUS: 8 75 | NUM_SHARDS: 1 76 | RNG_SEED: 0 77 | 78 | -------------------------------------------------------------------------------- /configs/Charades/SLOW_16x8_R50_K400.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: charades 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 1 6 | FULL_TIME_EVAL: True 7 | CHECKPOINT_PERIOD: 1 8 | AUTO_RESUME: True 9 | CHECKPOINT_TYPE: caffe2 10 | CHECKPOINT_FILE_PATH: /home/pg/data/models/k400/SLOW_8x8_R50.pkl 11 | 12 | DATA: 13 | NUM_FRAMES: 16 14 | SAMPLING_RATE: 8 15 | TRAIN_JITTER_SCALES: [256, 320] 16 | TRAIN_CROP_SIZE: 224 17 | TEST_CROP_SIZE: 256 18 | INPUT_CHANNEL_NUM: [3] 19 | MULTI_LABEL: True 20 | INV_UNIFORM_SAMPLE: True 21 | ENSEMBLE_METHOD: max 22 | REVERSE_INPUT_CHANNEL: True 23 | PATH_TO_DATA_DIR: /home/pg/data/Charades/ 24 | PATH_PREFIX: /home/pg/data/Charades/Charades_v1_rgb 25 | 26 | RESNET: 27 | ZERO_INIT_FINAL_BN: True 28 | WIDTH_PER_GROUP: 64 29 | NUM_GROUPS: 1 30 | DEPTH: 50 31 | TRANS_FUNC: bottleneck_transform 32 | STRIDE_1X1: False 33 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 34 | 35 | NONLOCAL: 36 | LOCATION: [[[]], [[]], [[]], [[]]] 37 | 38 | BN: 39 | USE_PRECISE_STATS: False 40 | WEIGHT_DECAY: 0.0 41 | NORM_TYPE: frozen_batchnorm 42 | 43 | SOLVER: 44 | BASE_LR: 0.075 45 | LR_POLICY: steps_with_relative_lrs 46 | LRS: [1, 0.1, 0.01] 47 | STEPS: [0, 20, 30] 48 
| MAX_EPOCH: 35 49 | MOMENTUM: 0.9 50 | WEIGHT_DECAY: 1e-4 51 | WARMUP_EPOCHS: 4.0 52 | WARMUP_START_LR: 0.0002 53 | OPTIMIZING_METHOD: sgd 54 | 55 | MODEL: 56 | NUM_CLASSES: 157 57 | ARCH: slow 58 | MODEL_NAME: ResNet 59 | LOSS_FUNC: bce_logit 60 | HEAD_ACT: sigmoid 61 | DROPOUT_RATE: 0.5 62 | FINAL_POOL: ["avg", "max"] 63 | 64 | TEST: 65 | ENABLE: True 66 | DATASET: charades 67 | BATCH_SIZE: 32 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | 71 | DATA_LOADER: 72 | NUM_WORKERS: 4 73 | PIN_MEMORY: True 74 | 75 | LOGS: 76 | DIR: logs/Charades/SLOW_16x8_R50_K400/ 77 | 78 | NUM_GPUS: 2 79 | NUM_SHARDS: 1 80 | RNG_SEED: 0 81 | -------------------------------------------------------------------------------- /configs/Charades/SLOW_16x8_R101_K400.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: charades 4 | BATCH_SIZE: 16 5 | EVAL_PERIOD: 1 6 | FULL_TIME_EVAL: True 7 | CHECKPOINT_PERIOD: 1 8 | AUTO_RESUME: True 9 | CHECKPOINT_TYPE: pytorch 10 | CHECKPOINT_FILE_PATH: /home/pg/data/models/k400/SLOWFAST_8x8_R101.pyth 11 | 12 | DATA: 13 | NUM_FRAMES: 16 14 | SAMPLING_RATE: 8 15 | TRAIN_JITTER_SCALES: [256, 320] 16 | TRAIN_CROP_SIZE: 224 17 | TEST_CROP_SIZE: 256 18 | INPUT_CHANNEL_NUM: [3] 19 | MULTI_LABEL: True 20 | INV_UNIFORM_SAMPLE: True 21 | ENSEMBLE_METHOD: max 22 | REVERSE_INPUT_CHANNEL: True 23 | PATH_TO_DATA_DIR: /home/pg/data/Charades/ 24 | PATH_PREFIX: /home/pg/data/Charades/Charades_v1_rgb 25 | 26 | RESNET: 27 | ZERO_INIT_FINAL_BN: True 28 | WIDTH_PER_GROUP: 64 29 | NUM_GROUPS: 1 30 | DEPTH: 101 31 | TRANS_FUNC: bottleneck_transform 32 | STRIDE_1X1: False 33 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [23], [3]] 34 | 35 | NONLOCAL: 36 | LOCATION: [[[]], [[]], [[]], [[]]] 37 | 38 | BN: 39 | USE_PRECISE_STATS: False 40 | WEIGHT_DECAY: 0.0 41 | NORM_TYPE: frozen_batchnorm 42 | 43 | SOLVER: 44 | BASE_LR: 0.0375 45 | LR_POLICY: steps_with_relative_lrs 46 | LRS: [1, 0.1, 0.01] 47 | STEPS: [0, 20, 30] 48 | MAX_EPOCH: 35 49 | MOMENTUM: 0.9 50 | WEIGHT_DECAY: 1e-4 51 | WARMUP_EPOCHS: 4.0 52 | WARMUP_START_LR: 0.0001 53 | OPTIMIZING_METHOD: sgd 54 | 55 | MODEL: 56 | NUM_CLASSES: 157 57 | ARCH: slow 58 | MODEL_NAME: ResNet 59 | LOSS_FUNC: bce_logit 60 | HEAD_ACT: sigmoid 61 | DROPOUT_RATE: 0.5 62 | FINAL_POOL: ["avg", "max"] 63 | 64 | TEST: 65 | ENABLE: True 66 | DATASET: charades 67 | BATCH_SIZE: 16 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | 71 | DATA_LOADER: 72 | NUM_WORKERS: 4 73 | PIN_MEMORY: True 74 | 75 | LOGS: 76 | DIR: logs/Charades/SLOW_16x8_R101_K400/ 77 | 78 | NUM_GPUS: 2 79 | NUM_SHARDS: 1 80 | RNG_SEED: 0 81 | -------------------------------------------------------------------------------- /tools/visualize_log.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from tensorboardX import SummaryWriter 4 | 5 | from slowfast.utils.parser import load_config, parse_args 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--log_dir", type=str, required=True) 11 | return parser.parse_args() 12 | 13 | 14 | def main(): 15 | args = parse_args() 16 | 17 | filename = os.path.join(args.log_dir, 'train.log') 18 | tblogger = SummaryWriter(log_dir=args.log_dir) 19 | 20 | with open(filename) as f: 21 | log = f.readlines() 22 | 23 | for l in log: 24 | if 'train_iter' in l: 25 | l = l.split(']: ')[1].strip() 26 | cur_epoch = int(l.split('epoch: ')[1].split('/')[0]) 27 | cur_iter = int(l.split('iter: 
')[1].split('/')[0]) 28 | epoch_iters = int(l.split('iter: ')[1].split('/')[1].split(';')[0]) 29 | iters = cur_iter + 1 + epoch_iters * cur_epoch 30 | for kv in l.split('; '): 31 | k, v = kv.split(': ') 32 | if 'err' in k or 'loss' in k: 33 | tblogger.add_scalar('train/{}'.format(k), float(v), iters) 34 | elif k == 'epoch': 35 | tblogger.add_scalar('other/epoch', cur_epoch + 1, iters) 36 | elif k == 'lr': 37 | tblogger.add_scalar('other/lr', float(v), iters) 38 | else: 39 | continue 40 | elif 'val_epoch' in l: 41 | l = l.split(']: ')[1].strip() 42 | cur_epoch = int(l.split('epoch: ')[1].split('/')[0]) 43 | for kv in l.split('; '): 44 | k, v = kv.split(': ') 45 | if 'err' in k or 'map' in k: 46 | tblogger.add_scalar( 47 | 'val/{}'.format(k), float(v), cur_epoch + 1) 48 | else: 49 | continue 50 | 51 | 52 | if __name__ == '__main__': 53 | main() 54 | -------------------------------------------------------------------------------- /slowfast/utils/multiprocessing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Multiprocessing helpers.""" 5 | 6 | import torch 7 | 8 | 9 | def run( 10 | local_rank, num_proc, func, init_method, shard_id, num_shards, backend, cfg 11 | ): 12 | """ 13 | Runs a function from a child process. 14 | Args: 15 | local_rank (int): rank of the current process on the current machine. 16 | num_proc (int): number of processes per machine. 17 | func (function): function to execute on each of the processes. 18 | init_method (string): method to initialize the distributed training. 19 | TCP initialization: requiring a network address reachable from all 20 | processes followed by the port. 21 | Shared file-system initialization: makes use of a file system that 22 | is shared and visible from all machines. The URL should start with 23 | file:// and contain a path to a non-existent file on a shared file 24 | system. 25 | shard_id (int): the rank of the current machine. 26 | num_shards (int): number of overall machines for the distributed 27 | training job. 28 | backend (string): three distributed backends ('nccl', 'gloo', 'mpi') are 29 | supported, each with different capabilities. Details can be found 30 | here: 31 | https://pytorch.org/docs/stable/distributed.html 32 | cfg (CfgNode): configs. Details can be found in 33 | slowfast/config/defaults.py 34 | """ 35 | # Initialize the process group.
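    # The global world size is num_proc (processes per machine) times
    # num_shards (machines); the global rank interleaves the machine index
    # (shard_id) with this process's local rank, matching the lines below.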
36 | world_size = num_proc * num_shards 37 | rank = shard_id * num_proc + local_rank 38 | 39 | try: 40 | torch.distributed.init_process_group( 41 | backend=backend, 42 | init_method=init_method, 43 | world_size=world_size, 44 | rank=rank, 45 | ) 46 | except Exception as e: 47 | raise e 48 | 49 | torch.cuda.set_device(local_rank) 50 | func(cfg) 51 | -------------------------------------------------------------------------------- /configs/Charades/SLOWFAST_16x8_R50_K400.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: charades 4 | BATCH_SIZE: 16 5 | EVAL_PERIOD: 1 6 | FULL_TIME_EVAL: True 7 | CHECKPOINT_PERIOD: 1 8 | AUTO_RESUME: True 9 | CHECKPOINT_TYPE: caffe2 10 | CHECKPOINT_FILE_PATH: /home/pg/data/models/k400/SLOWFAST_8x8_R50.pkl 11 | 12 | DATA: 13 | NUM_FRAMES: 64 14 | SAMPLING_RATE: 2 15 | TRAIN_JITTER_SCALES: [256, 320] 16 | TRAIN_CROP_SIZE: 224 17 | TEST_CROP_SIZE: 256 18 | INPUT_CHANNEL_NUM: [3, 3] 19 | MULTI_LABEL: True 20 | INV_UNIFORM_SAMPLE: True 21 | ENSEMBLE_METHOD: max 22 | REVERSE_INPUT_CHANNEL: True 23 | PATH_TO_DATA_DIR: /home/pg/data/Charades/ 24 | PATH_PREFIX: /home/pg/data/Charades/Charades_v1_rgb 25 | 26 | SLOWFAST: 27 | ALPHA: 4 28 | BETA_INV: 8 29 | FUSION_CONV_CHANNEL_RATIO: 2 30 | FUSION_KERNEL_SZ: 7 31 | 32 | RESNET: 33 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 34 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 35 | ZERO_INIT_FINAL_BN: True 36 | WIDTH_PER_GROUP: 64 37 | NUM_GROUPS: 1 38 | DEPTH: 50 39 | TRANS_FUNC: bottleneck_transform 40 | STRIDE_1X1: False 41 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 42 | 43 | NONLOCAL: 44 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 45 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 46 | 47 | BN: 48 | USE_PRECISE_STATS: False 49 | NORM_TYPE: frozen_batchnorm 50 | 51 | SOLVER: 52 | BASE_LR: 0.0375 53 | LR_POLICY: steps_with_relative_lrs 54 | LRS: [1, 0.1, 0.01, 0.001, 0.0001, 0.00001] 55 | STEPS: [0, 23, 39] 56 | MAX_EPOCH: 40 57 | MOMENTUM: 0.9 58 | WEIGHT_DECAY: 1e-4 59 | WARMUP_EPOCHS: 4.0 60 | WARMUP_START_LR: 0.0001 61 | OPTIMIZING_METHOD: sgd 62 | 63 | MODEL: 64 | NUM_CLASSES: 157 65 | ARCH: slowfast 66 | LOSS_FUNC: bce_logit 67 | HEAD_ACT: sigmoid 68 | DROPOUT_RATE: 0.5 69 | FINAL_POOL: ["avg", "max"] 70 | 71 | TEST: 72 | ENABLE: True 73 | DATASET: charades 74 | BATCH_SIZE: 16 75 | NUM_ENSEMBLE_VIEWS: 10 76 | NUM_SPATIAL_CROPS: 3 77 | 78 | DATA_LOADER: 79 | NUM_WORKERS: 8 80 | PIN_MEMORY: True 81 | 82 | LOGS: 83 | DIR: logs/Charades/SLOWFAST_16x8_R50_K400/ 84 | 85 | NUM_GPUS: 8 86 | NUM_SHARDS: 1 87 | RNG_SEED: 0 88 | -------------------------------------------------------------------------------- /slowfast/models/detection_helper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.autograd import Function 4 | from torch.autograd.function import once_differentiable 5 | from torch.nn.modules.utils import _pair 6 | 7 | from slowfast import _C 8 | 9 | 10 | class _ROIAlign(Function): 11 | @staticmethod 12 | def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): 13 | ctx.save_for_backward(roi) 14 | ctx.output_size = _pair(output_size) 15 | ctx.spatial_scale = spatial_scale 16 | ctx.sampling_ratio = sampling_ratio 17 | ctx.input_shape = input.size() 18 | output = _C.roi_align_forward( 19 | input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio 20 | ) 21 | return output 22 | 23 | 
@staticmethod 24 | @once_differentiable 25 | def backward(ctx, grad_output): 26 | rois, = ctx.saved_tensors 27 | output_size = ctx.output_size 28 | spatial_scale = ctx.spatial_scale 29 | sampling_ratio = ctx.sampling_ratio 30 | bs, ch, h, w = ctx.input_shape 31 | grad_input = _C.roi_align_backward( 32 | grad_output, 33 | rois, 34 | spatial_scale, 35 | output_size[0], 36 | output_size[1], 37 | bs, 38 | ch, 39 | h, 40 | w, 41 | sampling_ratio, 42 | ) 43 | return grad_input, None, None, None, None 44 | 45 | 46 | roi_align = _ROIAlign.apply 47 | 48 | 49 | class ROIAlign(nn.Module): 50 | def __init__(self, output_size, spatial_scale, sampling_ratio): 51 | super(ROIAlign, self).__init__() 52 | self.output_size = output_size 53 | self.spatial_scale = spatial_scale 54 | self.sampling_ratio = sampling_ratio 55 | 56 | def forward(self, input, rois): 57 | return roi_align( 58 | input, rois, self.output_size, self.spatial_scale, self.sampling_ratio 59 | ) 60 | 61 | def __repr__(self): 62 | tmpstr = self.__class__.__name__ + "(" 63 | tmpstr += "output_size=" + str(self.output_size) 64 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 65 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 66 | tmpstr += ")" 67 | return tmpstr 68 | -------------------------------------------------------------------------------- /configs/Charades/SLOWFAST_16x8_R101_K400.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: charades 4 | BATCH_SIZE: 16 5 | EVAL_PERIOD: 1 6 | FULL_TIME_EVAL: True 7 | CHECKPOINT_PERIOD: 1 8 | AUTO_RESUME: True 9 | CHECKPOINT_TYPE: pytorch 10 | CHECKPOINT_FILE_PATH: /home/pg/projects/action/progress-action/logs/Kinetics/MOX2_SLOWFAST_8x8_R101.77.22/checkpoints/checkpoint_epoch_00196.pyth 11 | 12 | DATA: 13 | NUM_FRAMES: 64 14 | SAMPLING_RATE: 2 15 | TRAIN_JITTER_SCALES: [256, 320] 16 | TRAIN_CROP_SIZE: 224 17 | TEST_CROP_SIZE: 256 18 | INPUT_CHANNEL_NUM: [3, 3] 19 | MULTI_LABEL: True 20 | INV_UNIFORM_SAMPLE: True 21 | ENSEMBLE_METHOD: max 22 | REVERSE_INPUT_CHANNEL: True 23 | PATH_TO_DATA_DIR: /home/pg/data/Charades/ 24 | PATH_PREFIX: /home/pg/data/Charades/Charades_v1_rgb 25 | 26 | SLOWFAST: 27 | ALPHA: 4 28 | BETA_INV: 8 29 | FUSION_CONV_CHANNEL_RATIO: 2 30 | FUSION_KERNEL_SZ: 5 31 | 32 | RESNET: 33 | ZERO_INIT_FINAL_BN: True 34 | WIDTH_PER_GROUP: 64 35 | NUM_GROUPS: 1 36 | DEPTH: 101 37 | TRANS_FUNC: bottleneck_transform 38 | STRIDE_1X1: False 39 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [23, 23], [3, 3]] 40 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 41 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 42 | 43 | NONLOCAL: 44 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 45 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 46 | INSTANTIATION: dot_product 47 | 48 | BN: 49 | USE_PRECISE_STATS: False 50 | WEIGHT_DECAY: 0.0 51 | NORM_TYPE: frozen_batchnorm 52 | 53 | SOLVER: 54 | BASE_LR: 0.0375 55 | LR_POLICY: steps_with_relative_lrs 56 | LRS: [1, 0.1, 0.01, 0.001] 57 | STEPS: [0, 23, 39] 58 | MAX_EPOCH: 40 59 | MOMENTUM: 0.9 60 | WEIGHT_DECAY: 1e-4 61 | WARMUP_EPOCHS: 4.0 62 | WARMUP_START_LR: 0.0001 63 | OPTIMIZING_METHOD: sgd 64 | 65 | MODEL: 66 | NUM_CLASSES: 157 67 | ARCH: slowfast 68 | MODEL_NAME: SlowFast 69 | LOSS_FUNC: bce_logit 70 | HEAD_ACT: sigmoid 71 | DROPOUT_RATE: 0.5 72 | FINAL_POOL: ["avg", "max"] 73 | 74 | TEST: 75 | ENABLE: True 76 | DATASET: charades 77 | BATCH_SIZE: 16 78 | NUM_ENSEMBLE_VIEWS: 10 79 | NUM_SPATIAL_CROPS: 3 80 | 81 | DATA_LOADER: 82 | NUM_WORKERS: 8 83 | 
PIN_MEMORY: True 84 | 85 | LOGS: 86 | DIR: logs/Charades/SLOWFAST_16x8_R101_K400/ 87 | 88 | NUM_GPUS: 8 89 | NUM_SHARDS: 1 90 | RNG_SEED: 0 91 | -------------------------------------------------------------------------------- /configs/Charades/SLOWFAST_16x8_R101_K600.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: charades 4 | BATCH_SIZE: 16 5 | EVAL_PERIOD: 1 6 | FULL_TIME_EVAL: True 7 | CHECKPOINT_PERIOD: 1 8 | AUTO_RESUME: True 9 | CHECKPOINT_TYPE: aia 10 | CHECKPOINT_FILE_PATH: /home/pg/data/models/k600/SLOWFAST_8x8_R101.pth 11 | 12 | DATA: 13 | NUM_FRAMES: 64 14 | SAMPLING_RATE: 2 15 | TRAIN_JITTER_SCALES: [256, 320] 16 | TRAIN_CROP_SIZE: 224 17 | TEST_CROP_SIZE: 256 18 | INPUT_CHANNEL_NUM: [3, 3] 19 | MULTI_LABEL: True 20 | INV_UNIFORM_SAMPLE: True 21 | ENSEMBLE_METHOD: max 22 | REVERSE_INPUT_CHANNEL: True 23 | PATH_TO_DATA_DIR: /home/pg/data/Charades/ 24 | PATH_PREFIX: /home/pg/data/Charades/Charades_v1_rgb 25 | 26 | SLOWFAST: 27 | ALPHA: 4 28 | BETA_INV: 8 29 | FUSION_CONV_CHANNEL_RATIO: 2 30 | FUSION_KERNEL_SZ: 5 31 | FUSION_BN: False 32 | FUSION_RELU: False 33 | 34 | RESNET: 35 | ZERO_INIT_FINAL_BN: True 36 | WIDTH_PER_GROUP: 64 37 | NUM_GROUPS: 1 38 | DEPTH: 101 39 | TRANS_FUNC: bottleneck_transform 40 | STRIDE_1X1: False 41 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [23, 23], [3, 3]] 42 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 43 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 44 | STEM_POOL_PAD: False 45 | 46 | NONLOCAL: 47 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 48 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 49 | INSTANTIATION: dot_product 50 | 51 | BN: 52 | USE_PRECISE_STATS: False 53 | WEIGHT_DECAY: 0.0 54 | NORM_TYPE: frozen_batchnorm 55 | 56 | SOLVER: 57 | BASE_LR: 0.0375 58 | LR_POLICY: steps_with_relative_lrs 59 | LRS: [1, 0.1, 0.01, 0.001] 60 | STEPS: [0, 23, 39] 61 | MAX_EPOCH: 40 62 | MOMENTUM: 0.9 63 | WEIGHT_DECAY: 1e-4 64 | WARMUP_EPOCHS: 4.0 65 | WARMUP_START_LR: 0.0001 66 | OPTIMIZING_METHOD: sgd 67 | 68 | MODEL: 69 | NUM_CLASSES: 157 70 | ARCH: slowfast 71 | MODEL_NAME: SlowFast 72 | LOSS_FUNC: bce_logit 73 | HEAD_ACT: sigmoid 74 | DROPOUT_RATE: 0.5 75 | FINAL_POOL: ["avg", "max"] 76 | 77 | TEST: 78 | ENABLE: True 79 | DATASET: charades 80 | BATCH_SIZE: 16 81 | NUM_ENSEMBLE_VIEWS: 10 82 | NUM_SPATIAL_CROPS: 3 83 | 84 | DATA_LOADER: 85 | NUM_WORKERS: 8 86 | PIN_MEMORY: True 87 | 88 | LOGS: 89 | DIR: logs/Charades/SLOWFAST_16x8_R101_K600/ 90 | 91 | NUM_GPUS: 8 92 | NUM_SHARDS: 1 93 | RNG_SEED: 0 94 | -------------------------------------------------------------------------------- /slowfast/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | """Video models.""" 2 | 3 | # Number of blocks for different stages given the model depth. 4 | _MODEL_STAGE_DEPTH = { 5 | # ResNet 6 | 50: (3, 4, 6, 3), 7 | 101: (3, 4, 23, 3), 8 | } 9 | 10 | 11 | # Basis of temporal kernel sizes for each of the stage. 12 | _TEMPORAL_KERNEL_BASIS = { 13 | "c2d": [ 14 | [[1]], # conv1 temporal kernel. 15 | [[1]], # res2 temporal kernel. 16 | [[1]], # res3 temporal kernel. 17 | [[1]], # res4 temporal kernel. 18 | [[1]], # res5 temporal kernel. 19 | ], 20 | "c2d_nopool": [ 21 | [[1]], # conv1 temporal kernel. 22 | [[1]], # res2 temporal kernel. 23 | [[1]], # res3 temporal kernel. 24 | [[1]], # res4 temporal kernel. 25 | [[1]], # res5 temporal kernel. 26 | ], 27 | "i3d": [ 28 | [[5]], # conv1 temporal kernel. 
29 | [[3]], # res2 temporal kernel. 30 | [[3, 1]], # res3 temporal kernel. 31 | [[3, 1]], # res4 temporal kernel. 32 | [[1, 3]], # res5 temporal kernel. 33 | ], 34 | "i3d_nopool": [ 35 | [[5]], # conv1 temporal kernel. 36 | [[3]], # res2 temporal kernel. 37 | [[3, 1]], # res3 temporal kernel. 38 | [[3, 1]], # res4 temporal kernel. 39 | [[1, 3]], # res5 temporal kernel. 40 | ], 41 | "slow": [ 42 | [[1]], # conv1 temporal kernel. 43 | [[1]], # res2 temporal kernel. 44 | [[1]], # res3 temporal kernel. 45 | [[3]], # res4 temporal kernel. 46 | [[3]], # res5 temporal kernel. 47 | ], 48 | "slowfast": [ 49 | [[1], [5]], # conv1 temporal kernel for slow and fast pathway. 50 | [[1], [3]], # res2 temporal kernel for slow and fast pathway. 51 | [[1], [3]], # res3 temporal kernel for slow and fast pathway. 52 | [[3], [3]], # res4 temporal kernel for slow and fast pathway. 53 | [[3], [3]], # res5 temporal kernel for slow and fast pathway. 54 | ], 55 | "x3d": [ 56 | [[5]], # conv1 temporal kernels. 57 | [[3]], # res2 temporal kernels. 58 | [[3]], # res3 temporal kernels. 59 | [[3]], # res4 temporal kernels. 60 | [[3]], # res5 temporal kernels. 61 | ], 62 | } 63 | 64 | _POOL1 = { 65 | "c2d": [[2, 1, 1]], 66 | "c2d_nopool": [[1, 1, 1]], 67 | "i3d": [[2, 1, 1]], 68 | "i3d_nopool": [[1, 1, 1]], 69 | "slow": [[1, 1, 1]], 70 | "slowfast": [[1, 1, 1], [1, 1, 1]], 71 | "x3d": [[1, 1, 1]], 72 | } -------------------------------------------------------------------------------- /slowfast/utils/metrics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Functions for computing metrics.""" 5 | 6 | import torch 7 | 8 | 9 | def topks_correct(preds, labels, ks): 10 | """ 11 | Given the predictions, labels, and a list of top-k values, compute the 12 | number of correct predictions for each top-k value. 13 | 14 | Args: 15 | preds (array): array of predictions. Dimension is batch size 16 | N x ClassNum. 17 | labels (array): array of labels. Dimension is batch size N. 18 | ks (list): list of top-k values. For example, ks = [1, 5] corresponds 19 | to top-1 and top-5. 20 | 21 | Returns: 22 | topks_correct (list): list of numbers, where the `i`-th entry 23 | corresponds to the number of top-`ks[i]` correct predictions. 24 | """ 25 | assert preds.size(0) == labels.size( 26 | 0 27 | ), "Batch dim of predictions and labels must match" 28 | # Find the top max_k predictions for each sample 29 | _top_max_k_vals, top_max_k_inds = torch.topk( 30 | preds, max(ks), dim=1, largest=True, sorted=True 31 | ) 32 | # (batch_size, max_k) -> (max_k, batch_size). 33 | top_max_k_inds = top_max_k_inds.t() 34 | # (batch_size, ) -> (max_k, batch_size). 35 | rep_max_k_labels = labels.view(1, -1).expand_as(top_max_k_inds) 36 | # (i, j) = 1 if top i-th prediction for the j-th sample is correct. 37 | top_max_k_correct = top_max_k_inds.eq(rep_max_k_labels) 38 | # Compute the number of topk correct predictions for each k. 39 | topks_correct = [ 40 | top_max_k_correct[:k, :].view(-1).float().sum() for k in ks 41 | ] 42 | return topks_correct 43 | 44 | 45 | def topk_errors(preds, labels, ks): 46 | """ 47 | Computes the top-k error for each k. 48 | Args: 49 | preds (array): array of predictions. Dimension is N x ClassNum. 50 | labels (array): array of labels. Dimension is N. 51 | ks (list): list of ks to calculate the top-k errors.
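        Example (illustrative):
            top1_err, top5_err = topk_errors(preds, labels, [1, 5])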
52 | """ 53 | num_topks_correct = topks_correct(preds, labels, ks) 54 | return [(1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct] 55 | 56 | 57 | def topk_accuracies(preds, labels, ks): 58 | """ 59 | Computes the top-k accuracy for each k. 60 | Args: 61 | preds (array): array of predictions. Dimension is N. 62 | labels (array): array of labels. Dimension is N. 63 | ks (list): list of ks to calculate the top accuracies. 64 | """ 65 | num_topks_correct = topks_correct(preds, labels, ks) 66 | return [(x / preds.size(0)) * 100.0 for x in num_topks_correct] 67 | -------------------------------------------------------------------------------- /slowfast/models/operators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Custom operators.""" 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class Swish(nn.Module): 11 | """Swish activation function: x * sigmoid(x).""" 12 | 13 | def __init__(self): 14 | super(Swish, self).__init__() 15 | 16 | def forward(self, x): 17 | return SwishEfficient.apply(x) 18 | 19 | 20 | class SwishEfficient(torch.autograd.Function): 21 | """Swish activation function: x * sigmoid(x).""" 22 | 23 | @staticmethod 24 | def forward(ctx, x): 25 | result = x * torch.sigmoid(x) 26 | ctx.save_for_backward(x) 27 | return result 28 | 29 | @staticmethod 30 | def backward(ctx, grad_output): 31 | x = ctx.saved_variables[0] 32 | sigmoid_x = torch.sigmoid(x) 33 | return grad_output * (sigmoid_x * (1 + x * (1 - sigmoid_x))) 34 | 35 | 36 | class SE(nn.Module): 37 | """Squeeze-and-Excitation (SE) block w/ Swish: AvgPool, FC, Swish, FC, Sigmoid.""" 38 | 39 | def _round_width(self, width, multiplier, min_width=8, divisor=8): 40 | """ 41 | Round width of filters based on width multiplier 42 | Args: 43 | width (int): the channel dimensions of the input. 44 | multiplier (float): the multiplication factor. 45 | min_width (int): the minimum width after multiplication. 46 | divisor (int): the new width should be dividable by divisor. 47 | """ 48 | if not multiplier: 49 | return width 50 | 51 | width *= multiplier 52 | min_width = min_width or divisor 53 | width_out = max( 54 | min_width, int(width + divisor / 2) // divisor * divisor 55 | ) 56 | if width_out < 0.9 * width: 57 | width_out += divisor 58 | return int(width_out) 59 | 60 | def __init__(self, dim_in, ratio, relu_act=True): 61 | """ 62 | Args: 63 | dim_in (int): the channel dimensions of the input. 64 | ratio (float): the channel reduction ratio for squeeze. 65 | relu_act (bool): whether to use ReLU activation instead 66 | of Swish (default). 67 | divisor (int): the new width should be dividable by divisor. 68 | """ 69 | super(SE, self).__init__() 70 | self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) 71 | dim_fc = self._round_width(dim_in, ratio) 72 | self.fc1 = nn.Conv3d(dim_in, dim_fc, 1, bias=True) 73 | self.fc1_act = nn.ReLU() if relu_act else Swish() 74 | self.fc2 = nn.Conv3d(dim_fc, dim_in, 1, bias=True) 75 | 76 | self.fc2_sig = nn.Sigmoid() 77 | 78 | def forward(self, x): 79 | x_in = x 80 | for module in self.children(): 81 | x = module(x) 82 | return x_in * x 83 | -------------------------------------------------------------------------------- /slowfast/datasets/multigrid_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
3 | 4 | """Helper functions for multigrid training.""" 5 | 6 | import numpy as np 7 | from torch._six import int_classes as _int_classes 8 | from torch.utils.data.sampler import Sampler 9 | 10 | 11 | class ShortCycleBatchSampler(Sampler): 12 | """ 13 | Extend Sampler to support "short cycle" sampling. 14 | See paper "A Multigrid Method for Efficiently Training Video Models", 15 | Wu et al., 2019 (https://arxiv.org/abs/1912.00998) for details. 16 | """ 17 | 18 | def __init__(self, sampler, batch_size, drop_last, cfg): 19 | if not isinstance(sampler, Sampler): 20 | raise ValueError( 21 | "sampler should be an instance of " 22 | "torch.utils.data.Sampler, but got sampler={}".format(sampler) 23 | ) 24 | if ( 25 | not isinstance(batch_size, _int_classes) 26 | or isinstance(batch_size, bool) 27 | or batch_size <= 0 28 | ): 29 | raise ValueError( 30 | "batch_size should be a positive integer value, " 31 | "but got batch_size={}".format(batch_size) 32 | ) 33 | if not isinstance(drop_last, bool): 34 | raise ValueError( 35 | "drop_last should be a boolean value, but got " 36 | "drop_last={}".format(drop_last) 37 | ) 38 | self.sampler = sampler 39 | self.drop_last = drop_last 40 | 41 | bs_factor = [ 42 | int( 43 | round( 44 | ( 45 | float(cfg.DATA.TRAIN_CROP_SIZE) 46 | / (s * cfg.MULTIGRID.DEFAULT_S) 47 | ) 48 | ** 2 49 | ) 50 | ) 51 | for s in cfg.MULTIGRID.SHORT_CYCLE_FACTORS 52 | ] 53 | 54 | self.batch_sizes = [ 55 | batch_size * bs_factor[0], 56 | batch_size * bs_factor[1], 57 | batch_size, 58 | ] 59 | 60 | def __iter__(self): 61 | counter = 0 62 | batch_size = self.batch_sizes[0] 63 | batch = [] 64 | for idx in self.sampler: 65 | batch.append((idx, counter % 3)) 66 | if len(batch) == batch_size: 67 | yield batch 68 | counter += 1 69 | batch_size = self.batch_sizes[counter % 3] 70 | batch = [] 71 | if len(batch) > 0 and not self.drop_last: 72 | yield batch 73 | 74 | def __len__(self): 75 | avg_batch_size = sum(self.batch_sizes) / 3.0 76 | if self.drop_last: 77 | return int(np.floor(len(self.sampler) / avg_batch_size)) 78 | else: 79 | return int(np.ceil(len(self.sampler) / avg_batch_size)) 80 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/np_box_mask_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Numpy BoxMaskList classes and functions.""" 17 | 18 | from __future__ import ( 19 | absolute_import, 20 | division, 21 | print_function, 22 | unicode_literals, 23 | ) 24 | import numpy as np 25 | 26 | from . import np_box_list 27 | 28 | 29 | class BoxMaskList(np_box_list.BoxList): 30 | """Convenience wrapper for BoxList with masks. 31 | 32 | BoxMaskList extends the np_box_list.BoxList to contain masks as well. 
33 | In particular, its constructor receives both boxes and masks. Note that the 34 | masks correspond to the full image. 35 | """ 36 | 37 | def __init__(self, box_data, mask_data): 38 | """Constructs box collection. 39 | 40 | Args: 41 | box_data: a numpy array of shape [N, 4] representing box coordinates 42 | mask_data: a numpy array of shape [N, height, width] representing masks 43 | with values are in {0,1}. The masks correspond to the full 44 | image. The height and the width will be equal to image height and width. 45 | 46 | Raises: 47 | ValueError: if bbox data is not a numpy array 48 | ValueError: if invalid dimensions for bbox data 49 | ValueError: if mask data is not a numpy array 50 | ValueError: if invalid dimension for mask data 51 | """ 52 | super(BoxMaskList, self).__init__(box_data) 53 | if not isinstance(mask_data, np.ndarray): 54 | raise ValueError("Mask data must be a numpy array.") 55 | if len(mask_data.shape) != 3: 56 | raise ValueError("Invalid dimensions for mask data.") 57 | if mask_data.dtype != np.uint8: 58 | raise ValueError( 59 | "Invalid data type for mask data: uint8 is required." 60 | ) 61 | if mask_data.shape[0] != box_data.shape[0]: 62 | raise ValueError( 63 | "There should be the same number of boxes and masks." 64 | ) 65 | self.data["masks"] = mask_data 66 | 67 | def get_masks(self): 68 | """Convenience function for accessing masks. 69 | 70 | Returns: 71 | a numpy array of shape [N, height, width] representing masks 72 | """ 73 | return self.get_field("masks") 74 | -------------------------------------------------------------------------------- /slowfast/utils/logging.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Logging.""" 5 | 6 | import os 7 | import builtins 8 | import decimal 9 | import functools 10 | import logging 11 | import os 12 | import sys 13 | import simplejson 14 | import torch.distributed as dist 15 | from datetime import datetime 16 | from fvcore.common.file_io import PathManager 17 | 18 | import slowfast.utils.distributed as du 19 | 20 | 21 | def _suppress_print(): 22 | """ 23 | Suppresses printing from the current process. 
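Stepping back to the `BoxMaskList` class above, a minimal usage sketch with toy data (masks must be uint8, full-image sized, and match the number of boxes):

```python
import numpy as np

# One box in [y_min, x_min, y_max, x_max] order, plus a matching binary mask.
boxes = np.array([[0.1, 0.1, 0.5, 0.5]], dtype=np.float32)
masks = np.zeros((1, 32, 32), dtype=np.uint8)
masks[0, 3:16, 3:16] = 1

box_mask_list = BoxMaskList(box_data=boxes, mask_data=masks)
print(box_mask_list.num_boxes())        # 1
print(box_mask_list.get_masks().shape)  # (1, 32, 32)
```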
24 | """ 25 | 26 | def print_pass(*objects, sep=" ", end="\n", file=sys.stdout, flush=False): 27 | pass 28 | 29 | builtins.print = print_pass 30 | 31 | 32 | def setup_logger(cfg, name=None): 33 | logger = logging.getLogger('progress-action') 34 | logger.setLevel(logging.DEBUG) 35 | logger.propogate = False 36 | # don't log results for the non-master process 37 | if not du.is_master_proc(): 38 | _suppress_print() 39 | return logger 40 | formatter = logging.Formatter( 41 | "%(asctime)s [%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s") 42 | 43 | ch = logging.StreamHandler(stream=sys.stdout) 44 | ch.setLevel(logging.INFO) 45 | ch.setFormatter(formatter) 46 | logger.addHandler(ch) 47 | 48 | # multi-machine 49 | if cfg.NUM_GPUS != du.get_world_size(): 50 | assert du.is_master_proc() 51 | num_gpus_per_machine = cfg.NUM_GPUS 52 | worker = du.get_rank() // cfg.NUM_GPUS 53 | filename = os.path.join(cfg.LOGS.DIR, f"{name}-worker-{worker}.log") 54 | else: 55 | filename = os.path.join(cfg.LOGS.DIR, f"{name}.log") 56 | if name is None or os.path.exists(filename): 57 | filename = os.path.join( 58 | cfg.LOGS.DIR, '{} {}.log'.format(name, datetime.now())) 59 | fh = logging.FileHandler(filename) 60 | fh.setLevel(logging.DEBUG) 61 | fh.setFormatter(formatter) 62 | logger.addHandler(fh) 63 | 64 | return logger 65 | 66 | 67 | def get_logger(name): 68 | """ 69 | Retrieve the logger with the specified name or, if name is None, return a 70 | logger which is the root logger of the hierarchy. 71 | Args: 72 | name (string): name of the logger. 73 | """ 74 | return logging.getLogger('progress-action.' + name) 75 | 76 | 77 | def log_json_stats(stats): 78 | """ 79 | Logs json stats. 80 | Args: 81 | stats (dict): a dictionary of statistical information to log. 82 | """ 83 | stats = { 84 | k: decimal.Decimal("{:.6f}".format(v)) if isinstance(v, float) else v 85 | for k, v in stats.items() 86 | } 87 | # json_stats = simplejson.dumps(stats, sort_keys=True, use_decimal=True) 88 | logstr = "; ".join(["{}: {}".format(k, v) for k, v in stats.items()]) 89 | if du.is_master_proc(): 90 | logger = get_logger(__name__) 91 | logger.info(logstr) 92 | # logger.info("json_stats: {:s}".format(json_stats)) 93 | -------------------------------------------------------------------------------- /slowfast/utils/bn_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """bn helper.""" 5 | 6 | import itertools 7 | import torch 8 | 9 | 10 | @torch.no_grad() 11 | def compute_and_update_bn_stats(model, data_loader, num_batches=200): 12 | """ 13 | Compute and update the batch norm stats to make it more precise. During 14 | training both bn stats and the weight are changing after every iteration, 15 | so the bn can not precisely reflect the latest stats of the current model. 16 | Here the bn stats is recomputed without change of weights, to make the 17 | running mean and running var more precise. 18 | Args: 19 | model (model): the model using to compute and update the bn stats. 20 | data_loader (dataloader): dataloader using to provide inputs. 21 | num_batches (int): running iterations using to compute the stats. 22 | """ 23 | 24 | # Prepares all the bn layers. 
25 | bn_layers = [ 26 | m 27 | for m in model.modules() 28 | if any( 29 | ( 30 | isinstance(m, bn_type) 31 | for bn_type in ( 32 | torch.nn.BatchNorm1d, 33 | torch.nn.BatchNorm2d, 34 | torch.nn.BatchNorm3d, 35 | ) 36 | ) 37 | ) 38 | ] 39 | 40 | # In order to make the running stats only reflect the current batch, the 41 | # momentum is disabled. 42 | # bn.running_mean = (1 - momentum) * bn.running_mean + momentum * batch_mean 43 | # Setting the momentum to 1.0 to compute the stats without momentum. 44 | momentum_actual = [bn.momentum for bn in bn_layers] 45 | for bn in bn_layers: 46 | bn.momentum = 1.0 47 | 48 | # Calculates the running iterations for precise stats computation. 49 | running_mean = [torch.zeros_like(bn.running_mean) for bn in bn_layers] 50 | running_square_mean = [torch.zeros_like(bn.running_var) for bn in bn_layers] 51 | 52 | for ind, (inputs, _, _) in enumerate( 53 | itertools.islice(data_loader, num_batches) 54 | ): 55 | # Forwards the model to update the bn stats. 56 | if isinstance(inputs, (list,)): 57 | for i in range(len(inputs)): 58 | inputs[i] = inputs[i].float().cuda(non_blocking=True) 59 | else: 60 | inputs = inputs.cuda(non_blocking=True) 61 | model(inputs) 62 | 63 | for i, bn in enumerate(bn_layers): 64 | # Accumulates the bn stats. 65 | running_mean[i] += (bn.running_mean - running_mean[i]) / (ind + 1) 66 | # $E(x^2) = Var(x) + E(x)^2$. 67 | cur_square_mean = bn.running_var + bn.running_mean ** 2 68 | running_square_mean[i] += ( 69 | cur_square_mean - running_square_mean[i] 70 | ) / (ind + 1) 71 | 72 | for i, bn in enumerate(bn_layers): 73 | bn.running_mean = running_mean[i] 74 | # Var(x) = $E(x^2) - E(x)^2$. 75 | bn.running_var = running_square_mean[i] - bn.running_mean ** 2 76 | # Sets the precise bn stats. 77 | bn.momentum = momentum_actual[i] 78 | -------------------------------------------------------------------------------- /slowfast/utils/lr_policy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Learning rate policy.""" 5 | 6 | import math 7 | 8 | 9 | def get_lr_at_epoch(cfg, cur_epoch): 10 | """ 11 | Retrieve the learning rate of the current epoch with the option to perform 12 | warm up in the beginning of the training stage. 13 | Args: 14 | cfg (CfgNode): configs. Details can be found in 15 | slowfast/config/defaults.py 16 | cur_epoch (float): the number of epoch of the current training stage. 17 | """ 18 | lr = get_lr_func(cfg.SOLVER.LR_POLICY)(cfg, cur_epoch) 19 | # Perform warm up. 20 | if cur_epoch < cfg.SOLVER.WARMUP_EPOCHS: 21 | lr_start = cfg.SOLVER.WARMUP_START_LR 22 | lr_end = get_lr_func(cfg.SOLVER.LR_POLICY)( 23 | cfg, cfg.SOLVER.WARMUP_EPOCHS 24 | ) 25 | alpha = (lr_end - lr_start) / cfg.SOLVER.WARMUP_EPOCHS 26 | lr = cur_epoch * alpha + lr_start 27 | return lr 28 | 29 | 30 | def lr_func_cosine(cfg, cur_epoch): 31 | """ 32 | Retrieve the learning rate to specified values at specified epoch with the 33 | cosine learning rate schedule. Details can be found in: 34 | Ilya Loshchilov, and Frank Hutter 35 | SGDR: Stochastic Gradient Descent With Warm Restarts. 36 | Args: 37 | cfg (CfgNode): configs. Details can be found in 38 | slowfast/config/defaults.py 39 | cur_epoch (float): the number of epoch of the current training stage. 
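Looking back at `compute_and_update_bn_stats` above: it keeps streaming averages of E[x] and E[x^2] over batches and recovers the variance via Var(x) = E[x^2] - E[x]^2. A small standalone sketch of that identity:

```python
import torch

# Streaming averages over batches, mirroring the update rule above.
batches = [torch.randn(1024) * 2.0 + 3.0 for _ in range(20)]
mean = torch.zeros(())
sq_mean = torch.zeros(())
for ind, x in enumerate(batches):
    mean += (x.mean() - mean) / (ind + 1)
    sq_mean += ((x ** 2).mean() - sq_mean) / (ind + 1)

var = sq_mean - mean ** 2
print(round(mean.item(), 2), round(var.item(), 2))  # approximately 3.0 and 4.0
```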
40 | """ 41 | return ( 42 | cfg.SOLVER.BASE_LR 43 | * (math.cos(math.pi * cur_epoch / cfg.SOLVER.MAX_EPOCH) + 1.0) 44 | * 0.5 45 | ) 46 | 47 | 48 | def lr_func_steps_with_relative_lrs(cfg, cur_epoch): 49 | """ 50 | Retrieve the learning rate to specified values at specified epoch with the 51 | steps with relative learning rate schedule. 52 | Args: 53 | cfg (CfgNode): configs. Details can be found in 54 | slowfast/config/defaults.py 55 | cur_epoch (float): the number of epoch of the current training stage. 56 | """ 57 | ind = get_step_index(cfg, cur_epoch) 58 | return cfg.SOLVER.LRS[ind] * cfg.SOLVER.BASE_LR 59 | 60 | 61 | def get_step_index(cfg, cur_epoch): 62 | """ 63 | Retrieves the lr step index for the given epoch. 64 | Args: 65 | cfg (CfgNode): configs. Details can be found in 66 | slowfast/config/defaults.py 67 | cur_epoch (float): the number of epoch of the current training stage. 68 | """ 69 | steps = cfg.SOLVER.STEPS + [cfg.SOLVER.MAX_EPOCH] 70 | for ind, step in enumerate(steps): # NoQA 71 | if cur_epoch < step: 72 | break 73 | return ind - 1 74 | 75 | 76 | def get_lr_func(lr_policy): 77 | """ 78 | Given the configs, retrieve the specified lr policy function. 79 | Args: 80 | lr_policy (string): the learning rate policy to use for the job. 81 | """ 82 | policy = "lr_func_" + lr_policy 83 | if policy not in globals(): 84 | raise NotImplementedError("Unknown LR policy: {}".format(lr_policy)) 85 | else: 86 | return globals()[policy] 87 | -------------------------------------------------------------------------------- /slowfast/utils/aia_model_loading.py: -------------------------------------------------------------------------------- 1 | """AIA to PyTorch checkpoint name converting utility.""" 2 | 3 | import re 4 | 5 | 6 | def get_name_convert_func(): 7 | """ 8 | Get the function to convert AIA layer names to SlowFast layer names. 9 | Returns: 10 | (func): function to convert parameter name from AIA format to PyTorch 11 | format. 12 | """ 13 | 14 | pairs = [ 15 | # fuse fast to slow 16 | # ----------------------------------------------------- 17 | # fast.Tconv1.conv.weight -> s1_fuse.conv_f2s.weight 18 | [r"^fast.Tconv([1-4]).conv.(.*)", r"s\1_fuse.conv_f2s.\2"], 19 | 20 | # pathway 21 | # ----------------------------------------------------- 22 | # slow -> pathway0, fast -> pathway1 23 | [r"^slow(.*)", r"pathway0_\1"], 24 | [r"^fast(.*)", r"pathway1_\1"], 25 | 26 | # stem 27 | # ---------------------------------------------------- 28 | # slow.conv1.weight -> s1.pathway0_stem.conv.weight 29 | [r"(.*).conv1.weight", r"s0.\1stem.conv.weight"], 30 | # slow.bn1.weight -> s1.pathway0_stem.bn.weight 31 | [r"(.*).bn1(.*)", r"s0.\1stem.bn\2"], 32 | 33 | # res stage 34 | # ----------------------------------------------------- 35 | # conv1 -> a 36 | [r"(.*).conv1.(.*)", r"\1.a.\2",], 37 | # conv2 -> b 38 | [r"(.*).conv2.(.*)", r"\1.b.\2",], 39 | # conv3 -> c 40 | [r"(.*).conv3.(.*)", r"\1.c.\2",], 41 | # btnk -> branch2 42 | [r"(.*).btnk.(.*)", r"\1.branch2.\2",], 43 | # shortcut -> branch1 44 | [r"(.*).shortcut.(.*)", r"\1.branch1.\2",], 45 | # conv.weight -> weight 46 | [r"(.*)([abc123]).conv.weight\Z", r"\1\2.weight"], 47 | # .bn. -> _bn. 
48 | [r"(.*)([abc123]).bn\.(.*)", r"\1\2_bn.\3"], 49 | 50 | # res_nl1 -> s1 51 | [r"(.*).res_nl([1-4])(.*)", r"s\2.\1\3"], 52 | # .res_0 -> _res0 53 | [r"(.*).res_([0-9]+)(.*)", r"\1res\2\3"], 54 | 55 | # stage number 56 | [r"^s4\.(.*)", r"s5.\1"], 57 | [r"^s3\.(.*)", r"s4.\1"], 58 | [r"^s2\.(.*)", r"s3.\1"], 59 | [r"^s1\.(.*)", r"s2.\1"], 60 | [r"^s0\.(.*)", r"s1.\1"], 61 | 62 | # head 63 | # ----------------------------------------------------- 64 | # cls_head.pred.weight -> head.projection.weight 65 | [r"cls_head.pred", r"head.projection"], 66 | ] 67 | 68 | def convert_aia_name_to_pytorch(aia_layer_name): 69 | """ 70 | Convert the aia_layer_name to slowfast format by apply the list of 71 | regular expressions. 72 | Args: 73 | aia_layer_name (str): aia layer name. 74 | Returns: 75 | (str): pytorch layer name. 76 | """ 77 | if aia_layer_name.startswith("module"): 78 | aia_layer_name = aia_layer_name.split("module.")[1] 79 | if aia_layer_name.startswith("backbone"): 80 | aia_layer_name = aia_layer_name.split("backbone.")[1] 81 | for source, dest in pairs: 82 | aia_layer_name = re.sub(source, dest, aia_layer_name) 83 | return aia_layer_name 84 | 85 | return convert_aia_name_to_pytorch 86 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import glob 4 | import os 5 | 6 | import numpy as np 7 | import torch 8 | from setuptools import setup, Extension, find_packages 9 | from torch.utils.cpp_extension import CUDA_HOME 10 | from torch.utils.cpp_extension import CppExtension 11 | from torch.utils.cpp_extension import CUDAExtension 12 | from Cython.Build import cythonize 13 | import platform 14 | 15 | requirements = ["torch", "torchvision"] 16 | 17 | 18 | def make_cython_ext(name, module, sources): 19 | extra_compile_args = None 20 | if platform.system() != 'Windows': 21 | extra_compile_args = { 22 | 'cxx': ['-Wno-unused-function', '-Wno-write-strings'] 23 | } 24 | 25 | extension = Extension( 26 | '{}.{}'.format(module, name), 27 | [os.path.join(*module.split('.'), p) for p in sources], 28 | include_dirs=[np.get_include()], 29 | language='c++', 30 | extra_compile_args=extra_compile_args) 31 | extension, = cythonize(extension) 32 | return extension 33 | 34 | 35 | def make_cuda_ext(name, module, sources): 36 | return CUDAExtension( 37 | name='{}.{}'.format(module, name), 38 | sources=[os.path.join(*module.split('.'), p) for p in sources], 39 | extra_compile_args={ 40 | 'cxx': [], 41 | 'nvcc': [ 42 | '-D__CUDA_NO_HALF_OPERATORS__', 43 | '-D__CUDA_NO_HALF_CONVERSIONS__', 44 | '-D__CUDA_NO_HALF2_OPERATORS__', 45 | ] 46 | }) 47 | 48 | 49 | def get_extensions(): 50 | this_dir = os.path.dirname(os.path.abspath(__file__)) 51 | extensions_dir = os.path.join(this_dir, "slowfast", "csrc") 52 | 53 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 54 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 55 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 56 | 57 | sources = main_file + source_cpu 58 | extension = CppExtension 59 | 60 | extra_compile_args = {"cxx": []} 61 | define_macros = [] 62 | 63 | if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1": 64 | extension = CUDAExtension 65 | sources += source_cuda 66 | define_macros += [("WITH_CUDA", None)] 67 | extra_compile_args["nvcc"] = [ 68 | "-DCUDA_HAS_FP16=1", 69 | "-D__CUDA_NO_HALF_OPERATORS__", 70 | 
"-D__CUDA_NO_HALF_CONVERSIONS__", 71 | "-D__CUDA_NO_HALF2_OPERATORS__", 72 | "--expt-relaxed-constexpr", 73 | ] 74 | 75 | sources = [os.path.join(extensions_dir, s) for s in sources] 76 | 77 | include_dirs = [extensions_dir] 78 | 79 | ext_modules = [ 80 | extension( 81 | "slowfast._C", 82 | sources, 83 | include_dirs=include_dirs, 84 | define_macros=define_macros, 85 | extra_compile_args=extra_compile_args, 86 | ), 87 | ] 88 | 89 | return ext_modules 90 | 91 | 92 | setup( 93 | name="slowfast", 94 | ext_modules=get_extensions(), 95 | packages=find_packages(".", exclude=[ 96 | "configs", "scripts", "logs", "tools", "data", 97 | ]), 98 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 99 | ) 100 | -------------------------------------------------------------------------------- /slowfast/models/optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Optimizer.""" 5 | 6 | import torch 7 | 8 | import slowfast.utils.lr_policy as lr_policy 9 | 10 | 11 | def construct_optimizer(model, cfg): 12 | """ 13 | Construct a stochastic gradient descent or ADAM optimizer with momentum. 14 | Details can be found in: 15 | Herbert Robbins, and Sutton Monro. "A stochastic approximation method." 16 | and 17 | Diederik P.Kingma, and Jimmy Ba. 18 | "Adam: A Method for Stochastic Optimization." 19 | 20 | Args: 21 | model (model): model to perform stochastic gradient descent 22 | optimization or ADAM optimization. 23 | cfg (config): configs of hyper-parameters of SGD or ADAM, includes base 24 | learning rate, momentum, weight_decay, dampening, and etc. 25 | """ 26 | # Batchnorm parameters. 27 | bn_params = [] 28 | # Non-batchnorm parameters. 29 | non_bn_parameters = [] 30 | for name, p in model.named_parameters(): 31 | if "bn" in name: 32 | bn_params.append(p) 33 | else: 34 | non_bn_parameters.append(p) 35 | # Apply different weight decay to Batchnorm and non-batchnorm parameters. 36 | # In Caffe2 classification codebase the weight decay for batchnorm is 0.0. 37 | # Having a different weight decay on batchnorm might cause a performance 38 | # drop. 39 | optim_params = [ 40 | {"params": bn_params, "weight_decay": cfg.BN.WEIGHT_DECAY}, 41 | {"params": non_bn_parameters, "weight_decay": cfg.SOLVER.WEIGHT_DECAY}, 42 | ] 43 | # Check all parameters will be passed into optimizer. 44 | assert len(list(model.parameters())) == len(non_bn_parameters) + len( 45 | bn_params 46 | ), "parameter size does not match: {} + {} != {}".format( 47 | len(non_bn_parameters), len(bn_params), len(list(model.parameters())) 48 | ) 49 | 50 | if cfg.SOLVER.OPTIMIZING_METHOD == "sgd": 51 | return torch.optim.SGD( 52 | optim_params, 53 | lr=cfg.SOLVER.BASE_LR, 54 | momentum=cfg.SOLVER.MOMENTUM, 55 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 56 | dampening=cfg.SOLVER.DAMPENING, 57 | nesterov=cfg.SOLVER.NESTEROV, 58 | ) 59 | elif cfg.SOLVER.OPTIMIZING_METHOD == "adam": 60 | return torch.optim.Adam( 61 | optim_params, 62 | lr=cfg.SOLVER.BASE_LR, 63 | betas=(0.9, 0.999), 64 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 65 | ) 66 | else: 67 | raise NotImplementedError( 68 | "Does not support {} optimizer".format(cfg.SOLVER.OPTIMIZING_METHOD) 69 | ) 70 | 71 | 72 | def get_epoch_lr(cur_epoch, cfg): 73 | """ 74 | Retrieves the lr for the given epoch (as specified by the lr policy). 
75 | Args: 76 | cfg (config): configs of hyper-parameters of ADAM, includes base 77 | learning rate, betas, and weight decays. 78 | cur_epoch (float): the number of epoch of the current training stage. 79 | """ 80 | return lr_policy.get_lr_at_epoch(cfg, cur_epoch) 81 | 82 | 83 | def set_lr(optimizer, new_lr): 84 | """ 85 | Sets the optimizer lr to the specified value. 86 | Args: 87 | optimizer (optim): the optimizer using to optimize the current network. 88 | new_lr (float): the new learning rate to set. 89 | """ 90 | for param_group in optimizer.param_groups: 91 | param_group["lr"] = new_lr 92 | -------------------------------------------------------------------------------- /slowfast/utils/benchmark.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Functions for benchmarks. 4 | """ 5 | 6 | import numpy as np 7 | import pprint 8 | import torch 9 | import tqdm 10 | from fvcore.common.timer import Timer 11 | 12 | import slowfast.utils.logging as logging 13 | import slowfast.utils.misc as misc 14 | from slowfast.datasets import loader 15 | from slowfast.utils.env import setup_environment 16 | 17 | logger = logging.get_logger(__name__) 18 | 19 | 20 | def benchmark_data_loading(cfg): 21 | """ 22 | Benchmark the speed of data loading in PySlowFast. 23 | Args: 24 | 25 | cfg (CfgNode): configs. Details can be found in 26 | slowfast/config/defaults.py 27 | """ 28 | # Set up environment. 29 | setup_environment() 30 | # Set random seed from configs. 31 | np.random.seed(cfg.RNG_SEED) 32 | torch.manual_seed(cfg.RNG_SEED) 33 | 34 | # Setup logging format. 35 | logging.setup_logger(cfg.LOGS.DIR) 36 | 37 | # Print config. 38 | logger.info("Benchmark data loading with config:") 39 | logger.info(pprint.pformat(cfg)) 40 | 41 | timer = Timer() 42 | dataloader = loader.construct_loader(cfg, "train") 43 | logger.info( 44 | "Initialize loader using {:.2f} seconds.".format(timer.seconds()) 45 | ) 46 | # Total batch size across different machines. 47 | batch_size = cfg.TRAIN.BATCH_SIZE * cfg.NUM_SHARDS 48 | log_period = cfg.BENCHMARK.LOG_PERIOD 49 | epoch_times = [] 50 | # Test for a few epochs. 51 | for cur_epoch in range(cfg.BENCHMARK.NUM_EPOCHS): 52 | timer = Timer() 53 | timer_epoch = Timer() 54 | iter_times = [] 55 | if cfg.BENCHMARK.SHUFFLE: 56 | loader.shuffle_dataset(dataloader, cur_epoch) 57 | for cur_iter, _ in enumerate(tqdm.tqdm(dataloader)): 58 | if cur_iter > 0 and cur_iter % log_period == 0: 59 | iter_times.append(timer.seconds()) 60 | ram_usage, ram_total = misc.cpu_mem_usage() 61 | logger.info( 62 | "Epoch {}: {} iters ({} videos) in {:.2f} seconds. " 63 | "RAM Usage: {:.2f}/{:.2f} GB.".format( 64 | cur_epoch, 65 | log_period, 66 | log_period * batch_size, 67 | iter_times[-1], 68 | ram_usage, 69 | ram_total, 70 | ) 71 | ) 72 | timer.reset() 73 | epoch_times.append(timer_epoch.seconds()) 74 | ram_usage, ram_total = misc.cpu_mem_usage() 75 | logger.info( 76 | "Epoch {}: in total {} iters ({} videos) in {:.2f} seconds. 
" 77 | "RAM Usage: {:.2f}/{:.2f} GB.".format( 78 | cur_epoch, 79 | len(dataloader), 80 | len(dataloader) * batch_size, 81 | epoch_times[-1], 82 | ram_usage, 83 | ram_total, 84 | ) 85 | ) 86 | logger.info( 87 | "Epoch {}: on average every {} iters ({} videos) take {:.2f}/{:.2f} " 88 | "(avg/std) seconds.".format( 89 | cur_epoch, 90 | log_period, 91 | log_period * batch_size, 92 | np.mean(iter_times), 93 | np.std(iter_times), 94 | ) 95 | ) 96 | logger.info( 97 | "On average every epoch ({} videos) takes {:.2f}/{:.2f} " 98 | "(avg/std) seconds.".format( 99 | len(dataloader) * batch_size, 100 | np.mean(epoch_times), 101 | np.std(epoch_times), 102 | ) 103 | ) 104 | -------------------------------------------------------------------------------- /slowfast/utils/parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Argument parser functions.""" 5 | 6 | import argparse 7 | import sys 8 | 9 | import slowfast.utils.checkpoint as cu 10 | from slowfast.config.defaults import get_cfg 11 | 12 | 13 | def parse_args(): 14 | """ 15 | Parse the following arguments for a default parser for PySlowFast users. 16 | Args: 17 | shard_id (int): shard id for the current machine. Starts from 0 to 18 | num_shards - 1. If single machine is used, then set shard id to 0. 19 | num_shards (int): number of shards using by the job. 20 | init_method (str): initialization method to launch the job with multiple 21 | devices. Options includes TCP or shared file-system for 22 | initialization. details can be find in 23 | https://pytorch.org/docs/stable/distributed.html#tcp-initialization 24 | cfg (str): path to the config file. 25 | opts (argument): provide addtional options from the command line, it 26 | overwrites the config loaded from file. 27 | """ 28 | parser = argparse.ArgumentParser( 29 | description="Provide SlowFast video training and testing pipeline." 30 | ) 31 | parser.add_argument( 32 | "--rank", 33 | help="The shard id of current node, Starts from 0 to num_shards - 1", 34 | default=0, 35 | type=int, 36 | ) 37 | parser.add_argument( 38 | "--world_size", 39 | help="Number of shards using by the job", 40 | default=1, 41 | type=int, 42 | ) 43 | parser.add_argument( 44 | "--init_method", 45 | help="Initialization method, includes TCP or shared file-system.", 46 | default="auto", 47 | type=str, 48 | ) 49 | parser.add_argument( 50 | "--cfg", 51 | dest="cfg_file", 52 | help="Path to the config file", 53 | default="configs/Kinetics/SLOWFAST_4x16_R50.yaml", 54 | type=str, 55 | ) 56 | if len(sys.argv) == 1: 57 | parser.print_help() 58 | return parser.parse_known_args() 59 | 60 | 61 | def load_config(args, opts=None): 62 | """ 63 | Given the arguemnts, load and initialize the configs. 64 | Args: 65 | args (argument): arguments includes `shard_id`, `num_shards`, 66 | `init_method`, `cfg_file`, and `opts`. 67 | """ 68 | # Setup cfg. 69 | cfg = get_cfg() 70 | # Load config from cfg. 71 | if args.cfg_file is not None: 72 | cfg.merge_from_file(args.cfg_file) 73 | # Load config from command line, overwrite config from opts. 
74 |     if opts is not None:
75 |         # remove unknown args
76 |         remove_lists = []
77 |         for i in range(len(opts)):
78 |             if opts[i].startswith("--"):
79 |                 remove_lists.append(opts[i])
80 |                 if "=" not in opts[i]:
81 |                     remove_lists.append(opts[i+1])
82 |             elif opts[i].startswith("-"):
83 |                 remove_lists.append(opts[i])
84 |         for opt in remove_lists:
85 |             print("Removing unknown arg {}".format(opt))
86 |             opts.remove(opt)
87 |         cfg.merge_from_list(opts)
88 | 
89 |     # Inherit parameters from args.
90 |     if hasattr(args, "world_size") and hasattr(args, "rank"):
91 |         cfg.NUM_SHARDS = args.world_size
92 |         cfg.SHARD_ID = args.rank
93 |     if hasattr(args, "rng_seed"):
94 |         cfg.RNG_SEED = args.rng_seed
95 |     if hasattr(args, "output_dir"):
96 |         cfg.LOGS.DIR = args.output_dir
97 | 
98 |     # Create the checkpoint dir.
99 |     cu.make_checkpoint_dir(cfg.LOGS.DIR)
100 |     cfg.freeze()
101 |     return cfg
102 | 
--------------------------------------------------------------------------------
/slowfast/utils/setup_moxing_env.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 | import os
3 | import filelock
4 | import logging
5 | import tempfile
6 | import six
7 | 
8 | import torch
9 | filelock.logger().setLevel(logging.WARNING)
10 | 
11 | 
12 | def safe_s3_cache(org_path, targ_path, copy_type):
13 |     import moxing as mox
14 |     mox.file.shift("os", "mox")
15 | 
16 |     safe_flag = targ_path + ".safe"
17 |     if os.path.exists(safe_flag):
18 |         return
19 |     lock = filelock.FileLock(targ_path + ".lock")
20 |     with lock:
21 |         if not os.path.exists(safe_flag) and os.path.exists(org_path):
22 |             if copy_type == "file":
23 |                 mox.file.copy(org_path, targ_path)
24 |             else:
25 |                 mox.file.copy_parallel(org_path, targ_path, is_processing=False)
26 |             open(safe_flag, "a").close()
27 | 
28 | 
29 | def wrap_input_path(module, func_name, tmp_dir="/cache/", copy_method="file"):
30 |     origin_func = getattr(module, func_name)
31 | 
32 |     def wrapped_func(input_path, *args, **kwargs):
33 |         if input_path.startswith("obs://"):
34 |             import moxing as mox
35 |             mox.file.shift("os", "mox")
36 | 
37 |             relative_path = os.path.join("obs/", input_path[6:])
38 |             local_path = os.path.join(tmp_dir, relative_path)
39 |             local_dir, _ = os.path.split(local_path)
40 |             os.makedirs(local_dir, exist_ok=True)
41 |             if copy_method == "file":
42 |                 safe_s3_cache(input_path, local_path, copy_method)
43 |             else:
44 |                 safe_s3_cache(os.path.split(input_path)[0], local_dir, copy_method)
45 |             return origin_func(local_path, *args, **kwargs)
46 |         else:
47 |             return origin_func(input_path, *args, **kwargs)
48 | 
49 |     setattr(module, func_name, wrapped_func)
50 | 
51 | 
52 | def wrap_output_path(module, func_name, tmp_dir="/cache/"):
53 |     origin_func = getattr(module, func_name)
54 | 
55 |     def wrapped_func(data, output_path, *args, **kwargs):
56 |         if isinstance(output_path, six.string_types) and output_path.startswith("obs://"):
57 |             import moxing as mox
58 |             mox.file.shift("os", "mox")
59 | 
60 |             with tempfile.NamedTemporaryFile(dir=tmp_dir) as f:
61 |                 temp_path = f.name
62 |                 origin_ret = origin_func(data, temp_path, *args, **kwargs)
63 |                 mox.file.copy(temp_path, output_path)
64 |         else:
65 |             origin_ret = origin_func(data, output_path, *args, **kwargs)
66 |         return origin_ret
67 | 
68 |     setattr(module, func_name, wrapped_func)
69 | 
70 | 
71 | def wrap_input_path2(input_path, tmp_dir="/cache/", copy_method="file"):
72 |     if input_path.startswith("obs://"):
73 |         import moxing as mox
74 |         mox.file.shift("os", "mox")
75 | 
76 |         relative_path =
os.path.join("obs/", input_path[6:]) 77 | local_path = os.path.join(tmp_dir, relative_path) 78 | local_dir, _ = os.path.split(local_path) 79 | os.makedirs(local_dir, exist_ok=True) 80 | if copy_method == "file": 81 | safe_s3_cache(input_path, local_path, copy_method) 82 | else: 83 | safe_s3_cache(os.path.split(input_path)[0], local_dir, copy_method) 84 | return local_path 85 | else: 86 | return input_path 87 | 88 | 89 | def wrap_output_path2(origin_func, data, output_path, *args, **kwargs): 90 | if isinstance(output_path, six.string_types) and output_path.startswith("obs://"): 91 | import moxing as mox 92 | mox.file.shift("os", "mox") 93 | 94 | with tempfile.NamedTemporaryFile(dir="/cache/") as f: 95 | temp_path = f.name 96 | origin_ret = origin_func(data, temp_path, *args, **kwargs) 97 | mox.file.copy(temp_path, output_path) 98 | else: 99 | origin_ret = origin_func(data, output_path, *args, **kwargs) 100 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/ava_action_list_v2.1_for_activitynet_2018.pbtxt.txt: -------------------------------------------------------------------------------- 1 | item { 2 | name: "bend/bow (at the waist)" 3 | id: 1 4 | } 5 | item { 6 | name: "crouch/kneel" 7 | id: 3 8 | } 9 | item { 10 | name: "dance" 11 | id: 4 12 | } 13 | item { 14 | name: "fall down" 15 | id: 5 16 | } 17 | item { 18 | name: "get up" 19 | id: 6 20 | } 21 | item { 22 | name: "jump/leap" 23 | id: 7 24 | } 25 | item { 26 | name: "lie/sleep" 27 | id: 8 28 | } 29 | item { 30 | name: "martial art" 31 | id: 9 32 | } 33 | item { 34 | name: "run/jog" 35 | id: 10 36 | } 37 | item { 38 | name: "sit" 39 | id: 11 40 | } 41 | item { 42 | name: "stand" 43 | id: 12 44 | } 45 | item { 46 | name: "swim" 47 | id: 13 48 | } 49 | item { 50 | name: "walk" 51 | id: 14 52 | } 53 | item { 54 | name: "answer phone" 55 | id: 15 56 | } 57 | item { 58 | name: "carry/hold (an object)" 59 | id: 17 60 | } 61 | item { 62 | name: "climb (e.g., a mountain)" 63 | id: 20 64 | } 65 | item { 66 | name: "close (e.g., a door, a box)" 67 | id: 22 68 | } 69 | item { 70 | name: "cut" 71 | id: 24 72 | } 73 | item { 74 | name: "dress/put on clothing" 75 | id: 26 76 | } 77 | item { 78 | name: "drink" 79 | id: 27 80 | } 81 | item { 82 | name: "drive (e.g., a car, a truck)" 83 | id: 28 84 | } 85 | item { 86 | name: "eat" 87 | id: 29 88 | } 89 | item { 90 | name: "enter" 91 | id: 30 92 | } 93 | item { 94 | name: "hit (an object)" 95 | id: 34 96 | } 97 | item { 98 | name: "lift/pick up" 99 | id: 36 100 | } 101 | item { 102 | name: "listen (e.g., to music)" 103 | id: 37 104 | } 105 | item { 106 | name: "open (e.g., a window, a car door)" 107 | id: 38 108 | } 109 | item { 110 | name: "play musical instrument" 111 | id: 41 112 | } 113 | item { 114 | name: "point to (an object)" 115 | id: 43 116 | } 117 | item { 118 | name: "pull (an object)" 119 | id: 45 120 | } 121 | item { 122 | name: "push (an object)" 123 | id: 46 124 | } 125 | item { 126 | name: "put down" 127 | id: 47 128 | } 129 | item { 130 | name: "read" 131 | id: 48 132 | } 133 | item { 134 | name: "ride (e.g., a bike, a car, a horse)" 135 | id: 49 136 | } 137 | item { 138 | name: "sail boat" 139 | id: 51 140 | } 141 | item { 142 | name: "shoot" 143 | id: 52 144 | } 145 | item { 146 | name: "smoke" 147 | id: 54 148 | } 149 | item { 150 | name: "take a photo" 151 | id: 56 152 | } 153 | item { 154 | name: "text on/look at a cellphone" 155 | id: 57 156 | } 157 | item { 158 | name: "throw" 159 | id: 58 160 | } 161 | item { 162 | 
name: "touch (an object)" 163 | id: 59 164 | } 165 | item { 166 | name: "turn (e.g., a screwdriver)" 167 | id: 60 168 | } 169 | item { 170 | name: "watch (e.g., TV)" 171 | id: 61 172 | } 173 | item { 174 | name: "work on a computer" 175 | id: 62 176 | } 177 | item { 178 | name: "write" 179 | id: 63 180 | } 181 | item { 182 | name: "fight/hit (a person)" 183 | id: 64 184 | } 185 | item { 186 | name: "give/serve (an object) to (a person)" 187 | id: 65 188 | } 189 | item { 190 | name: "grab (a person)" 191 | id: 66 192 | } 193 | item { 194 | name: "hand clap" 195 | id: 67 196 | } 197 | item { 198 | name: "hand shake" 199 | id: 68 200 | } 201 | item { 202 | name: "hand wave" 203 | id: 69 204 | } 205 | item { 206 | name: "hug (a person)" 207 | id: 70 208 | } 209 | item { 210 | name: "kiss (a person)" 211 | id: 72 212 | } 213 | item { 214 | name: "lift (a person)" 215 | id: 73 216 | } 217 | item { 218 | name: "listen to (a person)" 219 | id: 74 220 | } 221 | item { 222 | name: "push (another person)" 223 | id: 76 224 | } 225 | item { 226 | name: "sing to (e.g., self, a person, a group)" 227 | id: 77 228 | } 229 | item { 230 | name: "take (an object) from (a person)" 231 | id: 78 232 | } 233 | item { 234 | name: "talk to (e.g., self, a person, a group)" 235 | id: 79 236 | } 237 | item { 238 | name: "watch (a person)" 239 | id: 80 240 | } 241 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/np_box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, 4] numpy arrays representing bounding boxes. 17 | 18 | Example box operations that are supported: 19 | * Areas: compute bounding box areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | from __future__ import ( 23 | absolute_import, 24 | division, 25 | print_function, 26 | unicode_literals, 27 | ) 28 | import numpy as np 29 | 30 | 31 | def area(boxes): 32 | """Computes area of boxes. 33 | 34 | Args: 35 | boxes: Numpy array with shape [N, 4] holding N boxes 36 | 37 | Returns: 38 | a numpy array with shape [N*1] representing box areas 39 | """ 40 | return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) 41 | 42 | 43 | def intersection(boxes1, boxes2): 44 | """Compute pairwise intersection areas between boxes. 
45 | 46 | Args: 47 | boxes1: a numpy array with shape [N, 4] holding N boxes 48 | boxes2: a numpy array with shape [M, 4] holding M boxes 49 | 50 | Returns: 51 | a numpy array with shape [N*M] representing pairwise intersection area 52 | """ 53 | [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) 54 | [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) 55 | 56 | all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) 57 | all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) 58 | intersect_heights = np.maximum( 59 | np.zeros(all_pairs_max_ymin.shape), 60 | all_pairs_min_ymax - all_pairs_max_ymin, 61 | ) 62 | all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) 63 | all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) 64 | intersect_widths = np.maximum( 65 | np.zeros(all_pairs_max_xmin.shape), 66 | all_pairs_min_xmax - all_pairs_max_xmin, 67 | ) 68 | return intersect_heights * intersect_widths 69 | 70 | 71 | def iou(boxes1, boxes2): 72 | """Computes pairwise intersection-over-union between box collections. 73 | 74 | Args: 75 | boxes1: a numpy array with shape [N, 4] holding N boxes. 76 | boxes2: a numpy array with shape [M, 4] holding N boxes. 77 | 78 | Returns: 79 | a numpy array with shape [N, M] representing pairwise iou scores. 80 | """ 81 | intersect = intersection(boxes1, boxes2) 82 | area1 = area(boxes1) 83 | area2 = area(boxes2) 84 | union = ( 85 | np.expand_dims(area1, axis=1) 86 | + np.expand_dims(area2, axis=0) 87 | - intersect 88 | ) 89 | return intersect / union 90 | 91 | 92 | def ioa(boxes1, boxes2): 93 | """Computes pairwise intersection-over-area between box collections. 94 | 95 | Intersection-over-area (ioa) between two boxes box1 and box2 is defined as 96 | their intersection area over box2's area. Note that ioa is not symmetric, 97 | that is, IOA(box1, box2) != IOA(box2, box1). 98 | 99 | Args: 100 | boxes1: a numpy array with shape [N, 4] holding N boxes. 101 | boxes2: a numpy array with shape [M, 4] holding N boxes. 102 | 103 | Returns: 104 | a numpy array with shape [N, M] representing pairwise ioa scores. 105 | """ 106 | intersect = intersection(boxes1, boxes2) 107 | areas = np.expand_dims(area(boxes2), axis=0) 108 | return intersect / areas 109 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/np_mask_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, height, width] numpy arrays representing masks. 
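A toy check of the `area`/`intersection`/`iou` helpers above, assuming the module is importable as `slowfast.utils.ava_evaluation.np_box_ops` per the tree at the top:

```python
import numpy as np
from slowfast.utils.ava_evaluation import np_box_ops

boxes1 = np.array([[0.0, 0.0, 1.0, 1.0]], dtype=np.float32)
boxes2 = np.array(
    [[0.0, 0.0, 1.0, 1.0], [0.5, 0.5, 1.5, 1.5]], dtype=np.float32
)
# Identical boxes -> IoU 1.0; 0.25 overlap over union 1.75 -> ~0.1429.
print(np_box_ops.iou(boxes1, boxes2))  # approximately [[1.0, 0.1429]]
```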
17 | 18 | Example mask operations that are supported: 19 | * Areas: compute mask areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | from __future__ import ( 23 | absolute_import, 24 | division, 25 | print_function, 26 | unicode_literals, 27 | ) 28 | import numpy as np 29 | 30 | EPSILON = 1e-7 31 | 32 | 33 | def area(masks): 34 | """Computes area of masks. 35 | 36 | Args: 37 | masks: Numpy array with shape [N, height, width] holding N masks. Masks 38 | values are of type np.uint8 and values are in {0,1}. 39 | 40 | Returns: 41 | a numpy array with shape [N*1] representing mask areas. 42 | 43 | Raises: 44 | ValueError: If masks.dtype is not np.uint8 45 | """ 46 | if masks.dtype != np.uint8: 47 | raise ValueError("Masks type should be np.uint8") 48 | return np.sum(masks, axis=(1, 2), dtype=np.float32) 49 | 50 | 51 | def intersection(masks1, masks2): 52 | """Compute pairwise intersection areas between masks. 53 | 54 | Args: 55 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 56 | values are of type np.uint8 and values are in {0,1}. 57 | masks2: a numpy array with shape [M, height, width] holding M masks. Masks 58 | values are of type np.uint8 and values are in {0,1}. 59 | 60 | Returns: 61 | a numpy array with shape [N*M] representing pairwise intersection area. 62 | 63 | Raises: 64 | ValueError: If masks1 and masks2 are not of type np.uint8. 65 | """ 66 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 67 | raise ValueError("masks1 and masks2 should be of type np.uint8") 68 | n = masks1.shape[0] 69 | m = masks2.shape[0] 70 | answer = np.zeros([n, m], dtype=np.float32) 71 | for i in np.arange(n): 72 | for j in np.arange(m): 73 | answer[i, j] = np.sum( 74 | np.minimum(masks1[i], masks2[j]), dtype=np.float32 75 | ) 76 | return answer 77 | 78 | 79 | def iou(masks1, masks2): 80 | """Computes pairwise intersection-over-union between mask collections. 81 | 82 | Args: 83 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 84 | values are of type np.uint8 and values are in {0,1}. 85 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 86 | values are of type np.uint8 and values are in {0,1}. 87 | 88 | Returns: 89 | a numpy array with shape [N, M] representing pairwise iou scores. 90 | 91 | Raises: 92 | ValueError: If masks1 and masks2 are not of type np.uint8. 93 | """ 94 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 95 | raise ValueError("masks1 and masks2 should be of type np.uint8") 96 | intersect = intersection(masks1, masks2) 97 | area1 = area(masks1) 98 | area2 = area(masks2) 99 | union = ( 100 | np.expand_dims(area1, axis=1) 101 | + np.expand_dims(area2, axis=0) 102 | - intersect 103 | ) 104 | return intersect / np.maximum(union, EPSILON) 105 | 106 | 107 | def ioa(masks1, masks2): 108 | """Computes pairwise intersection-over-area between box collections. 109 | 110 | Intersection-over-area (ioa) between two masks, mask1 and mask2 is defined as 111 | their intersection area over mask2's area. Note that ioa is not symmetric, 112 | that is, IOA(mask1, mask2) != IOA(mask2, mask1). 113 | 114 | Args: 115 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 116 | values are of type np.uint8 and values are in {0,1}. 117 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 118 | values are of type np.uint8 and values are in {0,1}. 119 | 120 | Returns: 121 | a numpy array with shape [N, M] representing pairwise ioa scores. 
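A matching toy check for the mask variant above (uint8 masks with values in {0, 1}):

```python
import numpy as np
from slowfast.utils.ava_evaluation import np_mask_ops

m1 = np.zeros((1, 4, 4), dtype=np.uint8)
m1[0, :2, :] = 1  # top half, area 8
m2 = np.zeros((1, 4, 4), dtype=np.uint8)
m2[0, :, :2] = 1  # left half, area 8
# Intersection 4, union 8 + 8 - 4 = 12 -> IoU 1/3.
print(np_mask_ops.iou(m1, m2))  # approximately [[0.3333]]
```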
122 | 123 | Raises: 124 | ValueError: If masks1 and masks2 are not of type np.uint8. 125 | """ 126 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 127 | raise ValueError("masks1 and masks2 should be of type np.uint8") 128 | intersect = intersection(masks1, masks2) 129 | areas = np.expand_dims(area(masks2), axis=0) 130 | return intersect / (areas + EPSILON) 131 | -------------------------------------------------------------------------------- /slowfast/utils/c2_model_loading.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Caffe2 to PyTorch checkpoint name converting utility.""" 5 | 6 | import re 7 | 8 | 9 | def get_name_convert_func(): 10 | """ 11 | Get the function to convert Caffe2 layer names to PyTorch layer names. 12 | Returns: 13 | (func): function to convert parameter name from Caffe2 format to PyTorch 14 | format. 15 | """ 16 | pairs = [ 17 | # ------------------------------------------------------------ 18 | # 'nonlocal_conv3_1_theta_w' -> 's3.pathway0_nonlocal3.conv_g.weight' 19 | [ 20 | r"^nonlocal_conv([0-9]+)_([0-9]+)_(.*)", 21 | r"s\1.pathway0_nonlocal\2_\3", 22 | ], 23 | # 'theta' -> 'conv_theta' 24 | [r"^(.*)_nonlocal([0-9]+)_(theta)(.*)", r"\1_nonlocal\2.conv_\3\4"], 25 | # 'g' -> 'conv_g' 26 | [r"^(.*)_nonlocal([0-9]+)_(g)(.*)", r"\1_nonlocal\2.conv_\3\4"], 27 | # 'phi' -> 'conv_phi' 28 | [r"^(.*)_nonlocal([0-9]+)_(phi)(.*)", r"\1_nonlocal\2.conv_\3\4"], 29 | # 'out' -> 'conv_out' 30 | [r"^(.*)_nonlocal([0-9]+)_(out)(.*)", r"\1_nonlocal\2.conv_\3\4"], 31 | # 'nonlocal_conv4_5_bn_s' -> 's4.pathway0_nonlocal3.bn.weight' 32 | [r"^(.*)_nonlocal([0-9]+)_(bn)_(.*)", r"\1_nonlocal\2.\3.\4"], 33 | # ------------------------------------------------------------ 34 | # 't_pool1_subsample_bn' -> 's1_fuse.conv_f2s.bn.running_mean' 35 | [r"^t_pool1_subsample_bn_(.*)", r"s1_fuse.bn.\1"], 36 | # 't_pool1_subsample' -> 's1_fuse.conv_f2s' 37 | [r"^t_pool1_subsample_(.*)", r"s1_fuse.conv_f2s.\1"], 38 | # 't_res4_5_branch2c_bn_subsample_bn_rm' -> 's4_fuse.conv_f2s.bias' 39 | [ 40 | r"^t_res([0-9]+)_([0-9]+)_branch2c_bn_subsample_bn_(.*)", 41 | r"s\1_fuse.bn.\3", 42 | ], 43 | # 't_pool1_subsample' -> 's1_fuse.conv_f2s' 44 | [ 45 | r"^t_res([0-9]+)_([0-9]+)_branch2c_bn_subsample_(.*)", 46 | r"s\1_fuse.conv_f2s.\3", 47 | ], 48 | # ------------------------------------------------------------ 49 | # 'res4_4_branch_2c_bn_b' -> 's4.pathway0_res4.branch2.c_bn_b' 50 | [ 51 | r"^res([0-9]+)_([0-9]+)_branch([0-9]+)([a-z])_(.*)", 52 | r"s\1.pathway0_res\2.branch\3.\4_\5", 53 | ], 54 | # 'res_conv1_bn_' -> 's1.pathway0_stem.bn.' 55 | [r"^res_conv1_bn_(.*)", r"s1.pathway0_stem.bn.\1"], 56 | # 'conv1_w_momentum' -> 's1.pathway0_stem.conv.' 57 | [r"^conv1_(.*)", r"s1.pathway0_stem.conv.\1"], 58 | # 'res4_0_branch1_w' -> 'S4.pathway0_res0.branch1.weight' 59 | [ 60 | r"^res([0-9]+)_([0-9]+)_branch([0-9]+)_(.*)", 61 | r"s\1.pathway0_res\2.branch\3_\4", 62 | ], 63 | # 'res_conv1_' -> 's1.pathway0_stem.conv.' 64 | [r"^res_conv1_(.*)", r"s1.pathway0_stem.conv.\1"], 65 | # ------------------------------------------------------------ 66 | # 'res4_4_branch_2c_bn_b' -> 's4.pathway0_res4.branch2.c_bn_b' 67 | [ 68 | r"^t_res([0-9]+)_([0-9]+)_branch([0-9]+)([a-z])_(.*)", 69 | r"s\1.pathway1_res\2.branch\3.\4_\5", 70 | ], 71 | # 'res_conv1_bn_' -> 's1.pathway0_stem.bn.' 
72 | [r"^t_res_conv1_bn_(.*)", r"s1.pathway1_stem.bn.\1"], 73 | # 'conv1_w_momentum' -> 's1.pathway0_stem.conv.' 74 | [r"^t_conv1_(.*)", r"s1.pathway1_stem.conv.\1"], 75 | # 'res4_0_branch1_w' -> 'S4.pathway0_res0.branch1.weight' 76 | [ 77 | r"^t_res([0-9]+)_([0-9]+)_branch([0-9]+)_(.*)", 78 | r"s\1.pathway1_res\2.branch\3_\4", 79 | ], 80 | # 'res_conv1_' -> 's1.pathway0_stem.conv.' 81 | [r"^t_res_conv1_(.*)", r"s1.pathway1_stem.conv.\1"], 82 | # ------------------------------------------------------------ 83 | # pred_ -> head.projection. 84 | [r"pred_(.*)", r"head.projection.\1"], 85 | # '.bn_b' -> '.weight' 86 | [r"(.*)bn.b\Z", r"\1bn.bias"], 87 | # '.bn_s' -> '.weight' 88 | [r"(.*)bn.s\Z", r"\1bn.weight"], 89 | # '_bn_rm' -> '.running_mean' 90 | [r"(.*)bn.rm\Z", r"\1bn.running_mean"], 91 | # '_bn_riv' -> '.running_var' 92 | [r"(.*)bn.riv\Z", r"\1bn.running_var"], 93 | # '_b' -> '.bias' 94 | [r"(.*)[\._]b\Z", r"\1.bias"], 95 | # '_w' -> '.weight' 96 | [r"(.*)[\._]w\Z", r"\1.weight"], 97 | ] 98 | 99 | def convert_caffe2_name_to_pytorch(caffe2_layer_name): 100 | """ 101 | Convert the caffe2_layer_name to pytorch format by apply the list of 102 | regular expressions. 103 | Args: 104 | caffe2_layer_name (str): caffe2 layer name. 105 | Returns: 106 | (str): pytorch layer name. 107 | """ 108 | for source, dest in pairs: 109 | caffe2_layer_name = re.sub(source, dest, caffe2_layer_name) 110 | return caffe2_layer_name 111 | 112 | return convert_caffe2_name_to_pytorch 113 | -------------------------------------------------------------------------------- /slowfast/datasets/loader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Data loader.""" 5 | 6 | import itertools 7 | import numpy as np 8 | import torch 9 | from torch.utils.data._utils.collate import default_collate 10 | from torch.utils.data.distributed import DistributedSampler 11 | from torch.utils.data.sampler import RandomSampler 12 | 13 | from slowfast.datasets.multigrid_helper import ShortCycleBatchSampler 14 | 15 | from .build import build_dataset 16 | 17 | 18 | def detection_collate(batch): 19 | """ 20 | Collate function for detection task. Concatanate bboxes, labels and 21 | metadata from different samples in the first dimension instead of 22 | stacking them to have a batch-size dimension. 23 | Args: 24 | batch (tuple or list): data batch to collate. 25 | Returns: 26 | (tuple): collated detection data batch. 27 | """ 28 | inputs, labels, video_idx, extra_data = zip(*batch) 29 | inputs, video_idx = default_collate(inputs), default_collate(video_idx) 30 | labels = torch.tensor(np.concatenate(labels, axis=0)).float() 31 | 32 | collated_extra_data = {} 33 | for key in extra_data[0].keys(): 34 | data = [d[key] for d in extra_data] 35 | if key == "boxes" or key == "ori_boxes": 36 | # Append idx info to the bboxes before concatenating them. 
37 | bboxes = [ 38 | np.concatenate( 39 | [np.full((data[i].shape[0], 1), float(i)), data[i]], axis=1 40 | ) 41 | for i in range(len(data)) 42 | ] 43 | bboxes = np.concatenate(bboxes, axis=0) 44 | collated_extra_data[key] = torch.tensor(bboxes).float() 45 | elif key == "step_idxes": 46 | collated_extra_data[key] = torch.tensor(np.concatenate(data, axis=0)) 47 | elif key == "metadata": 48 | collated_extra_data[key] = torch.tensor( 49 | list(itertools.chain(*data)) 50 | ).view(-1, 3) 51 | else: 52 | collated_extra_data[key] = default_collate(data) 53 | 54 | return inputs, labels, video_idx, collated_extra_data 55 | 56 | 57 | def construct_loader(cfg, split, is_precise_bn=False): 58 | """ 59 | Constructs the data loader for the given dataset. 60 | Args: 61 | cfg (CfgNode): configs. Details can be found in 62 | slowfast/config/defaults.py 63 | split (str): the split of the data loader. Options include `train`, 64 | `val`, and `test`. 65 | """ 66 | assert split in ["train", "val", "test"] 67 | if split in ["train"]: 68 | dataset_name = cfg.TRAIN.DATASET 69 | batch_size = int(cfg.TRAIN.BATCH_SIZE / cfg.NUM_GPUS) 70 | shuffle = True 71 | drop_last = True 72 | elif split in ["val"]: 73 | dataset_name = cfg.TRAIN.DATASET 74 | batch_size = int(cfg.TEST.BATCH_SIZE / cfg.NUM_GPUS) 75 | shuffle = False 76 | drop_last = False 77 | elif split in ["test"]: 78 | dataset_name = cfg.TEST.DATASET 79 | batch_size = int(cfg.TEST.BATCH_SIZE / cfg.NUM_GPUS) 80 | shuffle = False 81 | drop_last = False 82 | 83 | # Construct the dataset 84 | dataset = build_dataset(dataset_name, cfg, split) 85 | 86 | if cfg.MULTIGRID.SHORT_CYCLE and split in ["train"] and not is_precise_bn: 87 | # Create a sampler for multi-process training 88 | sampler = ( 89 | DistributedSampler(dataset) 90 | if cfg.NUM_GPUS > 1 91 | else RandomSampler(dataset) 92 | ) 93 | batch_sampler = ShortCycleBatchSampler( 94 | sampler, batch_size=batch_size, drop_last=drop_last, cfg=cfg 95 | ) 96 | # Create a loader 97 | loader = torch.utils.data.DataLoader( 98 | dataset, 99 | batch_sampler=batch_sampler, 100 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 101 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 102 | ) 103 | else: 104 | # Create a sampler for multi-process training 105 | sampler = DistributedSampler(dataset) if cfg.NUM_GPUS > 1 else None 106 | # Create a loader 107 | loader = torch.utils.data.DataLoader( 108 | dataset, 109 | batch_size=batch_size, 110 | shuffle=(False if sampler else shuffle), 111 | sampler=sampler, 112 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 113 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 114 | drop_last=drop_last, 115 | collate_fn=detection_collate if cfg.DETECTION.ENABLE else None, 116 | ) 117 | return loader 118 | 119 | 120 | def shuffle_dataset(loader, cur_epoch): 121 | """" 122 | Shuffles the data. 123 | Args: 124 | loader (loader): data loader to perform shuffle. 125 | cur_epoch (int): number of the current epoch. 
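The box-index bookkeeping in `detection_collate` above prepends each sample's position in the batch as a first column, so boxes from variable-length samples can share one flat tensor. A standalone sketch of that step with toy data:

```python
import numpy as np

per_sample_boxes = [
    np.array([[10.0, 10.0, 50.0, 50.0]]),                      # sample 0: 1 box
    np.array([[0.0, 0.0, 20.0, 20.0], [5.0, 5.0, 9.0, 9.0]]),  # sample 1: 2 boxes
]
# Prepend the sample index as a column, then concatenate along dim 0.
rows = [
    np.concatenate([np.full((b.shape[0], 1), float(i)), b], axis=1)
    for i, b in enumerate(per_sample_boxes)
]
print(np.concatenate(rows, axis=0))
# [[ 0. 10. 10. 50. 50.]
#  [ 1.  0.  0. 20. 20.]
#  [ 1.  5.  5.  9.  9.]]
```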
126 | """ 127 | sampler = ( 128 | loader.batch_sampler.sampler 129 | if isinstance(loader.batch_sampler, ShortCycleBatchSampler) 130 | else loader.sampler 131 | ) 132 | assert isinstance( 133 | sampler, (RandomSampler, DistributedSampler) 134 | ), "Sampler type '{}' not supported".format(type(sampler)) 135 | # RandomSampler handles shuffling automatically 136 | if isinstance(sampler, DistributedSampler): 137 | # DistributedSampler shuffles data based on epoch 138 | sampler.set_epoch(cur_epoch) 139 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/np_box_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Numpy BoxList classes and functions.""" 17 | 18 | from __future__ import ( 19 | absolute_import, 20 | division, 21 | print_function, 22 | unicode_literals, 23 | ) 24 | import numpy as np 25 | 26 | 27 | class BoxList(object): 28 | """Box collection. 29 | 30 | BoxList represents a list of bounding boxes as numpy array, where each 31 | bounding box is represented as a row of 4 numbers, 32 | [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within a 33 | given list correspond to a single image. 34 | 35 | Optionally, users can add additional related fields (such as 36 | objectness/classification scores). 37 | """ 38 | 39 | def __init__(self, data): 40 | """Constructs box collection. 41 | 42 | Args: 43 | data: a numpy array of shape [N, 4] representing box coordinates 44 | 45 | Raises: 46 | ValueError: if bbox data is not a numpy array 47 | ValueError: if invalid dimensions for bbox data 48 | """ 49 | if not isinstance(data, np.ndarray): 50 | raise ValueError("data must be a numpy array.") 51 | if len(data.shape) != 2 or data.shape[1] != 4: 52 | raise ValueError("Invalid dimensions for box data.") 53 | if data.dtype != np.float32 and data.dtype != np.float64: 54 | raise ValueError( 55 | "Invalid data type for box data: float is required." 56 | ) 57 | if not self._is_valid_boxes(data): 58 | raise ValueError( 59 | "Invalid box data. data must be a numpy array of " 60 | "N*[y_min, x_min, y_max, x_max]" 61 | ) 62 | self.data = {"boxes": data} 63 | 64 | def num_boxes(self): 65 | """Return number of boxes held in collections.""" 66 | return self.data["boxes"].shape[0] 67 | 68 | def get_extra_fields(self): 69 | """Return all non-box fields.""" 70 | return [k for k in self.data.keys() if k != "boxes"] 71 | 72 | def has_field(self, field): 73 | return field in self.data 74 | 75 | def add_field(self, field, field_data): 76 | """Add data to a specified field. 77 | 78 | Args: 79 | field: a string parameter used to speficy a related field to be accessed. 80 | field_data: a numpy array of [N, ...] 
representing the data associated
81 |             with the field.
82 |         Raises:
83 |             ValueError: if the field already exists or the dimension of the
84 |                 field data does not match the number of boxes.
85 |         """
86 |         if self.has_field(field):
87 |             raise ValueError("Field " + field + " already exists")
88 |         if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes():
89 |             raise ValueError("Invalid dimensions for field data")
90 |         self.data[field] = field_data
91 | 
92 |     def get(self):
93 |         """Convenience function for accessing box coordinates.
94 | 
95 |         Returns:
96 |             a numpy array of shape [N, 4] representing box corners
97 |         """
98 |         return self.get_field("boxes")
99 | 
100 |     def get_field(self, field):
101 |         """Accesses data associated with the specified field in the box collection.
102 | 
103 |         Args:
104 |             field: a string parameter used to specify a related field to be accessed.
105 | 
106 |         Returns:
107 |             a numpy 1-d array representing data of an associated field
108 | 
109 |         Raises:
110 |             ValueError: if invalid field
111 |         """
112 |         if not self.has_field(field):
113 |             raise ValueError("field {} does not exist".format(field))
114 |         return self.data[field]
115 | 
116 |     def get_coordinates(self):
117 |         """Get corner coordinates of boxes.
118 | 
119 |         Returns:
120 |             a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max]
121 |         """
122 |         box_coordinates = self.get()
123 |         y_min = box_coordinates[:, 0]
124 |         x_min = box_coordinates[:, 1]
125 |         y_max = box_coordinates[:, 2]
126 |         x_max = box_coordinates[:, 3]
127 |         return [y_min, x_min, y_max, x_max]
128 | 
129 |     def _is_valid_boxes(self, data):
130 |         """Check whether data fulfills the format of N*[y_min, x_min, y_max, x_max].
131 | 
132 |         Args:
133 |             data: a numpy array of shape [N, 4] representing box coordinates
134 | 
135 |         Returns:
136 |             a boolean indicating whether all ymax of boxes are equal or greater than
137 |                 ymin, and all xmax of boxes are equal or greater than xmin.
138 |         """
139 |         if data.shape[0] > 0:
140 |             for i in range(data.shape[0]):
141 |                 if data[i, 0] > data[i, 2] or data[i, 1] > data[i, 3]:
142 |                     return False
143 |         return True
144 | 
--------------------------------------------------------------------------------
/slowfast/utils/ava_evaluation/metrics.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Functions for computing metrics like precision, recall, CorLoc, etc."""
17 | from __future__ import division
18 | import numpy as np
19 | 
20 | 
21 | def compute_precision_recall(scores, labels, num_gt):
22 |     """Compute precision and recall.
/slowfast/utils/ava_evaluation/metrics.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Functions for computing metrics like precision, recall, and CorLoc."""
17 | from __future__ import division
18 | import numpy as np
19 | 
20 | 
21 | def compute_precision_recall(scores, labels, num_gt):
22 |     """Compute precision and recall.
23 | 
24 |     Args:
25 |         scores: A float numpy array representing detection score
26 |         labels: A boolean numpy array representing true/false positive labels
27 |         num_gt: Number of ground truth instances
28 | 
29 |     Raises:
30 |         ValueError: if the input is not of the correct format
31 | 
32 |     Returns:
33 |         precision: Fraction of positive instances over detected ones. This value is
34 |             None if no ground truth labels are present.
35 |         recall: Fraction of detected positive instances over all positive instances.
36 |             This value is None if no ground truth labels are present.
37 | 
38 |     """
39 |     if (
40 |         not isinstance(labels, np.ndarray)
41 |         or labels.dtype != bool  # `np.bool` was removed in modern NumPy
42 |         or len(labels.shape) != 1
43 |     ):
44 |         raise ValueError("labels must be single dimension bool numpy array")
45 | 
46 |     if not isinstance(scores, np.ndarray) or len(scores.shape) != 1:
47 |         raise ValueError("scores must be single dimension numpy array")
48 | 
49 |     if num_gt < np.sum(labels):
50 |         raise ValueError(
51 |             "Number of true positives must not exceed num_gt."
52 |         )
53 | 
54 |     if len(scores) != len(labels):
55 |         raise ValueError("scores and labels must be of the same size.")
56 | 
57 |     if num_gt == 0:
58 |         return None, None
59 | 
60 |     sorted_indices = np.argsort(scores)
61 |     sorted_indices = sorted_indices[::-1]
62 |     labels = labels.astype(int)
63 |     true_positive_labels = labels[sorted_indices]
64 |     false_positive_labels = 1 - true_positive_labels
65 |     cum_true_positives = np.cumsum(true_positive_labels)
66 |     cum_false_positives = np.cumsum(false_positive_labels)
67 |     precision = cum_true_positives.astype(float) / (
68 |         cum_true_positives + cum_false_positives
69 |     )
70 |     recall = cum_true_positives.astype(float) / num_gt
71 |     return precision, recall
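# Editor's note: a worked example of compute_precision_recall() above and
# compute_average_precision() below, with hand-checked numbers:
#
#   scores = np.array([0.9, 0.8, 0.7, 0.6])
#   labels = np.array([True, False, True, True])
#   precision, recall = compute_precision_recall(scores, labels, num_gt=4)
#   # precision == [1.0, 0.5, 2/3, 0.75]; recall == [0.25, 0.25, 0.5, 0.75]
#   ap = compute_average_precision(precision, recall)
#   # == 0.25 * 1.0 + 0.25 * 0.75 + 0.25 * 0.75 + 0.25 * 0.0 == 0.625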
72 | 
73 | 
74 | def compute_average_precision(precision, recall):
75 |     """Compute Average Precision according to the definition in VOCdevkit.
76 | 
77 |     Precision is modified to ensure that it does not decrease as recall
78 |     decreases.
79 | 
80 |     Args:
81 |         precision: A float [N, 1] numpy array of precisions
82 |         recall: A float [N, 1] numpy array of recalls
83 | 
84 |     Raises:
85 |         ValueError: if the input is not of the correct format
86 | 
87 |     Returns:
88 |         average_precision: The area under the precision recall curve. NaN if
89 |             precision and recall are None.
90 | 
91 |     """
92 |     if precision is None:
93 |         if recall is not None:
94 |             raise ValueError("If precision is None, recall must also be None")
95 |         return np.nan  # `np.NAN` alias was removed in NumPy 2.0
96 | 
97 |     if not isinstance(precision, np.ndarray) or not isinstance(
98 |         recall, np.ndarray
99 |     ):
100 |         raise ValueError("precision and recall must be numpy array")
101 |     if precision.dtype != np.float64 or recall.dtype != np.float64:  # `np.float` alias removed in modern NumPy
102 |         raise ValueError("input must be float numpy array.")
103 |     if len(precision) != len(recall):
104 |         raise ValueError("precision and recall must be of the same size.")
105 |     if not precision.size:
106 |         return 0.0
107 |     if np.amin(precision) < 0 or np.amax(precision) > 1:
108 |         raise ValueError("Precision must be in the range of [0, 1].")
109 |     if np.amin(recall) < 0 or np.amax(recall) > 1:
110 |         raise ValueError("recall must be in the range of [0, 1].")
111 |     if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)):
112 |         raise ValueError("recall must be a non-decreasing array")
113 | 
114 |     recall = np.concatenate([[0], recall, [1]])
115 |     precision = np.concatenate([[0], precision, [0]])
116 | 
117 |     # Preprocess precision to be a non-decreasing array
118 |     for i in range(len(precision) - 2, -1, -1):
119 |         precision[i] = np.maximum(precision[i], precision[i + 1])
120 | 
121 |     indices = np.where(recall[1:] != recall[:-1])[0] + 1
122 |     average_precision = np.sum(
123 |         (recall[indices] - recall[indices - 1]) * precision[indices]
124 |     )
125 |     return average_precision
126 | 
127 | 
128 | def compute_cor_loc(
129 |     num_gt_imgs_per_class, num_images_correctly_detected_per_class
130 | ):
131 |     """Compute CorLoc according to the definition in the following paper.
132 | 
133 |     https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf
134 | 
135 |     Returns nans if there are no ground truth images for a class.
136 | 
137 |     Args:
138 |         num_gt_imgs_per_class: 1D array, representing number of images containing
139 |             at least one object instance of a particular class
140 |         num_images_correctly_detected_per_class: 1D array, representing number of
141 |             images in which at least one object instance of a particular class
142 |             was correctly detected
143 | 
144 |     Returns:
145 |         corloc_per_class: A float numpy array representing the CorLoc score of
146 |             each class
147 |     """
148 |     # Divide by zero expected for classes with no gt examples.
149 |     with np.errstate(divide="ignore", invalid="ignore"):
150 |         return np.where(
151 |             num_gt_imgs_per_class == 0,
152 |             np.nan,
153 |             num_images_correctly_detected_per_class / num_gt_imgs_per_class,
154 |         )
155 | 
--------------------------------------------------------------------------------
/slowfast/models/backbones/x3d.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import torch.nn as nn
4 | 
5 | import slowfast.utils.weight_init_helper as init_helper
6 | from slowfast.models.batchnorm_helper import get_norm
7 | 
8 | from .. import head_helper, resnet_helper, stem_helper
9 | from ..build import MODEL_REGISTRY
10 | from . import _MODEL_STAGE_DEPTH, _TEMPORAL_KERNEL_BASIS, _POOL1
11 | 
12 | 
13 | @MODEL_REGISTRY.register()
14 | class X3D(nn.Module):
15 |     """
16 |     X3D model builder. It builds a X3D network backbone, which is a ResNet.
17 |     Christoph Feichtenhofer.
18 |     "X3D: Expanding Architectures for Efficient Video Recognition."
19 |     https://arxiv.org/abs/2004.04730
20 |     """
21 | 
22 |     def __init__(self, cfg):
23 |         """
24 |         The `__init__` method of any subclass should also contain these
25 |         arguments.
26 | Args: 27 | cfg (CfgNode): model building configs, details are in the 28 | comments of the config file. 29 | """ 30 | super(X3D, self).__init__() 31 | self.norm_module = get_norm(cfg) 32 | self.enable_detection = cfg.DETECTION.ENABLE 33 | self.num_pathways = 1 34 | 35 | exp_stage = 2.0 36 | self.dim_c1 = cfg.X3D.DIM_C1 37 | 38 | self.dim_res2 = ( 39 | self._round_width(self.dim_c1, exp_stage, divisor=8) 40 | if cfg.X3D.SCALE_RES2 41 | else self.dim_c1 42 | ) 43 | self.dim_res3 = self._round_width(self.dim_res2, exp_stage, divisor=8) 44 | self.dim_res4 = self._round_width(self.dim_res3, exp_stage, divisor=8) 45 | self.dim_res5 = self._round_width(self.dim_res4, exp_stage, divisor=8) 46 | 47 | self.block_basis = [ 48 | # blocks, c, stride 49 | [1, self.dim_res2, 2], 50 | [2, self.dim_res3, 2], 51 | [5, self.dim_res4, 2], 52 | [3, self.dim_res5, 2], 53 | ] 54 | self._construct_network(cfg) 55 | init_helper.init_weights( 56 | self, cfg.MODEL.FC_INIT_STD, cfg.RESNET.ZERO_INIT_FINAL_BN 57 | ) 58 | 59 | def _round_width(self, width, multiplier, min_depth=8, divisor=8): 60 | """Round width of filters based on width multiplier.""" 61 | if not multiplier: 62 | return width 63 | 64 | width *= multiplier 65 | min_depth = min_depth or divisor 66 | new_filters = max( 67 | min_depth, int(width + divisor / 2) // divisor * divisor 68 | ) 69 | if new_filters < 0.9 * width: 70 | new_filters += divisor 71 | return int(new_filters) 72 | 73 | def _round_repeats(self, repeats, multiplier): 74 | """Round number of layers based on depth multiplier.""" 75 | multiplier = multiplier 76 | if not multiplier: 77 | return repeats 78 | return int(math.ceil(multiplier * repeats)) 79 | 80 | def _construct_network(self, cfg): 81 | """ 82 | Builds a single pathway X3D model. 83 | Args: 84 | cfg (CfgNode): model building configs, details are in the 85 | comments of the config file. 
86 | """ 87 | assert cfg.MODEL.ARCH in _POOL1.keys() 88 | assert cfg.RESNET.DEPTH in _MODEL_STAGE_DEPTH.keys() 89 | 90 | (d2, d3, d4, d5) = _MODEL_STAGE_DEPTH[cfg.RESNET.DEPTH] 91 | 92 | num_groups = cfg.RESNET.NUM_GROUPS 93 | width_per_group = cfg.RESNET.WIDTH_PER_GROUP 94 | dim_inner = num_groups * width_per_group 95 | 96 | w_mul = cfg.X3D.WIDTH_FACTOR 97 | d_mul = cfg.X3D.DEPTH_FACTOR 98 | dim_res1 = self._round_width(self.dim_c1, w_mul) 99 | 100 | temp_kernel = _TEMPORAL_KERNEL_BASIS[cfg.MODEL.ARCH] 101 | 102 | self.s1 = stem_helper.VideoModelStem( 103 | cfg=cfg, 104 | dim_in=cfg.DATA.INPUT_CHANNEL_NUM, 105 | dim_out=[dim_res1], 106 | kernel=[temp_kernel[0][0] + [3, 3]], 107 | stride=[[1, 2, 2]], 108 | padding=[[temp_kernel[0][0][0] // 2, 1, 1]], 109 | norm_module=self.norm_module, 110 | stem_func_name="x3d_stem", 111 | ) 112 | 113 | # blob_in = s1 114 | dim_in = dim_res1 115 | for stage, block in enumerate(self.block_basis): 116 | dim_out = self._round_width(block[1], w_mul) 117 | dim_inner = int(cfg.X3D.BOTTLENECK_FACTOR * dim_out) 118 | 119 | n_rep = self._round_repeats(block[0], d_mul) 120 | prefix = "s{}".format( 121 | stage + 2 122 | ) # start w res2 to follow convention 123 | 124 | s = resnet_helper.ResStage( 125 | cfg=cfg, 126 | dim_in=[dim_in], 127 | dim_out=[dim_out], 128 | dim_inner=[dim_inner], 129 | temp_kernel_sizes=temp_kernel[1], 130 | stride=[block[2]], 131 | num_blocks=[n_rep], 132 | num_groups=[dim_inner] 133 | if cfg.X3D.CHANNELWISE_3x3x3 134 | else [num_groups], 135 | num_block_temp_kernel=[n_rep], 136 | nonlocal_inds=cfg.NONLOCAL.LOCATION[0], 137 | nonlocal_group=cfg.NONLOCAL.GROUP[0], 138 | nonlocal_pool=cfg.NONLOCAL.POOL[0], 139 | nonlocal_progress=cfg.NONLOCAL.PROGRESS, 140 | nonlocal_use_bn=cfg.NONLOCAL.USE_BN, 141 | instantiation=cfg.NONLOCAL.INSTANTIATION, 142 | trans_func_name=cfg.RESNET.TRANS_FUNC, 143 | stride_1x1=cfg.RESNET.STRIDE_1X1, 144 | norm_module=self.norm_module, 145 | dilation=cfg.RESNET.SPATIAL_DILATIONS[stage], 146 | temp_progress=cfg.PGT.ENABLE, 147 | ) 148 | dim_in = dim_out 149 | self.add_module(prefix, s) 150 | 151 | if self.enable_detection: 152 | NotImplementedError 153 | else: 154 | spat_sz = int(math.ceil(cfg.DATA.TRAIN_CROP_SIZE / 32.0)) 155 | self.head = head_helper.X3DHead( 156 | cfg=cfg, 157 | dim_in=dim_out, 158 | dim_inner=dim_inner, 159 | dim_out=cfg.X3D.DIM_C5, 160 | num_classes=cfg.MODEL.NUM_CLASSES, 161 | pool_size=[cfg.DATA.NUM_FRAMES, spat_sz, spat_sz], 162 | pool_type=cfg.MODEL.FINAL_POOL, 163 | dropout_rate=cfg.MODEL.DROPOUT_RATE, 164 | act_func=cfg.MODEL.HEAD_ACT, 165 | bn_lin5_on=cfg.X3D.BN_LIN5, 166 | ) 167 | 168 | def forward(self, x, bboxes=None): 169 | for module in self.children(): 170 | x = module(x) 171 | return x -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/label_map_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/slowfast/utils/ava_evaluation/label_map_util.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Label map utility functions."""
16 | 
17 | from __future__ import (
18 |     absolute_import,
19 |     division,
20 |     print_function,
21 |     unicode_literals,
22 | )
23 | import logging
24 | 
25 | # from google.protobuf import text_format
26 | # from google3.third_party.tensorflow_models.object_detection.protos import string_int_label_map_pb2
# NOTE: the commented imports above are required by load_labelmap() and
# get_label_map_dict() further below; restore them (with paths valid for your
# environment) before calling those functions, or they will raise NameError.
27 | 
28 | 
29 | def _validate_label_map(label_map):
30 |     """Checks if a label map is valid.
31 | 
32 |     Args:
33 |         label_map: StringIntLabelMap to validate.
34 | 
35 |     Raises:
36 |         ValueError: if label map is invalid.
37 |     """
38 |     for item in label_map.item:
39 |         if item.id < 1:
40 |             raise ValueError("Label map ids should be >= 1.")
41 | 
42 | 
43 | def create_category_index(categories):
44 |     """Creates dictionary of COCO compatible categories keyed by category id.
45 | 
46 |     Args:
47 |         categories: a list of dicts, each of which has the following keys:
48 |             'id': (required) an integer id uniquely identifying this category.
49 |             'name': (required) string representing category name
50 |                 e.g., 'cat', 'dog', 'pizza'.
51 | 
52 |     Returns:
53 |         category_index: a dict containing the same entries as categories, but keyed
54 |             by the 'id' field of each category.
55 |     """
56 |     category_index = {}
57 |     for cat in categories:
58 |         category_index[cat["id"]] = cat
59 |     return category_index
60 | 
61 | 
62 | def get_max_label_map_index(label_map):
63 |     """Get maximum index in label map.
64 | 
65 |     Args:
66 |         label_map: a StringIntLabelMapProto
67 | 
68 |     Returns:
69 |         an integer
70 |     """
71 |     return max([item.id for item in label_map.item])
72 | 
73 | 
74 | def convert_label_map_to_categories(
75 |     label_map, max_num_classes, use_display_name=True
76 | ):
77 |     """Loads label map proto and returns categories list compatible with eval.
78 | 
79 |     This function loads a label map and returns a list of dicts, each of which
80 |     has the following keys:
81 |         'id': (required) an integer id uniquely identifying this category.
82 |         'name': (required) string representing category name
83 |             e.g., 'cat', 'dog', 'pizza'.
84 |     A class is only included in the list if its id minus label_id_offset is
85 |     between 0 (inclusive) and max_num_classes (exclusive).
86 |     If there are several items mapping to the same id in the label map,
87 |     we will only keep the first one in the categories list.
88 | 
89 |     Args:
90 |         label_map: a StringIntLabelMapProto or None. If None, a default categories
91 |             list is created with max_num_classes categories.
92 |         max_num_classes: maximum number of (consecutive) label indices to include.
93 |         use_display_name: (boolean) choose whether to load 'display_name' field
94 |             as category name. If False or if the display_name field does not exist,
95 |             uses 'name' field as category names instead.
96 |     Returns:
97 |         categories: a list of dictionaries representing all possible categories.
98 | """ 99 | categories = [] 100 | list_of_ids_already_added = [] 101 | if not label_map: 102 | label_id_offset = 1 103 | for class_id in range(max_num_classes): 104 | categories.append( 105 | { 106 | "id": class_id + label_id_offset, 107 | "name": "category_{}".format(class_id + label_id_offset), 108 | } 109 | ) 110 | return categories 111 | for item in label_map.item: 112 | if not 0 < item.id <= max_num_classes: 113 | logging.info( 114 | "Ignore item %d since it falls outside of requested " 115 | "label range.", 116 | item.id, 117 | ) 118 | continue 119 | if use_display_name and item.HasField("display_name"): 120 | name = item.display_name 121 | else: 122 | name = item.name 123 | if item.id not in list_of_ids_already_added: 124 | list_of_ids_already_added.append(item.id) 125 | categories.append({"id": item.id, "name": name}) 126 | return categories 127 | 128 | 129 | def load_labelmap(path): 130 | """Loads label map proto. 131 | 132 | Args: 133 | path: path to StringIntLabelMap proto text file. 134 | Returns: 135 | a StringIntLabelMapProto 136 | """ 137 | with open(path, "r") as fid: 138 | label_map_string = fid.read() 139 | label_map = string_int_label_map_pb2.StringIntLabelMap() 140 | try: 141 | text_format.Merge(label_map_string, label_map) 142 | except text_format.ParseError: 143 | label_map.ParseFromString(label_map_string) 144 | _validate_label_map(label_map) 145 | return label_map 146 | 147 | 148 | def get_label_map_dict(label_map_path, use_display_name=False): 149 | """Reads a label map and returns a dictionary of label names to id. 150 | 151 | Args: 152 | label_map_path: path to label_map. 153 | use_display_name: whether to use the label map items' display names as keys. 154 | 155 | Returns: 156 | A dictionary mapping label names to id. 157 | """ 158 | label_map = load_labelmap(label_map_path) 159 | label_map_dict = {} 160 | for item in label_map.item: 161 | if use_display_name: 162 | label_map_dict[item.display_name] = item.id 163 | else: 164 | label_map_dict[item.name] = item.id 165 | return label_map_dict 166 | 167 | 168 | def create_category_index_from_labelmap(label_map_path): 169 | """Reads a label map and returns a category index. 170 | 171 | Args: 172 | label_map_path: Path to `StringIntLabelMap` proto text file. 173 | 174 | Returns: 175 | A category index, which is a dictionary that maps integer ids to dicts 176 | containing categories, e.g. 177 | {1: {'id': 1, 'name': 'dog'}, 2: {'id': 2, 'name': 'cat'}, ...} 178 | """ 179 | label_map = load_labelmap(label_map_path) 180 | max_num_classes = max(item.id for item in label_map.item) 181 | categories = convert_label_map_to_categories(label_map, max_num_classes) 182 | return create_category_index(categories) 183 | 184 | 185 | def create_class_agnostic_category_index(): 186 | """Creates a category index with a single `object` class.""" 187 | return {1: {"id": 1, "name": "object"}} 188 | -------------------------------------------------------------------------------- /tools/test_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
3 | 
4 | """Multi-view test a video classification model."""
5 | 
6 | import numpy as np
7 | import torch
8 | from tqdm import tqdm
9 | 
10 | import slowfast.utils.checkpoint as cu
11 | import slowfast.utils.distributed as du
12 | import slowfast.utils.logging as logging
13 | import slowfast.utils.misc as misc
14 | from slowfast.datasets import loader
15 | from slowfast.models import build_model
16 | from slowfast.utils.meters import AVAMeter, TestMeter
17 | from slowfast.models.progress_helper import ProgressTrainer
# Editor's note: `tb` is referenced in test() below (the cfg.TENSORBOARD.ENABLE
# path) but was never imported. The module path below is assumed from upstream
# SlowFast; it is guarded so testing still works when the module is absent and
# TENSORBOARD is disabled.
try:
    import slowfast.visualization.tensorboard_vis as tb
except ImportError:
    tb = None
18 | 
19 | logger = logging.get_logger(__name__)
20 | 
21 | 
22 | @torch.no_grad()
23 | def perform_test(test_loader, model, test_meter, cfg, writer=None, cur_epoch=None):
24 |     """
25 |     For classification:
26 |     Perform multi-view testing that uniformly samples N clips from a video along
27 |     its temporal axis. For each clip, it takes 3 crops to cover the spatial
28 |     dimension, followed by averaging the softmax scores across all Nx3 views to
29 |     form a video-level prediction. All video predictions are compared to
30 |     ground-truth labels and the final testing performance is logged.
31 |     For detection:
32 |     Perform fully-convolutional testing on the full frames without crop.
33 |     Args:
34 |         test_loader (loader): video testing loader.
35 |         model (model): the pretrained video model to test.
36 |         test_meter (TestMeter): testing meters to log and ensemble the testing
37 |             results.
38 |         cfg (CfgNode): configs. Details can be found in
39 |             slowfast/config/defaults.py
40 |         writer (TensorboardWriter object, optional): TensorboardWriter object
41 |             to write Tensorboard logs.
        cur_epoch (int, optional): current epoch; used by progressive (PGT)
            evaluation and metric finalization.
42 |     """
43 |     # Enable eval mode.
44 |     model.eval()
45 |     test_meter.iter_tic()
46 | 
47 |     if cfg.PGT.ENABLE:
48 |         pg_trainer = ProgressTrainer(model, cfg, cur_epoch)
49 | 
50 |     if du.get_world_size() == 1:
51 |         extra_args = {}
52 |     else:
53 |         rank = du.get_rank()
54 |         extra_args = dict(desc="rank {}".format(rank))
55 | 
56 |     for _, (inputs, labels, video_idx, meta) in enumerate(tqdm(test_loader, **extra_args)):
57 |         # Transfer the data to the current GPU device.
58 |         if isinstance(inputs, (list,)):
59 |             for i in range(len(inputs)):
60 |                 inputs[i] = inputs[i].cuda(non_blocking=True)
61 |         else:
62 |             inputs = inputs.cuda(non_blocking=True)
63 | 
64 |         # Transfer the labels and metadata to the current GPU device.
65 |         labels = labels.cuda()
66 |         video_idx = video_idx.cuda()
67 |         for key, val in meta.items():
68 |             if isinstance(val, (list,)):
69 |                 for i in range(len(val)):
70 |                     val[i] = val[i].cuda(non_blocking=True)
71 |             else:
72 |                 meta[key] = val.cuda(non_blocking=True)
73 | 
74 |         if cfg.DETECTION.ENABLE:
75 |             # Compute the predictions.
76 | if not cfg.PGT.ENABLE: 77 | preds = model(inputs, meta["boxes"]) 78 | else: 79 | # Take the meta of last step 80 | if not cfg.PGT.ALL_STEP_TEST: 81 | step_idx = pg_trainer.steps - 1 82 | meta["boxes"] = meta["boxes"][meta["step_idxes"] == step_idx] 83 | meta["ori_boxes"] = meta["ori_boxes"][meta["step_idxes"] == step_idx] 84 | meta["metadata"] = meta["metadata"][meta["step_idxes"] == step_idx] 85 | preds = pg_trainer.step_eval(inputs, meta["boxes"]) 86 | else: 87 | preds = pg_trainer.step_eval(inputs, meta["boxes"], meta["step_idxes"]) 88 | preds = preds.cpu() 89 | ori_boxes = meta["ori_boxes"].cpu() 90 | metadata = meta["metadata"].cpu() 91 | 92 | if cfg.NUM_GPUS > 1: 93 | preds = torch.cat(du.all_gather_unaligned(preds), dim=0) 94 | ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes), dim=0) 95 | metadata = torch.cat(du.all_gather_unaligned(metadata), dim=0) 96 | 97 | test_meter.iter_toc() 98 | # Update and log stats. 99 | test_meter.update_stats( 100 | preds.detach().cpu(), 101 | ori_boxes.detach().cpu(), 102 | metadata.detach().cpu(), 103 | ) 104 | else: 105 | # Perform the forward pass. 106 | if not cfg.PGT.ENABLE: 107 | preds = model(inputs) 108 | else: 109 | preds = pg_trainer.step_eval(inputs) 110 | 111 | # Gather all the predictions across all the devices to perform ensemble. 112 | if cfg.NUM_GPUS > 1: 113 | preds, labels, video_idx = du.all_gather( 114 | [preds, labels, video_idx] 115 | ) 116 | 117 | test_meter.iter_toc() 118 | # Update and log stats. 119 | test_meter.update_stats( 120 | preds.detach().cpu(), 121 | labels.detach().cpu(), 122 | video_idx.detach().cpu(), 123 | ) 124 | 125 | test_meter.iter_tic() 126 | # Log epoch stats and print the final testing results. 127 | if writer is not None: 128 | all_preds_cpu = [ 129 | pred.clone().detach().cpu() for pred in test_meter.video_preds 130 | ] 131 | all_labels_cpu = [ 132 | label.clone().detach().cpu() for label in test_meter.video_labels 133 | ] 134 | writer.plot_eval(preds=all_preds_cpu, labels=all_labels_cpu) 135 | 136 | test_meter.finalize_metrics(cur_epoch=cur_epoch) 137 | test_meter.reset() 138 | 139 | 140 | def test(cfg): 141 | """ 142 | Perform multi-view testing on the pretrained video model. 143 | Args: 144 | cfg (CfgNode): configs. Details can be found in 145 | slowfast/config/defaults.py 146 | """ 147 | # Set up environment. 148 | filename = "test" if cfg.LOGS.FILE_NAME == "" else cfg.LOGS.FILE_NAME 149 | logging.setup_logger(cfg, filename) 150 | du.init_distributed_training(cfg) 151 | # Set random seed from configs. 152 | np.random.seed(cfg.RNG_SEED) 153 | torch.manual_seed(cfg.RNG_SEED) 154 | 155 | # Print config. 156 | logger.info("Test with config:") 157 | logger.info(cfg) 158 | 159 | # Build the video model and print model statistics. 160 | model = build_model(cfg) 161 | if du.is_master_proc() and cfg.LOGS.LOG_MODEL: 162 | misc.log_model_info(model, cfg, use_train_input=False) 163 | 164 | cu.load_test_checkpoint(cfg, model) 165 | 166 | # Create video testing loaders. 
167 | test_loader = loader.construct_loader(cfg, "test") 168 | logger.info("Testing model for {} iterations".format(len(test_loader))) 169 | logger.info("Testing contains {} videos".format(len(test_loader.dataset))) 170 | 171 | if cfg.DETECTION.ENABLE: 172 | assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE 173 | test_meter = AVAMeter(len(test_loader), cfg, mode="test") 174 | else: 175 | assert ( 176 | len(test_loader.dataset) 177 | % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS) 178 | == 0 179 | ) 180 | # Create meters for multi-view testing. 181 | test_meter = TestMeter( 182 | len(test_loader.dataset) 183 | // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS), 184 | cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS, 185 | cfg.MODEL.NUM_CLASSES, 186 | len(test_loader), 187 | cfg.DATA.MULTI_LABEL, 188 | cfg.DATA.ENSEMBLE_METHOD, 189 | ) 190 | 191 | # Set up writer for logging to Tensorboard format. 192 | if cfg.TENSORBOARD.ENABLE and du.is_master_proc( 193 | cfg.NUM_GPUS * cfg.NUM_SHARDS 194 | ): 195 | writer = tb.TensorboardWriter(cfg) 196 | else: 197 | writer = None 198 | 199 | # # Perform multi-view test on the entire dataset. 200 | perform_test(test_loader, model, test_meter, cfg, writer) 201 | if writer is not None: 202 | writer.close() 203 | 204 | logger.info(f"Testing completed. Log directory: {cfg.LOGS.DIR}") 205 | -------------------------------------------------------------------------------- /slowfast/models/nonlocal_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Non-local helper""" 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class Nonlocal(nn.Module): 11 | """ 12 | Builds Non-local Neural Networks as a generic family of building 13 | blocks for capturing long-range dependencies. Non-local Network 14 | computes the response at a position as a weighted sum of the 15 | features at all positions. This building block can be plugged into 16 | many computer vision architectures. 17 | More details in the paper: https://arxiv.org/pdf/1711.07971.pdf 18 | """ 19 | 20 | def __init__( 21 | self, 22 | dim, 23 | dim_inner, 24 | pool_size=None, 25 | instantiation="softmax", 26 | zero_init_final_conv=False, 27 | zero_init_final_norm=True, 28 | use_bn=True, 29 | norm_eps=1e-5, 30 | norm_momentum=0.1, 31 | norm_module=nn.BatchNorm3d, 32 | ): 33 | """ 34 | Args: 35 | dim (int): number of dimension for the input. 36 | dim_inner (int): number of dimension inside of the Non-local block. 37 | pool_size (list): the kernel size of spatial temporal pooling, 38 | temporal pool kernel size, spatial pool kernel size, spatial 39 | pool kernel size in order. By default pool_size is None, 40 | then there would be no pooling used. 41 | instantiation (string): supports two different instantiation method: 42 | "dot_product": normalizing correlation matrix with L2. 43 | "softmax": normalizing correlation matrix with Softmax. 44 | zero_init_final_conv (bool): If true, zero initializing the final 45 | convolution of the Non-local block. 46 | zero_init_final_norm (bool): 47 | If true, zero initializing the final batch norm of the Non-local 48 | block. 49 | norm_module (nn.Module): nn.Module for the normalization layer. The 50 | default is nn.BatchNorm3d. 
51 | """ 52 | super(Nonlocal, self).__init__() 53 | self.dim = dim 54 | self.dim_inner = dim_inner 55 | self.pool_size = pool_size 56 | self.instantiation = instantiation 57 | self.use_pool = ( 58 | False 59 | if pool_size is None 60 | else any((size > 1 for size in pool_size)) 61 | ) 62 | self.use_bn = use_bn 63 | self.norm_eps = norm_eps 64 | self.norm_momentum = norm_momentum 65 | self._construct_nonlocal( 66 | zero_init_final_conv, zero_init_final_norm, norm_module 67 | ) 68 | 69 | def _construct_nonlocal( 70 | self, zero_init_final_conv, zero_init_final_norm, norm_module 71 | ): 72 | # Three convolution heads: theta, phi, and g. 73 | self.conv_theta = nn.Conv3d( 74 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 75 | ) 76 | self.conv_phi = nn.Conv3d( 77 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 78 | ) 79 | self.conv_g = nn.Conv3d( 80 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 81 | ) 82 | 83 | # Final convolution output. 84 | self.conv_out = nn.Conv3d( 85 | self.dim_inner, self.dim, kernel_size=1, stride=1, padding=0 86 | ) 87 | # Zero initializing the final convolution output. 88 | self.conv_out.zero_init = zero_init_final_conv 89 | 90 | # TODO: change the name to `norm` 91 | if self.use_bn: 92 | self.bn = norm_module( 93 | num_features=self.dim, 94 | eps=self.norm_eps, 95 | momentum=self.norm_momentum, 96 | ) 97 | # Zero initializing the final bn. 98 | self.bn.transform_final_bn = zero_init_final_norm 99 | 100 | # Optional to add the spatial-temporal pooling. 101 | if self.use_pool: 102 | self.pool = nn.MaxPool3d( 103 | kernel_size=self.pool_size, 104 | stride=self.pool_size, 105 | padding=[0, 0, 0], 106 | ) 107 | 108 | def forward(self, x): 109 | x_identity = x 110 | N, C, T, H, W = x.size() 111 | 112 | theta = self.conv_theta(x) 113 | 114 | # Perform temporal-spatial pooling to reduce the computation. 115 | if self.use_pool: 116 | x = self.pool(x) 117 | 118 | phi = self.conv_phi(x) 119 | g = self.conv_g(x) 120 | 121 | theta = theta.view(N, self.dim_inner, -1) 122 | phi = phi.view(N, self.dim_inner, -1) 123 | g = g.view(N, self.dim_inner, -1) 124 | 125 | # (N, C, TxHxW) * (N, C, TxHxW) => (N, TxHxW, TxHxW). 126 | theta_phi = torch.einsum("nct,ncp->ntp", (theta, phi)) 127 | # For original Non-local paper, there are two main ways to normalize 128 | # the affinity tensor: 129 | # 1) Softmax normalization (norm on exp). 130 | # 2) dot_product normalization. 131 | if self.instantiation == "softmax": 132 | # Normalizing the affinity tensor theta_phi before softmax. 133 | theta_phi = theta_phi * (self.dim_inner ** -0.5) 134 | theta_phi = nn.functional.softmax(theta_phi, dim=2) 135 | elif self.instantiation == "dot_product": 136 | spatial_temporal_dim = theta_phi.shape[2] 137 | theta_phi = theta_phi / spatial_temporal_dim 138 | else: 139 | raise NotImplementedError( 140 | "Unknown norm type {}".format(self.instantiation) 141 | ) 142 | 143 | # (N, TxHxW, TxHxW) * (N, C, TxHxW) => (N, C, TxHxW). 144 | theta_phi_g = torch.einsum("ntg,ncg->nct", (theta_phi, g)) 145 | 146 | # (N, C, TxHxW) => (N, C, T, H, W). 
147 |         theta_phi_g = theta_phi_g.view(N, self.dim_inner, T, H, W)
148 | 
149 |         p = self.conv_out(theta_phi_g)
150 |         if self.use_bn:
151 |             p = self.bn(p)
152 |         return x_identity + p
153 | 
154 | 
155 | class Featurebank(Nonlocal):
156 |     """Feature Bank Operator"""
157 |     def __init__(
158 |         self,
159 |         dim,
160 |         dim_inner,
161 |         pool_size=None,
162 |     ):
163 |         super(Featurebank, self).__init__(
164 |             dim,
165 |             dim_inner,
166 |             pool_size,
167 |             instantiation="softmax",
168 |             zero_init_final_conv=True,
169 |             use_bn=True,
170 |         )
171 | 
172 |     def _construct_nonlocal(
173 |         self, zero_init_final_conv, zero_init_final_norm, norm_module
174 |     ):
175 |         # The override must accept the same arguments as the parent, which
176 |         # calls this method from Nonlocal.__init__; the original signature
177 |         # took no arguments and referenced undefined names.
178 |         super(Featurebank, self)._construct_nonlocal(
179 |             zero_init_final_conv,
180 |             zero_init_final_norm,
181 |             norm_module,
182 |         )
183 |         # GroupNorm with group = 1 is equivalent to LayerNorm.
184 |         # Set affine to False to match with caffe2.
185 |         self.bn = nn.GroupNorm(1, self.dim, eps=self.norm_eps, affine=False)
186 | 
187 |     def forward(self, x):
188 |         x_identity = x
189 |         N, C, T, H, W = x.size()
190 | 
191 |         theta = self.conv_theta(x)
192 | 
193 |         # Perform temporal-spatial pooling to reduce the computation.
194 |         # Pool the input for phi/g when pooling is enabled; otherwise use x
195 |         # directly (this resolves the original FIXME about `y` being undefined).
196 |         y = self.pool(x) if self.use_pool else x
197 | 
198 |         phi = self.conv_phi(y)
199 |         g = self.conv_g(y)
200 | 
201 |         theta = theta.view(N, self.dim_inner, -1)
202 |         phi = phi.view(N, self.dim_inner, -1)
203 |         g = g.view(N, self.dim_inner, -1)
204 | 
205 |         # (N, C, TxHxW) * (N, C, TxHxW) => (N, TxHxW, TxHxW).
206 |         theta_phi = torch.einsum("nct,ncp->ntp", (theta, phi))
207 | 
208 |         # Normalizing the affinity tensor theta_phi before softmax.
209 |         theta_phi = theta_phi * (self.dim_inner ** -0.5)
210 |         theta_phi = nn.functional.softmax(theta_phi, dim=2)
211 | 
212 |         # (N, TxHxW, TxHxW) * (N, C, TxHxW) => (N, C, TxHxW).
213 |         theta_phi_g = torch.einsum("ntg,ncg->nct", (theta_phi, g))
214 | 
215 |         # (N, C, TxHxW) => (N, C, T, H, W).
216 |         theta_phi_g = theta_phi_g.view(N, self.dim_inner, T, H, W)
217 | 
218 |         p = self.conv_out(theta_phi_g)
219 |         p = self.bn(p)
220 |         return x_identity + p
--------------------------------------------------------------------------------
/slowfast/datasets/ava_helper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 | 
4 | import os
5 | from collections import defaultdict
6 | from fvcore.common.file_io import PathManager
7 | 
8 | import slowfast.utils.logging as logging
9 | from slowfast.utils.setup_moxing_env import wrap_input_path2
10 | 
11 | logger = logging.get_logger(__name__)
12 | 
13 | FPS = 30
14 | AVA_VALID_FRAMES = range(902, 1799)
15 | 
16 | 
17 | def load_image_lists(cfg, is_train):
18 |     """
19 |     Loading image paths from corresponding files.
20 | 
21 |     Args:
22 |         cfg (CfgNode): config.
23 |         is_train (bool): if it is training dataset or not.
24 | 
25 |     Returns:
26 |         image_paths (list[list]): a list of items. Each item (also a list)
27 |             corresponds to one video and contains the paths of images for
28 |             this video.
29 |         video_idx_to_name (list): a list which stores video names.
30 | """ 31 | list_filenames = [ 32 | os.path.join(cfg.AVA.FRAME_LIST_DIR, filename) 33 | for filename in ( 34 | cfg.AVA.TRAIN_LISTS if is_train else cfg.AVA.TEST_LISTS 35 | ) 36 | ] 37 | image_paths = defaultdict(list) 38 | video_name_to_idx = {} 39 | video_idx_to_name = [] 40 | for list_filename in list_filenames: 41 | list_filename = wrap_input_path2(list_filename) 42 | with open(list_filename, "r") as f: 43 | f.readline() 44 | for i, line in enumerate(f): 45 | if cfg.DEBUG and i > 10000: 46 | break 47 | row = line.split() 48 | # The format of each row should follow: 49 | # original_vido_id video_id frame_id path labels. 50 | assert len(row) == 5 51 | video_name = row[0] 52 | 53 | if video_name not in video_name_to_idx: 54 | idx = len(video_name_to_idx) 55 | video_name_to_idx[video_name] = idx 56 | video_idx_to_name.append(video_name) 57 | 58 | data_key = video_name_to_idx[video_name] 59 | 60 | image_paths[data_key].append( 61 | os.path.join(cfg.AVA.FRAME_DIR, row[3]) 62 | ) 63 | 64 | image_paths = [image_paths[i] for i in range(len(image_paths))] 65 | 66 | logger.info( 67 | "Finished loading image paths from: %s" % ", ".join(list_filenames) 68 | ) 69 | 70 | return image_paths, video_idx_to_name 71 | 72 | 73 | def load_boxes_and_labels(cfg, mode): 74 | """ 75 | Loading boxes and labels from csv files. 76 | 77 | Args: 78 | cfg (CfgNode): config. 79 | mode (str): 'train', 'val', or 'test' mode. 80 | Returns: 81 | all_boxes (dict): a dict which maps from `video_name` and 82 | `frame_sec` to a list of `box`. Each `box` is a 83 | [`box_coord`, `box_labels`] where `box_coord` is the 84 | coordinates of box and 'box_labels` are the corresponding 85 | labels for the box. 86 | """ 87 | gt_lists = cfg.AVA.TRAIN_GT_BOX_LISTS if mode == "train" else [] 88 | pred_lists = ( 89 | cfg.AVA.TRAIN_PREDICT_BOX_LISTS 90 | if mode == "train" 91 | else cfg.AVA.TEST_PREDICT_BOX_LISTS 92 | ) 93 | ann_filenames = [ 94 | os.path.join(cfg.AVA.ANNOTATION_DIR, filename) 95 | for filename in gt_lists + pred_lists 96 | ] 97 | ann_is_gt_box = [True] * len(gt_lists) + [False] * len(pred_lists) 98 | 99 | detect_thresh = cfg.AVA.DETECTION_SCORE_THRESH 100 | all_boxes = {} 101 | count = 0 102 | unique_box_count = 0 103 | for filename, is_gt_box in zip(ann_filenames, ann_is_gt_box): 104 | filename = wrap_input_path2(filename) 105 | with open(filename, "r") as f: 106 | for i, line in enumerate(f): 107 | if cfg.DEBUG and i > 10000: 108 | break 109 | row = line.strip().split(",") 110 | # When we use predicted boxes to train/eval, we need to 111 | # ignore the boxes whose scores are below the threshold. 112 | if not is_gt_box: 113 | score = float(row[7]) 114 | if score < detect_thresh: 115 | continue 116 | 117 | video_name, frame_sec = row[0], int(row[1]) 118 | 119 | # Only select frame_sec % 4 = 0 samples for validation if not 120 | # set FULL_TEST_ON_VAL. 121 | if ( 122 | mode == "val" 123 | and not cfg.AVA.FULL_TEST_ON_VAL 124 | and frame_sec % 4 != 0 125 | ): 126 | continue 127 | 128 | # Box with format [x1, y1, x2, y2] with a range of [0, 1] as float. 
129 | box_key = ",".join(row[2:6]) 130 | box = list(map(float, row[2:6])) 131 | label = -1 if row[6] == "" else int(row[6]) 132 | 133 | if video_name not in all_boxes: 134 | all_boxes[video_name] = {} 135 | for sec in AVA_VALID_FRAMES: 136 | all_boxes[video_name][sec] = {} 137 | 138 | if box_key not in all_boxes[video_name][frame_sec]: 139 | all_boxes[video_name][frame_sec][box_key] = [box, []] 140 | unique_box_count += 1 141 | 142 | all_boxes[video_name][frame_sec][box_key][1].append(label) 143 | if label != -1: 144 | count += 1 145 | 146 | for video_name in all_boxes.keys(): 147 | for frame_sec in all_boxes[video_name].keys(): 148 | # Save in format of a list of [box_i, box_i_labels]. 149 | all_boxes[video_name][frame_sec] = list( 150 | all_boxes[video_name][frame_sec].values() 151 | ) 152 | 153 | logger.info( 154 | "Finished loading annotations from: %s" % ", ".join(ann_filenames) 155 | ) 156 | logger.info("Detection threshold: {}".format(detect_thresh)) 157 | logger.info("Number of unique boxes: %d" % unique_box_count) 158 | logger.info("Number of annotations: %d" % count) 159 | 160 | return all_boxes 161 | 162 | 163 | def get_keyframe_data(boxes_and_labels): 164 | """ 165 | Getting keyframe indices, boxes and labels in the dataset. 166 | 167 | Args: 168 | boxes_and_labels (list[dict]): a list which maps from video_idx to a dict. 169 | Each dict `frame_sec` to a list of boxes and corresponding labels. 170 | 171 | Returns: 172 | keyframe_indices (list): a list of indices of the keyframes. 173 | keyframe_boxes_and_labels (list[list[list]]): a list of list which maps from 174 | video_idx and sec_idx to a list of boxes and corresponding labels. 175 | """ 176 | 177 | def sec_to_frame(sec): 178 | """ 179 | Convert time index (in second) to frame index. 180 | 0: 900 181 | 30: 901 182 | """ 183 | return (sec - 900) * FPS 184 | 185 | keyframe_indices = [] 186 | keyframe_boxes_and_labels = [] 187 | count = 0 188 | for video_idx in range(len(boxes_and_labels)): 189 | sec_idx = 0 190 | keyframe_boxes_and_labels.append([]) 191 | for sec in boxes_and_labels[video_idx].keys(): 192 | if sec not in AVA_VALID_FRAMES: 193 | continue 194 | 195 | if len(boxes_and_labels[video_idx][sec]) > 0: 196 | keyframe_indices.append( 197 | (video_idx, sec_idx, sec, sec_to_frame(sec)) 198 | ) 199 | keyframe_boxes_and_labels[video_idx].append( 200 | boxes_and_labels[video_idx][sec] 201 | ) 202 | sec_idx += 1 203 | count += 1 204 | logger.info("%d keyframes used." % count) 205 | 206 | return keyframe_indices, keyframe_boxes_and_labels 207 | 208 | 209 | def get_num_boxes_used(keyframe_indices, keyframe_boxes_and_labels): 210 | """ 211 | Get total number of used boxes. 212 | 213 | Args: 214 | keyframe_indices (list): a list of indices of the keyframes. 215 | keyframe_boxes_and_labels (list[list[list]]): a list of list which maps from 216 | video_idx and sec_idx to a list of boxes and corresponding labels. 217 | 218 | Returns: 219 | count (int): total number of used boxes. 
220 | """ 221 | 222 | count = 0 223 | for video_idx, sec_idx, _, _ in keyframe_indices: 224 | count += len(keyframe_boxes_and_labels[video_idx][sec_idx]) 225 | return count 226 | -------------------------------------------------------------------------------- /slowfast/models/backbones/regnet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | 5 | from ..build import MODEL_REGISTRY 6 | import slowfast.utils.weight_init_helper as init_helper 7 | from slowfast.models.batchnorm_helper import get_norm 8 | 9 | 10 | _CFG = { 11 | "400M": { 12 | 'd': [1, 2, 7, 12], 13 | 'wi': [32, 64, 160, 384], 14 | 'g': 16, 15 | 'b': 1, 16 | 'w0': 24 17 | }, 18 | "4G": { 19 | 'd': [2, 5, 14, 2], 20 | 'wi': [80, 240, 560, 1360], 21 | 'g': 24, 22 | 'b': 1, 23 | 'w0': 48 24 | } 25 | } 26 | 27 | 28 | def conv3x3(in_planes, out_planes, stride=1, groups=1, temporal_k=1, temporal_p=0): 29 | """3x3 convolution with padding""" 30 | return nn.Conv3d(in_planes, out_planes, kernel_size=(temporal_k, 3, 3), stride=(1, stride, stride), 31 | padding=(temporal_p, 1, 1), groups=groups, bias=False, dilation=1) 32 | 33 | 34 | def conv1x1(in_planes, out_planes, stride=1, temporal_k=1, temporal_p=0): 35 | """1x1 convolution""" 36 | return nn.Conv3d(in_planes, out_planes, kernel_size=(temporal_k, 1, 1), stride=(1, stride, stride), 37 | padding=(temporal_p, 0, 0), bias=False) 38 | 39 | 40 | class Bottleneck(nn.Module): 41 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, bottle_ratio=1, temporal_k=1, temporal_p=0): 42 | super(Bottleneck, self).__init__() 43 | norm_layer = nn.BatchNorm3d 44 | intra_plane = planes // bottle_ratio 45 | 46 | self.conv1 = conv1x1(inplanes, intra_plane, 47 | temporal_k=temporal_k, temporal_p=temporal_p) 48 | self.bn1 = norm_layer(intra_plane) 49 | self.conv2 = conv3x3(intra_plane, intra_plane, stride, groups) 50 | self.bn2 = norm_layer(intra_plane) 51 | self.conv3 = conv1x1(intra_plane, planes) 52 | self.bn3 = norm_layer(planes) 53 | self.relu = nn.ReLU(inplace=True) 54 | self.relu_final = nn.ReLU(inplace=True) 55 | self.downsample = downsample 56 | self.stride = stride 57 | 58 | def forward(self, x): 59 | identity = x 60 | 61 | out = self.conv1(x) 62 | 63 | out = self.bn1(out) 64 | out = self.relu(out) 65 | 66 | out = self.conv2(out) 67 | out = self.bn2(out) 68 | out = self.relu(out) 69 | 70 | out = self.conv3(out) 71 | out = self.bn3(out) 72 | 73 | if self.downsample is not None: 74 | identity = self.downsample(x) 75 | 76 | if identity.shape != out.shape: 77 | identity = identity[:, :, 1:-1] 78 | out += identity 79 | 80 | out = self.relu_final(out) 81 | 82 | return out 83 | 84 | 85 | @MODEL_REGISTRY.register() 86 | class RegNet(nn.Module): 87 | def __init__(self, cfg, zero_init_residual=True): 88 | super(RegNet, self).__init__() 89 | 90 | self.cfg = cfg 91 | self.model_cfg = _CFG[self.cfg.REGNET.DEPTH] 92 | self.model_cfg['sa'] = [0, 0, 0, 0] # FIXME 93 | if self.cfg.PGT.ENABLE: 94 | temporal_p = 0 95 | else: 96 | temporal_p = 1 97 | self.conv1 = conv3x3(3, self.model_cfg['w0'], stride=2) 98 | self.bn1 = nn.BatchNorm3d(self.model_cfg['w0']) 99 | self.relu = nn.ReLU(inplace=True) 100 | 101 | self.layer1 = self._make_layer( 102 | self.model_cfg['w0'], self.model_cfg['wi'][0], self.model_cfg['d'][0], self.model_cfg['sa'][0]) 103 | self.layer2 = self._make_layer( 104 | self.model_cfg['wi'][0], self.model_cfg['wi'][1], self.model_cfg['d'][1], self.model_cfg['sa'][1]) 105 | 106 | self.layer3 
= self._make_layer(self.model_cfg['wi'][1], self.model_cfg['wi'][2], self.model_cfg['d'][2], self.model_cfg['sa'][2], 107 | temporal_k=3, temporal_p=temporal_p) 108 | self.layer4 = self._make_layer(self.model_cfg['wi'][2], self.model_cfg['wi'][3], self.model_cfg['d'][3], self.model_cfg['sa'][3], 109 | temporal_k=3, temporal_p=temporal_p) 110 | 111 | # input shape of each temporal layer, [channel, spatial stride] 112 | self.padding_shape = [*[[self.model_cfg['wi'][1], 8]] * 1, 113 | *[[self.model_cfg['wi'][2], 16]] * self.model_cfg['d'][2], 114 | *[[self.model_cfg['wi'][3], 32]] * (self.model_cfg['d'][3] - 1), ] 115 | # self.selfAtt_padding_shape = [*[[self.model_cfg['wi'][0], 4]] * self.model_cfg['sa'][0], 116 | # *[[self.model_cfg['wi'][1], 8]] * self.model_cfg['sa'][1], 117 | # *[[self.model_cfg['wi'][2], 16]] * self.model_cfg['sa'][2], 118 | # *[[self.model_cfg['wi'][3], 32]] * self.model_cfg['sa'][3], 119 | # ] 120 | 121 | self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1)) 122 | if self.cfg.MODEL.DROPOUT_RATE > 0: 123 | self.dropout = nn.Dropout(self.cfg.MODEL.DROPOUT_RATE) 124 | self.fc = nn.Linear(self.model_cfg['wi'][3], 125 | self.cfg.MODEL.NUM_CLASSES, bias=True) 126 | 127 | self.act = nn.Softmax(dim=-1) 128 | 129 | for m in self.modules(): 130 | if isinstance(m, nn.Conv3d): 131 | nn.init.kaiming_normal_( 132 | m.weight, mode='fan_out', nonlinearity='relu') 133 | if m.bias is not None: 134 | nn.init.constant_(m.bias, 0) 135 | elif isinstance(m, (nn.BatchNorm3d, nn.GroupNorm)): 136 | if hasattr(m, "transform_final_bn") and m.transform_final_bn: 137 | batchnorm_weight = 0.0 138 | else: 139 | batchnorm_weight = 1.0 140 | m.weight.data.fill_(batchnorm_weight) 141 | m.bias.data.zero_() 142 | elif isinstance(m, nn.Linear): 143 | nn.init.normal_(m.weight, 0, 0.01) 144 | nn.init.constant_(m.bias, 0) 145 | 146 | # Zero-initialize the last BN in each residual branch, 147 | # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
148 |         # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
149 |         if zero_init_residual:
150 |             for m in self.modules():
151 |                 if isinstance(m, Bottleneck):
152 |                     nn.init.constant_(m.bn3.weight, 0)
153 | 
154 |     def _make_layer(self, inplanes, planes, n_blocks, n_sa, temporal_k=1, temporal_p=0):
155 | 
156 |         downsample = nn.Sequential(
157 |             conv1x1(inplanes, planes, 2),
158 |             nn.BatchNorm3d(planes),
159 |         )
160 | 
161 |         layers = []
162 |         layers.append(Bottleneck(inplanes, planes, 2, downsample, self.model_cfg['g'],
163 |                                  self.model_cfg['b'], temporal_k=temporal_k, temporal_p=temporal_p))
        # NOTE: PrgSelfAtt below is never imported in this file. With the
        # hard-coded sa = [0, 0, 0, 0] above these branches are dead code;
        # import the operator before enabling self-attention stages.
164 |         if n_sa == n_blocks:
165 |             layers.append(PrgSelfAtt(dim=planes,
166 |                                      dim_inner=planes // 2,
167 |                                      pool_size=[None, 4, 4]))
168 |         for i in range(1, n_blocks):
169 |             layers.append(Bottleneck(planes, planes, 1, None, self.model_cfg['g'],
170 |                                      self.model_cfg['b'], temporal_k=temporal_k, temporal_p=temporal_p))
171 |             if i > (n_blocks - n_sa - 1):
172 |                 layers.append(PrgSelfAtt(dim=planes,
173 |                                          dim_inner=planes // 2,
174 |                                          pool_size=[None, 4, 4]))
175 | 
176 |         return nn.Sequential(*layers)
177 | 
178 |     def forward(self, x):
179 |         x = x[0]
180 |         x = self.conv1(x)
181 |         x = self.bn1(x)
182 |         x = self.relu(x)
183 | 
184 |         x = self.layer1(x)
185 |         x = self.layer2(x)
186 |         x = self.layer3(x)
187 |         x = self.layer4(x)
188 | 
189 |         x = self.avgpool(x)
190 | 
191 |         if x.shape[2] > 1:
192 |             need_mean = True
193 |             x = x.permute(0, 2, 1, 3, 4)
194 |             x = x.view(x.size(0), x.size(1), -1)
195 |         else:
196 |             x = x.view(x.size(0), -1)
197 |             need_mean = False
198 | 
199 |         # head
200 |         if hasattr(self, 'dropout'):
201 |             x = self.dropout(x)
202 |         x = self.fc(x)
203 | 
204 |         if not self.training:
205 |             x = self.act(x)
206 | 
207 |         if need_mean:
208 |             x = x.mean(dim=1)
209 | 
210 |         return x
211 | 
--------------------------------------------------------------------------------
/slowfast/datasets/ssv1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 | 
4 | import os
5 | import random
6 | import torch
7 | import numpy as np
8 | import torch.utils.data
9 | from itertools import chain as chain
10 | from fvcore.common.file_io import PathManager
11 | from PIL import Image
12 | 
13 | import slowfast.utils.logging as logging
14 | from . import transform as transform
15 | from . import utils as utils
16 | from .decoder import get_start_end_idx
17 | from .build import DATASET_REGISTRY
18 | from .utils import retry_load_images
19 | 
20 | logger = logging.get_logger(__name__)
21 | 
22 | 
23 | @DATASET_REGISTRY.register()
24 | class Ssv1(torch.utils.data.Dataset):
25 |     """
26 |     Something-Something v1 (SSV1) video loader. Construct the SSV1 video loader,
27 |     then sample clips from the videos. For training and validation, a single
28 |     clip is randomly sampled from every video with random cropping, scaling, and
29 |     flipping. For testing, multiple clips are uniformly sampled from every
30 |     video with uniform cropping. For uniform cropping, we take the left, center,
31 |     and right crop if the width is larger than height, or take top, center, and
32 |     bottom crop if the height is larger than the width.
33 |     """
34 | 
35 |     def __init__(self, cfg, mode, num_retries=10):
36 |         """
37 |         Construct the Ssv1 video loader with a given csv file. The format of
38 |         the csv file is:
39 |         ```
40 |         path_to_video_1 video_len_1 label_1
41 |         path_to_video_2 video_len_2 label_2
42 |         ...
43 |         path_to_video_N video_len_N label_N
44 |         ```
45 |         Args:
46 |             cfg (CfgNode): configs.
47 |             mode (string): Options include `train`, `val`, or `test` mode.
48 |                 For the train and val mode, the data loader will take data
49 |                 from the train or val set, and sample one clip per video.
50 |                 For the test mode, the data loader will take data from test set,
51 |                 and sample multiple clips per video.
52 |             num_retries (int): number of retries.
53 |         """
54 |         # Only support train, val, and test mode.
55 |         assert mode in [
56 |             "train",
57 |             "val",
58 |             "test",
59 |         ], "Split '{}' not supported for Ssv1".format(mode)
60 |         self.mode = mode
61 |         self.cfg = cfg
62 | 
63 |         self._video_meta = {}
64 |         self._num_retries = num_retries
65 |         # For training or validation mode, one single clip is sampled from every
66 |         # video. For testing, NUM_ENSEMBLE_VIEWS clips are sampled from every
67 |         # video. For every clip, NUM_SPATIAL_CROPS is cropped spatially from
68 |         # the frames.
69 |         if self.mode in ["train", "val"]:
70 |             self._num_clips = 1
71 |         elif self.mode in ["test"]:
72 |             self._num_clips = (
73 |                 cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS
74 |             )
75 | 
76 |         logger.info("Constructing Something-Something v1 {}...".format(mode))
77 |         self._construct_loader()
78 | 
79 |     def _construct_loader(self):
80 |         """
81 |         Construct the video loader.
82 |         """
83 |         path_to_file = os.path.join(
84 |             self.cfg.DATA.PATH_TO_DATA_DIR, "{}.csv".format(self.mode)
85 |         )
86 |         assert PathManager.exists(path_to_file), "{} dir not found".format(
87 |             path_to_file
88 |         )
89 | 
90 |         self._path_to_videos = []
91 |         self._video_len = []
92 |         self._labels = []
93 |         self._spatial_temporal_idx = []
94 |         with open(path_to_file, "r") as f:
95 |             for clip_idx, path_vlen_label in enumerate(f.read().splitlines()):
96 |                 assert len(path_vlen_label.split()) == 3
97 |                 path, vlen, label = path_vlen_label.split()
98 |                 for idx in range(self._num_clips):
99 |                     self._path_to_videos.append(
100 |                         os.path.join(self.cfg.DATA.PATH_PREFIX, path)
101 |                     )
102 |                     self._video_len.append(int(vlen))
103 |                     self._labels.append(int(label))
104 |                     self._spatial_temporal_idx.append(idx)
105 |                     self._video_meta[clip_idx * self._num_clips + idx] = {}
106 |         assert (
107 |             len(self._path_to_videos) > 0
108 |         ), "Failed to load Something-Something v1 split {} from {}".format(
109 |             self.mode, path_to_file  # `self._split_idx` was never defined; use the mode
110 |         )
111 |         logger.info(
112 |             "Something-Something v1 dataloader constructed (size: {}) from {}".format(
113 |                 len(self._path_to_videos), path_to_file
114 |             )
115 |         )
116 | 
117 |     def __getitem__(self, index):
118 |         """
119 |         Given the video index, return the list of frames, label, and video
120 |         index if the video frames can be fetched.
121 |         Args:
122 |             index (int): the video index provided by the pytorch sampler.
123 |         Returns:
124 |             frames (tensor): the frames sampled from the video. The dimension
125 |                 is `channel` x `num frames` x `height` x `width`.
126 |             label (int): the label of the current video.
127 |             index (int): the index of the video.
128 | 
129 |         """
130 |         if self.mode in ["train", "val"]:
131 |             # -1 indicates random sampling.
132 |             temporal_sample_index = -1
133 |             spatial_sample_index = -1
134 |             min_scale = self.cfg.DATA.TRAIN_JITTER_SCALES[0]
135 |             max_scale = self.cfg.DATA.TRAIN_JITTER_SCALES[1]
136 |             crop_size = self.cfg.DATA.TRAIN_CROP_SIZE
137 |         elif self.mode in ["test"]:
138 |             temporal_sample_index = (
139 |                 self._spatial_temporal_idx[index]
140 |                 // self.cfg.TEST.NUM_SPATIAL_CROPS
141 |             )
142 |             # spatial_sample_index is in [0, 1, 2]. Corresponding to left,
143 |             # center, or right if width is larger than height, and top, middle,
144 |             # or bottom if height is larger than width.
145 |             spatial_sample_index = (
146 |                 self._spatial_temporal_idx[index]
147 |                 % self.cfg.TEST.NUM_SPATIAL_CROPS
148 |             )
149 |             min_scale, max_scale, crop_size = [
150 |                 self.cfg.DATA.TEST_CROP_SIZE] * 3
151 |             # The testing is deterministic and no jitter should be performed.
152 |             # min_scale, max_scale, and crop_size are expected to be the same.
153 |             assert len({min_scale, max_scale, crop_size}) == 1
154 |         else:
155 |             raise NotImplementedError(
156 |                 "Does not support {} mode".format(self.mode)
157 |             )
158 |         label = self._labels[index]
159 | 
160 |         num_frames = self.cfg.DATA.NUM_FRAMES
161 |         video_length = self._video_len[index]
162 | 
163 |         seg_size = float(video_length - 1) / num_frames
164 |         seq = []
165 |         for i in range(num_frames):
166 |             start = int(np.round(seg_size * i))
167 |             end = int(np.round(seg_size * (i + 1)))
168 |             if self.mode == "train":
169 |                 seq.append(random.randint(start, end))
170 |             else:
171 |                 seq.append((start + end) // 2)
172 | 
173 |         frames = torch.as_tensor(
174 |             utils.retry_load_images(
175 |                 [os.path.join(self._path_to_videos[index], '%05d.jpg' % (frame + 1)) for frame in seq],
176 |                 self._num_retries,
177 |             )
178 |         )
179 | 
180 |         # Perform color normalization.
181 |         frames = utils.tensor_normalize(
182 |             frames, self.cfg.DATA.MEAN, self.cfg.DATA.STD
183 |         )
184 | 
185 |         # T H W C -> C T H W.
186 |         frames = frames.permute(3, 0, 1, 2)
187 |         # Perform data augmentation.
188 |         frames = utils.spatial_sampling(
189 |             frames,
190 |             spatial_idx=spatial_sample_index,
191 |             min_scale=min_scale,
192 |             max_scale=max_scale,
193 |             crop_size=crop_size,
194 |             random_horizontal_flip=self.cfg.DATA.RANDOM_FLIP,
195 |             inverse_uniform_sampling=self.cfg.DATA.INV_UNIFORM_SAMPLE,
196 |         )
197 |         frames = utils.pack_pathway_output(self.cfg, frames)
198 |         return frames, label, index, {}
199 | 
200 |     def __len__(self):
201 |         """
202 |         Returns:
203 |             (int): the number of videos in the dataset.
204 |         """
205 |         return len(self._path_to_videos)
--------------------------------------------------------------------------------
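Editor's note: the temporal sampling in Ssv1.__getitem__ above divides a video
into NUM_FRAMES equal segments and picks one frame per segment (uniformly random
within the segment for training, the segment midpoint otherwise). A standalone
sketch of that index arithmetic with hand-checked output:

import random
import numpy as np

def sample_frame_indices(video_length, num_frames, train):
    # Mirrors the seg_size / seq loop in Ssv1.__getitem__.
    seg_size = float(video_length - 1) / num_frames
    seq = []
    for i in range(num_frames):
        start = int(np.round(seg_size * i))
        end = int(np.round(seg_size * (i + 1)))
        seq.append(random.randint(start, end) if train else (start + end) // 2)
    return seq

print(sample_frame_indices(video_length=17, num_frames=4, train=False))
# [2, 6, 10, 14]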
/slowfast/csrc/cpu/ROIAlign_cpu.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 | #include "cpu/vision.h"
3 | 
4 | // implementation taken from Caffe2
5 | template <typename T>
6 | struct PreCalc {
7 |   int pos1;
8 |   int pos2;
9 |   int pos3;
10 |   int pos4;
11 |   T w1;
12 |   T w2;
13 |   T w3;
14 |   T w4;
15 | };
16 | 
17 | template <typename T>
18 | void pre_calc_for_bilinear_interpolate(
19 |     const int height,
20 |     const int width,
21 |     const int pooled_height,
22 |     const int pooled_width,
23 |     const int iy_upper,
24 |     const int ix_upper,
25 |     T roi_start_h,
26 |     T roi_start_w,
27 |     T bin_size_h,
28 |     T bin_size_w,
29 |     int roi_bin_grid_h,
30 |     int roi_bin_grid_w,
31 |     std::vector<PreCalc<T>>& pre_calc) {
32 |   int pre_calc_index = 0;
33 |   for (int ph = 0; ph < pooled_height; ph++) {
34 |     for (int pw = 0; pw < pooled_width; pw++) {
35 |       for (int iy = 0; iy < iy_upper; iy++) {
36 |         const T yy = roi_start_h + ph * bin_size_h +
37 |             static_cast<T>(iy + .5f) * bin_size_h /
38 |                 static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
39 |         for (int ix = 0; ix < ix_upper; ix++) {
40 |           const T xx = roi_start_w + pw * bin_size_w +
41 |               static_cast<T>(ix + .5f) * bin_size_w /
42 |                   static_cast<T>(roi_bin_grid_w);
43 | 
44 |           T x = xx;
45 |           T y = yy;
46 |           // deal with: inverse elements are out of feature map boundary
47 |           if (y < -1.0 || y > height || x < -1.0 || x > width) {
48 |             // empty
49 |             PreCalc<T> pc;
50 |             pc.pos1 = 0;
51 |             pc.pos2 = 0;
52 |             pc.pos3 = 0;
53 |             pc.pos4 = 0;
54 |             pc.w1 = 0;
55 |             pc.w2 = 0;
56 |             pc.w3 = 0;
57 |             pc.w4 = 0;
58 |             pre_calc[pre_calc_index] = pc;
59 |             pre_calc_index += 1;
60 |             continue;
61 |           }
62 | 
63 |           if (y <= 0) {
64 |             y = 0;
65 |           }
66 |           if (x <= 0) {
67 |             x = 0;
68 |           }
69 | 
70 |           int y_low = (int)y;
71 |           int x_low = (int)x;
72 |           int y_high;
73 |           int x_high;
74 | 
75 |           if (y_low >= height - 1) {
76 |             y_high = y_low = height - 1;
77 |             y = (T)y_low;
78 |           } else {
79 |             y_high = y_low + 1;
80 |           }
81 | 
82 |           if (x_low >= width - 1) {
83 |             x_high = x_low = width - 1;
84 |             x = (T)x_low;
85 |           } else {
86 |             x_high = x_low + 1;
87 |           }
88 | 
89 |           T ly = y - y_low;
90 |           T lx = x - x_low;
91 |           T hy = 1. - ly, hx = 1. - lx;
92 |           T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
93 | 
94 |           // save weights and indices
95 |           PreCalc<T> pc;
96 |           pc.pos1 = y_low * width + x_low;
97 |           pc.pos2 = y_low * width + x_high;
98 |           pc.pos3 = y_high * width + x_low;
99 |           pc.pos4 = y_high * width + x_high;
100 |           pc.w1 = w1;
101 |           pc.w2 = w2;
102 |           pc.w3 = w3;
103 |           pc.w4 = w4;
104 |           pre_calc[pre_calc_index] = pc;
105 | 
106 |           pre_calc_index += 1;
107 |         }
108 |       }
109 |     }
110 |   }
111 | }
112 | 
113 | template <typename T>
114 | void ROIAlignForward_cpu_kernel(
115 |     const int nthreads,
116 |     const T* bottom_data,
117 |     const T& spatial_scale,
118 |     const int channels,
119 |     const int height,
120 |     const int width,
121 |     const int pooled_height,
122 |     const int pooled_width,
123 |     const int sampling_ratio,
124 |     const T* bottom_rois,
125 |     //int roi_cols,
126 |     T* top_data) {
127 |   //AT_ASSERT(roi_cols == 4 || roi_cols == 5);
128 |   int roi_cols = 5;
129 | 
130 |   int n_rois = nthreads / channels / pooled_width / pooled_height;
131 |   // (n, c, ph, pw) is an element in the pooled output
132 |   // can be parallelized using omp
133 |   // #pragma omp parallel for num_threads(32)
134 |   for (int n = 0; n < n_rois; n++) {
135 |     int index_n = n * channels * pooled_width * pooled_height;
136 | 
137 |     // roi could have 4 or 5 columns
138 |     const T* offset_bottom_rois = bottom_rois + n * roi_cols;
139 |     int roi_batch_ind = 0;
140 |     if (roi_cols == 5) {
141 |       roi_batch_ind = offset_bottom_rois[0];
142 |       offset_bottom_rois++;
143 |     }
144 | 
145 |     // Do not use rounding; this implementation detail is critical
146 |     T roi_start_w = offset_bottom_rois[0] * spatial_scale;
147 |     T roi_start_h = offset_bottom_rois[1] * spatial_scale;
148 |     T roi_end_w = offset_bottom_rois[2] * spatial_scale;
149 |     T roi_end_h = offset_bottom_rois[3] * spatial_scale;
150 |     // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
151 |     // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
152 |     // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
153 |     // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
154 | 
155 |     // Force malformed ROIs to be 1x1
156 |     T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
157 |     T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
158 |     T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
159 |     T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
160 | 
161 |     // We use roi_bin_grid to sample the grid and mimic integral
162 |     int roi_bin_grid_h = (sampling_ratio > 0)
163 |         ? sampling_ratio
164 |         : ceil(roi_height / pooled_height); // e.g., = 2
165 |     int roi_bin_grid_w =
166 |         (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
167 | 
168 |     // We do average (integral) pooling inside a bin
169 |     const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
170 | 171 | // we want to precalculate indices and weights shared by all channels; 172 | // this is the key point of the optimization 173 | std::vector<PreCalc<T>> pre_calc( 174 | roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); 175 | pre_calc_for_bilinear_interpolate( 176 | height, 177 | width, 178 | pooled_height, 179 | pooled_width, 180 | roi_bin_grid_h, 181 | roi_bin_grid_w, 182 | roi_start_h, 183 | roi_start_w, 184 | bin_size_h, 185 | bin_size_w, 186 | roi_bin_grid_h, 187 | roi_bin_grid_w, 188 | pre_calc); 189 | 190 | for (int c = 0; c < channels; c++) { 191 | int index_n_c = index_n + c * pooled_width * pooled_height; 192 | const T* offset_bottom_data = 193 | bottom_data + (roi_batch_ind * channels + c) * height * width; 194 | int pre_calc_index = 0; 195 | 196 | for (int ph = 0; ph < pooled_height; ph++) { 197 | for (int pw = 0; pw < pooled_width; pw++) { 198 | int index = index_n_c + ph * pooled_width + pw; 199 | 200 | T output_val = 0.; 201 | for (int iy = 0; iy < roi_bin_grid_h; iy++) { 202 | for (int ix = 0; ix < roi_bin_grid_w; ix++) { 203 | PreCalc<T> pc = pre_calc[pre_calc_index]; 204 | output_val += pc.w1 * offset_bottom_data[pc.pos1] + 205 | pc.w2 * offset_bottom_data[pc.pos2] + 206 | pc.w3 * offset_bottom_data[pc.pos3] + 207 | pc.w4 * offset_bottom_data[pc.pos4]; 208 | 209 | pre_calc_index += 1; 210 | } 211 | } 212 | output_val /= count; 213 | 214 | top_data[index] = output_val; 215 | } // for pw 216 | } // for ph 217 | } // for c 218 | } // for n 219 | } 220 | 221 | at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, 222 | const at::Tensor& rois, 223 | const float spatial_scale, 224 | const int pooled_height, 225 | const int pooled_width, 226 | const int sampling_ratio) { 227 | AT_ASSERTM(!input.type().is_cuda(), "input must be a CPU tensor"); 228 | AT_ASSERTM(!rois.type().is_cuda(), "rois must be a CPU tensor"); 229 | 230 | auto num_rois = rois.size(0); 231 | auto channels = input.size(1); 232 | auto height = input.size(2); 233 | auto width = input.size(3); 234 | 235 | auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); 236 | auto output_size = num_rois * pooled_height * pooled_width * channels; 237 | 238 | if (output.numel() == 0) { 239 | return output; 240 | } 241 | 242 | AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { 243 | ROIAlignForward_cpu_kernel<scalar_t>( 244 | output_size, 245 | input.data<scalar_t>(), 246 | spatial_scale, 247 | channels, 248 | height, 249 | width, 250 | pooled_height, 251 | pooled_width, 252 | sampling_ratio, 253 | rois.data<scalar_t>(), 254 | output.data<scalar_t>()); 255 | }); 256 | return output; 257 | } -------------------------------------------------------------------------------- /slowfast/models/backbones/resnet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | 5 | import slowfast.utils.weight_init_helper as init_helper 6 | from slowfast.models.batchnorm_helper import get_norm 7 | 8 | from .. import head_helper, resnet_helper, stem_helper 9 | from ..build import MODEL_REGISTRY 10 | from . import _MODEL_STAGE_DEPTH, _TEMPORAL_KERNEL_BASIS, _POOL1 11 | 12 | 13 | @MODEL_REGISTRY.register() 14 | class ResNet(nn.Module): 15 | """ 16 | ResNet model builder. It builds a ResNet-like network backbone without 17 | lateral connections (C2D, I3D, Slow). 18 | 19 | Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He. 20 | "SlowFast networks for video recognition."
21 | https://arxiv.org/pdf/1812.03982.pdf 22 | 23 | Xiaolong Wang, Ross Girshick, Abhinav Gupta, and Kaiming He. 24 | "Non-local neural networks." 25 | https://arxiv.org/pdf/1711.07971.pdf 26 | """ 27 | 28 | def __init__(self, cfg): 29 | """ 30 | The `__init__` method of any subclass should also contain these 31 | arguments. 32 | 33 | Args: 34 | cfg (CfgNode): model building configs, details are in the 35 | comments of the config file. 36 | """ 37 | super(ResNet, self).__init__() 38 | self.norm_module = get_norm(cfg) 39 | self.enable_detection = cfg.DETECTION.ENABLE 40 | self.num_pathways = 1 41 | self._cfg = cfg 42 | self._construct_network(cfg) 43 | init_helper.init_weights( 44 | self, cfg.MODEL.FC_INIT_STD, cfg.RESNET.ZERO_INIT_FINAL_BN 45 | ) 46 | 47 | def _construct_network(self, cfg): 48 | """ 49 | Builds a single pathway ResNet model. 50 | 51 | Args: 52 | cfg (CfgNode): model building configs, details are in the 53 | comments of the config file. 54 | """ 55 | assert cfg.MODEL.ARCH in _POOL1.keys() 56 | pool_size = _POOL1[cfg.MODEL.ARCH] 57 | assert len({len(pool_size), self.num_pathways}) == 1 58 | assert cfg.RESNET.DEPTH in _MODEL_STAGE_DEPTH.keys() 59 | 60 | (d2, d3, d4, d5) = _MODEL_STAGE_DEPTH[cfg.RESNET.DEPTH] 61 | 62 | num_groups = cfg.RESNET.NUM_GROUPS 63 | width_per_group = cfg.RESNET.WIDTH_PER_GROUP 64 | dim_inner = num_groups * width_per_group 65 | 66 | temp_kernel = _TEMPORAL_KERNEL_BASIS[cfg.MODEL.ARCH] 67 | 68 | self.s1 = stem_helper.VideoModelStem( 69 | cfg=cfg, 70 | dim_in=cfg.DATA.INPUT_CHANNEL_NUM, 71 | dim_out=[width_per_group], 72 | kernel=[temp_kernel[0][0] + [7, 7]], 73 | stride=[[1, 2, 2]], # [2,2,2] for non-sparse 74 | padding=[[temp_kernel[0][0][0] // 2, 3, 3]], 75 | stem_func_name=cfg.RESNET.STEM_FUNC, 76 | norm_module=self.norm_module, 77 | ) 78 | 79 | self.s2 = resnet_helper.ResStage( 80 | cfg=cfg, 81 | dim_in=[width_per_group], 82 | dim_out=[width_per_group * 4], 83 | dim_inner=[dim_inner], 84 | temp_kernel_sizes=temp_kernel[1], 85 | stride=cfg.RESNET.SPATIAL_STRIDES[0], 86 | num_blocks=[d2], 87 | num_groups=[num_groups], 88 | num_block_temp_kernel=cfg.RESNET.NUM_BLOCK_TEMP_KERNEL[0], 89 | nonlocal_inds=cfg.NONLOCAL.LOCATION[0], 90 | nonlocal_group=cfg.NONLOCAL.GROUP[0], 91 | nonlocal_pool=cfg.NONLOCAL.POOL[0], 92 | nonlocal_use_bn=cfg.NONLOCAL.USE_BN, 93 | nonlocal_progress=cfg.NONLOCAL.PROGRESS, 94 | instantiation=cfg.NONLOCAL.INSTANTIATION, 95 | trans_func_name=cfg.RESNET.TRANS_FUNC, 96 | stride_1x1=cfg.RESNET.STRIDE_1X1, 97 | inplace_relu=cfg.RESNET.INPLACE_RELU, 98 | dilation=cfg.RESNET.SPATIAL_DILATIONS[0], 99 | norm_module=self.norm_module, 100 | temp_progress=cfg.PGT.ENABLE, 101 | ) 102 | 103 | for pathway in range(self.num_pathways): 104 | pool = nn.MaxPool3d( 105 | kernel_size=pool_size[pathway], 106 | stride=pool_size[pathway], 107 | padding=[0, 0, 0], 108 | ) 109 | self.add_module("pathway{}_pool".format(pathway), pool) 110 | 111 | self.s3 = resnet_helper.ResStage( 112 | cfg=cfg, 113 | dim_in=[width_per_group * 4], 114 | dim_out=[width_per_group * 8], 115 | dim_inner=[dim_inner * 2], 116 | temp_kernel_sizes=temp_kernel[2], 117 | stride=cfg.RESNET.SPATIAL_STRIDES[1], 118 | num_blocks=[d3], 119 | num_groups=[num_groups], 120 | num_block_temp_kernel=cfg.RESNET.NUM_BLOCK_TEMP_KERNEL[1], 121 | nonlocal_inds=cfg.NONLOCAL.LOCATION[1], 122 | nonlocal_group=cfg.NONLOCAL.GROUP[1], 123 | nonlocal_pool=cfg.NONLOCAL.POOL[1], 124 | nonlocal_use_bn=cfg.NONLOCAL.USE_BN, 125 | nonlocal_progress=cfg.NONLOCAL.PROGRESS, 126 | 
instantiation=cfg.NONLOCAL.INSTANTIATION, 127 | trans_func_name=cfg.RESNET.TRANS_FUNC, 128 | stride_1x1=cfg.RESNET.STRIDE_1X1, 129 | inplace_relu=cfg.RESNET.INPLACE_RELU, 130 | dilation=cfg.RESNET.SPATIAL_DILATIONS[1], 131 | norm_module=self.norm_module, 132 | temp_progress=cfg.PGT.ENABLE, 133 | ) 134 | 135 | self.s4 = resnet_helper.ResStage( 136 | cfg=cfg, 137 | dim_in=[width_per_group * 8], 138 | dim_out=[width_per_group * 16], 139 | dim_inner=[dim_inner * 4], 140 | temp_kernel_sizes=temp_kernel[3], 141 | stride=cfg.RESNET.SPATIAL_STRIDES[2], 142 | num_blocks=[d4], 143 | num_groups=[num_groups], 144 | num_block_temp_kernel=cfg.RESNET.NUM_BLOCK_TEMP_KERNEL[2], 145 | nonlocal_inds=cfg.NONLOCAL.LOCATION[2], 146 | nonlocal_group=cfg.NONLOCAL.GROUP[2], 147 | nonlocal_pool=cfg.NONLOCAL.POOL[2], 148 | nonlocal_use_bn=cfg.NONLOCAL.USE_BN, 149 | nonlocal_progress=cfg.NONLOCAL.PROGRESS, 150 | instantiation=cfg.NONLOCAL.INSTANTIATION, 151 | trans_func_name=cfg.RESNET.TRANS_FUNC, 152 | stride_1x1=cfg.RESNET.STRIDE_1X1, 153 | inplace_relu=cfg.RESNET.INPLACE_RELU, 154 | dilation=cfg.RESNET.SPATIAL_DILATIONS[2], 155 | norm_module=self.norm_module, 156 | temp_progress=cfg.PGT.ENABLE, 157 | ) 158 | 159 | self.s5 = resnet_helper.ResStage( 160 | cfg=cfg, 161 | dim_in=[width_per_group * 16], 162 | dim_out=[width_per_group * 32], 163 | dim_inner=[dim_inner * 8], 164 | temp_kernel_sizes=temp_kernel[4], 165 | stride=cfg.RESNET.SPATIAL_STRIDES[3], 166 | num_blocks=[d5], 167 | num_groups=[num_groups], 168 | num_block_temp_kernel=cfg.RESNET.NUM_BLOCK_TEMP_KERNEL[3], 169 | nonlocal_inds=cfg.NONLOCAL.LOCATION[3], 170 | nonlocal_group=cfg.NONLOCAL.GROUP[3], 171 | nonlocal_pool=cfg.NONLOCAL.POOL[3], 172 | nonlocal_use_bn=cfg.NONLOCAL.USE_BN, 173 | nonlocal_progress=cfg.NONLOCAL.PROGRESS, 174 | instantiation=cfg.NONLOCAL.INSTANTIATION, 175 | trans_func_name=cfg.RESNET.TRANS_FUNC, 176 | stride_1x1=cfg.RESNET.STRIDE_1X1, 177 | inplace_relu=cfg.RESNET.INPLACE_RELU, 178 | dilation=cfg.RESNET.SPATIAL_DILATIONS[3], 179 | norm_module=self.norm_module, 180 | temp_progress=cfg.PGT.ENABLE, 181 | ) 182 | 183 | if self.enable_detection: 184 | self.head = head_helper.ResNetRoIHead( 185 | cfg=cfg, 186 | dim_in=[width_per_group * 32], 187 | num_classes=cfg.MODEL.NUM_CLASSES, 188 | pool_size=[[cfg.DATA.NUM_FRAMES // pool_size[0][0], 1, 1]], 189 | pool_type=cfg.MODEL.FINAL_POOL[1], 190 | resolution=[[cfg.DETECTION.ROI_XFORM_RESOLUTION] * 2], 191 | scale_factor=[cfg.DETECTION.SPATIAL_SCALE_FACTOR], 192 | dropout_rate=cfg.MODEL.DROPOUT_RATE, 193 | act_func=cfg.MODEL.HEAD_ACT, 194 | aligned=cfg.DETECTION.ALIGNED, 195 | ) 196 | else: 197 | self.head = head_helper.ResNetBasicHead( 198 | cfg=cfg, 199 | dim_in=[width_per_group * 32], 200 | num_classes=cfg.MODEL.NUM_CLASSES, 201 | pool_size=[[ 202 | cfg.DATA.NUM_FRAMES // pool_size[0][0], 203 | cfg.DATA.CROP_SIZE // 32 // pool_size[0][1], 204 | cfg.DATA.CROP_SIZE // 32 // pool_size[0][2], 205 | ]], 206 | pool_type=cfg.MODEL.FINAL_POOL, 207 | dropout_rate=cfg.MODEL.DROPOUT_RATE, 208 | act_func=cfg.MODEL.HEAD_ACT, 209 | ) 210 | 211 | def forward(self, x, bboxes=None, slices=None): 212 | x = self.s1(x) 213 | x = self.s2(x) 214 | for pathway in range(self.num_pathways): 215 | pool = getattr(self, "pathway{}_pool".format(pathway)) 216 | x[pathway] = pool(x[pathway]) 217 | x = self.s3(x) 218 | x = self.s4(x) 219 | x = self.s5(x) 220 | if self.enable_detection: 221 | x = self.head(x, bboxes, slices) 222 | else: 223 | x = self.head(x) 224 | return x 
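A quick sanity check on the head pooling arithmetic in _construct_network above: the backbone applies a total spatial stride of 32, so the head sees a CROP_SIZE // 32 feature map per side, further divided by the _POOL1 entry. A minimal sketch with assumed SLOW_8x8-style placeholder values (8 frames, 224-pixel crops, a [1, 1, 1] _POOL1 entry; these numbers are illustrative, not read from a real config):

num_frames, crop_size = 8, 224   # assumed DATA.NUM_FRAMES / DATA.CROP_SIZE
pool_t, pool_s = 1, 1            # assumed _POOL1 entry for a Slow-style arch
head_pool = [
    num_frames // pool_t,        # temporal extent reaching the head
    crop_size // 32 // pool_s,   # spatial extent after the 32x backbone stride
    crop_size // 32 // pool_s,
]
print(head_pool)                 # [8, 7, 7]: one global average over the s5 output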
-------------------------------------------------------------------------------- /slowfast/utils/multigrid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Helper functions for multigrid training.""" 5 | 6 | import numpy as np 7 | 8 | import slowfast.utils.logging as logging 9 | 10 | logger = logging.get_logger(__name__) 11 | 12 | 13 | class MultigridSchedule(object): 14 | """ 15 | This class defines the multigrid training schedule and updates cfg accordingly. 16 | """ 17 | 18 | def init_multigrid(self, cfg): 19 | """ 20 | Update cfg based on multigrid settings. 21 | Args: 22 | cfg (configs): configs that contain training and multigrid specific 23 | hyperparameters. Details can be seen in 24 | slowfast/config/defaults.py. 25 | Returns: 26 | cfg (configs): the updated cfg. 27 | """ 28 | self.schedule = None 29 | # We may modify cfg.TRAIN.BATCH_SIZE, cfg.DATA.NUM_FRAMES, and 30 | # cfg.DATA.TRAIN_CROP_SIZE during training, so we store their original 31 | # values in cfg and use them as global variables. 32 | cfg.MULTIGRID.DEFAULT_B = cfg.TRAIN.BATCH_SIZE 33 | cfg.MULTIGRID.DEFAULT_T = cfg.DATA.NUM_FRAMES 34 | cfg.MULTIGRID.DEFAULT_S = cfg.DATA.TRAIN_CROP_SIZE 35 | 36 | if cfg.MULTIGRID.LONG_CYCLE: 37 | self.schedule = self.get_long_cycle_schedule(cfg) 38 | cfg.SOLVER.STEPS = [0] + [s[-1] for s in self.schedule] 39 | # Fine-tuning phase. 40 | cfg.SOLVER.STEPS[-1] = ( 41 | cfg.SOLVER.STEPS[-2] + cfg.SOLVER.STEPS[-1] 42 | ) // 2 43 | cfg.SOLVER.LRS = [ 44 | cfg.SOLVER.GAMMA ** s[0] * s[1][0] for s in self.schedule 45 | ] 46 | # Fine-tuning phase. 47 | cfg.SOLVER.LRS = cfg.SOLVER.LRS[:-1] + [ 48 | cfg.SOLVER.LRS[-2], 49 | cfg.SOLVER.LRS[-1], 50 | ] 51 | 52 | cfg.SOLVER.MAX_EPOCH = self.schedule[-1][-1] 53 | 54 | elif cfg.MULTIGRID.SHORT_CYCLE: 55 | cfg.SOLVER.STEPS = [ 56 | int(s * cfg.MULTIGRID.EPOCH_FACTOR) for s in cfg.SOLVER.STEPS 57 | ] 58 | cfg.SOLVER.MAX_EPOCH = int( 59 | cfg.SOLVER.MAX_EPOCH * cfg.MULTIGRID.EPOCH_FACTOR 60 | ) 61 | return cfg 62 | 63 | def update_long_cycle(self, cfg, cur_epoch): 64 | """ 65 | Before every epoch, check if the long cycle shape should change. If it 66 | should, update cfg accordingly. 67 | Args: 68 | cfg (configs): configs that contain training and multigrid specific 69 | hyperparameters. Details can be seen in 70 | slowfast/config/defaults.py. 71 | cur_epoch (int): current epoch index. 72 | Returns: 73 | cfg (configs): the updated cfg. 74 | changed (bool): whether the long cycle shape changed at this epoch.
75 | """ 76 | base_b, base_t, base_s = get_current_long_cycle_shape( 77 | self.schedule, cur_epoch 78 | ) 79 | if base_s != cfg.DATA.TRAIN_CROP_SIZE or base_t != cfg.DATA.NUM_FRAMES: 80 | 81 | cfg.DATA.NUM_FRAMES = base_t 82 | cfg.DATA.TRAIN_CROP_SIZE = base_s 83 | cfg.TRAIN.BATCH_SIZE = base_b * cfg.MULTIGRID.DEFAULT_B 84 | 85 | bs_factor = ( 86 | float(cfg.TRAIN.BATCH_SIZE / cfg.NUM_GPUS) 87 | / cfg.MULTIGRID.BN_BASE_SIZE 88 | ) 89 | 90 | if bs_factor < 1: 91 | cfg.BN.NORM_TYPE = "sync_batchnorm" 92 | cfg.BN.NUM_SYNC_DEVICES = int(1.0 / bs_factor) 93 | elif bs_factor > 1: 94 | cfg.BN.NORM_TYPE = "sub_batchnorm" 95 | cfg.BN.NUM_SPLITS = int(bs_factor) 96 | else: 97 | cfg.BN.NORM_TYPE = "batchnorm" 98 | 99 | cfg.MULTIGRID.LONG_CYCLE_SAMPLING_RATE = cfg.DATA.SAMPLING_RATE * ( 100 | cfg.MULTIGRID.DEFAULT_T // cfg.DATA.NUM_FRAMES 101 | ) 102 | logger.info("Long cycle updates:") 103 | logger.info("\tBN.NORM_TYPE: {}".format(cfg.BN.NORM_TYPE)) 104 | if cfg.BN.NORM_TYPE == "sync_batchnorm": 105 | logger.info( 106 | "\tBN.NUM_SYNC_DEVICES: {}".format(cfg.BN.NUM_SYNC_DEVICES) 107 | ) 108 | elif cfg.BN.NORM_TYPE == "sub_batchnorm": 109 | logger.info("\tBN.NUM_SPLITS: {}".format(cfg.BN.NUM_SPLITS)) 110 | logger.info("\tTRAIN.BATCH_SIZE: {}".format(cfg.TRAIN.BATCH_SIZE)) 111 | logger.info( 112 | "\tDATA.NUM_FRAMES x LONG_CYCLE_SAMPLING_RATE: {}x{}".format( 113 | cfg.DATA.NUM_FRAMES, cfg.MULTIGRID.LONG_CYCLE_SAMPLING_RATE 114 | ) 115 | ) 116 | logger.info( 117 | "\tDATA.TRAIN_CROP_SIZE: {}".format(cfg.DATA.TRAIN_CROP_SIZE) 118 | ) 119 | return cfg, True 120 | else: 121 | return cfg, False 122 | 123 | def get_long_cycle_schedule(self, cfg): 124 | """ 125 | Based on multigrid hyperparameters, define the schedule of a long cycle. 126 | Args: 127 | cfg (configs): configs that contains training and multigrid specific 128 | hyperparameters. Details can be seen in 129 | slowfast/config/defaults.py. 130 | Returns: 131 | schedule (list): Specifies a list long cycle base shapes and their 132 | corresponding training epochs. 133 | """ 134 | 135 | steps = cfg.SOLVER.STEPS 136 | 137 | default_size = float( 138 | cfg.DATA.NUM_FRAMES * cfg.DATA.TRAIN_CROP_SIZE ** 2 139 | ) 140 | default_iters = steps[-1] 141 | 142 | # Get shapes and average batch size for each long cycle shape. 143 | avg_bs = [] 144 | all_shapes = [] 145 | for t_factor, s_factor in cfg.MULTIGRID.LONG_CYCLE_FACTORS: 146 | base_t = int(round(cfg.DATA.NUM_FRAMES * t_factor)) 147 | base_s = int(round(cfg.DATA.TRAIN_CROP_SIZE * s_factor)) 148 | if cfg.MULTIGRID.SHORT_CYCLE: 149 | shapes = [ 150 | [ 151 | base_t, 152 | cfg.MULTIGRID.DEFAULT_S 153 | * cfg.MULTIGRID.SHORT_CYCLE_FACTORS[0], 154 | ], 155 | [ 156 | base_t, 157 | cfg.MULTIGRID.DEFAULT_S 158 | * cfg.MULTIGRID.SHORT_CYCLE_FACTORS[1], 159 | ], 160 | [base_t, base_s], 161 | ] 162 | else: 163 | shapes = [[base_t, base_s]] 164 | 165 | # (T, S) -> (B, T, S) 166 | shapes = [ 167 | [int(round(default_size / (s[0] * s[1] * s[1]))), s[0], s[1]] 168 | for s in shapes 169 | ] 170 | avg_bs.append(np.mean([s[0] for s in shapes])) 171 | all_shapes.append(shapes) 172 | 173 | # Get schedule regardless of cfg.MULTIGRID.EPOCH_FACTOR. 
174 | total_iters = 0 175 | schedule = [] 176 | for step_index in range(len(steps) - 1): 177 | step_epochs = steps[step_index + 1] - steps[step_index] 178 | 179 | for long_cycle_index, shapes in enumerate(all_shapes): 180 | cur_epochs = ( 181 | step_epochs * avg_bs[long_cycle_index] / sum(avg_bs) 182 | ) 183 | 184 | cur_iters = cur_epochs / avg_bs[long_cycle_index] 185 | total_iters += cur_iters 186 | schedule.append((step_index, shapes[-1], cur_epochs)) 187 | 188 | iter_saving = default_iters / total_iters 189 | 190 | final_step_epochs = cfg.SOLVER.MAX_EPOCH - steps[-1] 191 | 192 | # We define the fine-tuning phase to have the same amount of iteration 193 | # saving as the rest of the training. 194 | ft_epochs = final_step_epochs / iter_saving * avg_bs[-1] 195 | 196 | schedule.append((step_index + 1, all_shapes[-1][2], ft_epochs)) 197 | 198 | # Obtain the final schedule given the desired cfg.MULTIGRID.EPOCH_FACTOR. 199 | x = ( 200 | cfg.SOLVER.MAX_EPOCH 201 | * cfg.MULTIGRID.EPOCH_FACTOR 202 | / sum(s[-1] for s in schedule) 203 | ) 204 | 205 | final_schedule = [] 206 | total_epochs = 0 207 | for s in schedule: 208 | epochs = s[2] * x 209 | total_epochs += epochs 210 | final_schedule.append((s[0], s[1], int(round(total_epochs)))) 211 | print_schedule(final_schedule) 212 | return final_schedule 213 | 214 | 215 | def print_schedule(schedule): 216 | """ 217 | Log schedule. 218 | """ 219 | logger.info("Long cycle index\tBase shape\tEpochs") 220 | for s in schedule: 221 | logger.info("{}\t{}\t{}".format(s[0], s[1], s[2])) 222 | 223 | 224 | def get_current_long_cycle_shape(schedule, epoch): 225 | """ 226 | Given a schedule and epoch index, return the long cycle base shape. 227 | Args: 228 | schedule (list): the long cycle schedule, i.e. a list of 229 | (long cycle index, base shape, epochs) tuples, as returned by 230 | get_long_cycle_schedule. 231 | epoch (int): current epoch index. 232 | Returns: 233 | shapes (list): A list describing the base shape in a long cycle: 234 | [batch size relative to default, 235 | number of frames, spatial dimension]. 236 | """ 237 | for s in schedule: 238 | if epoch < s[-1]: 239 | return s[1] 240 | return schedule[-1][1] 241 | -------------------------------------------------------------------------------- /slowfast/utils/distributed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Distributed helpers.""" 5 | 6 | import functools 7 | import logging 8 | import pickle 9 | import torch 10 | import torch.distributed as dist 11 | 12 | _LOCAL_PROCESS_GROUP = None 13 | 14 | 15 | def all_gather(tensors): 16 | """ 17 | All-gathers the provided tensors from all processes across machines. 18 | Args: 19 | tensors (list): tensors to perform all gather across all processes in 20 | all machines. 21 | """ 22 | 23 | gather_list = [] 24 | output_tensor = [] 25 | world_size = dist.get_world_size() 26 | for tensor in tensors: 27 | tensor_placeholder = [ 28 | torch.ones_like(tensor) for _ in range(world_size) 29 | ] 30 | dist.all_gather(tensor_placeholder, tensor, async_op=False) 31 | gather_list.append(tensor_placeholder) 32 | for gathered_tensor in gather_list: 33 | output_tensor.append(torch.cat(gathered_tensor, dim=0)) 34 | return output_tensor 35 | 36 | 37 | def all_reduce(tensors, average=True): 38 | """ 39 | All-reduces the provided tensors from all processes across machines.
40 | Args: 41 | tensors (list): tensors to perform all reduce across all processes in 42 | all machines. 43 | average (bool): if True, divides the reduced tensors by the overall 44 | number of processes across all machines. 45 | """ 46 | 47 | for tensor in tensors: 48 | dist.all_reduce(tensor, async_op=False) 49 | if average: 50 | world_size = dist.get_world_size() 51 | for tensor in tensors: 52 | tensor.mul_(1.0 / world_size) 53 | return tensors 54 | 55 | 56 | def init_process_group( 57 | local_rank, 58 | local_world_size, 59 | shard_id, 60 | num_shards, 61 | init_method, 62 | dist_backend="nccl", 63 | ): 64 | """ 65 | Initializes the default process group. 66 | Args: 67 | local_rank (int): the rank on the current local machine. 68 | local_world_size (int): the world size (number of processes running) on 69 | the current local machine. 70 | shard_id (int): the shard index (machine rank) of the current machine. 71 | num_shards (int): number of shards for distributed training. 72 | init_method (string): supporting two different methods for 73 | initializing process groups: 74 | "file": use shared file system to initialize the groups across 75 | different processes. 76 | "tcp": use tcp address to initialize the groups across different processes. 77 | dist_backend (string): backend to use for distributed training. Options 78 | include gloo, mpi and nccl; the details can be found here: 79 | https://pytorch.org/docs/stable/distributed.html 80 | """ 81 | # Sets the GPU to use. 82 | torch.cuda.set_device(local_rank) 83 | # Initialize the process group. 84 | proc_rank = local_rank + shard_id * local_world_size 85 | world_size = local_world_size * num_shards 86 | dist.init_process_group( 87 | backend=dist_backend, 88 | init_method=init_method, 89 | world_size=world_size, 90 | rank=proc_rank, 91 | ) 92 | 93 | 94 | def is_master_proc(num_gpus=8): 95 | """ 96 | Determines if the current process is the master process. 97 | """ 98 | if torch.distributed.is_initialized(): 99 | return dist.get_rank() % num_gpus == 0 100 | else: 101 | return True 102 | 103 | 104 | def get_world_size(): 105 | """ 106 | Get the size of the world. 107 | """ 108 | if not dist.is_available(): 109 | return 1 110 | if not dist.is_initialized(): 111 | return 1 112 | return dist.get_world_size() 113 | 114 | 115 | def get_rank(): 116 | """ 117 | Get the rank of the current process. 118 | """ 119 | if not dist.is_available(): 120 | return 0 121 | if not dist.is_initialized(): 122 | return 0 123 | return dist.get_rank() 124 | 125 | 126 | def synchronize(): 127 | """ 128 | Helper function to synchronize (barrier) among all processes when 129 | using distributed training. 130 | """ 131 | if not dist.is_available(): 132 | return 133 | if not dist.is_initialized(): 134 | return 135 | world_size = dist.get_world_size() 136 | if world_size == 1: 137 | return 138 | dist.barrier() 139 | 140 | 141 | @functools.lru_cache() 142 | def _get_global_gloo_group(): 143 | """ 144 | Return a process group based on gloo backend, containing all the ranks. 145 | The result is cached. 146 | Returns: 147 | (group): pytorch dist group. 148 | """ 149 | if dist.get_backend() == "nccl": 150 | return dist.new_group(backend="gloo") 151 | else: 152 | return dist.group.WORLD 153 | 154 | 155 | def _serialize_to_tensor(data, group): 156 | """ 157 | Serialize arbitrary picklable data to a ByteTensor. Note that only the 158 | `gloo` and `nccl` backends are supported. 159 | Args: 160 | data (data): data to be serialized. 161 | group (group): pytorch dist group.
162 | Returns: 163 | tensor (ByteTensor): the serialized tensor. 164 | """ 165 | 166 | backend = dist.get_backend(group) 167 | assert backend in ["gloo", "nccl"] 168 | device = torch.device("cpu" if backend == "gloo" else "cuda") 169 | 170 | buffer = pickle.dumps(data) 171 | if len(buffer) > 1024 ** 3: 172 | logger = logging.getLogger(__name__) 173 | logger.warning( 174 | "Rank {} trying to all-gather {:.2f} GB of data on device {}".format( 175 | get_rank(), len(buffer) / (1024 ** 3), device 176 | ) 177 | ) 178 | storage = torch.ByteStorage.from_buffer(buffer) 179 | tensor = torch.ByteTensor(storage).to(device=device) 180 | return tensor 181 | 182 | 183 | def _pad_to_largest_tensor(tensor, group): 184 | """ 185 | Pad all the tensors from different GPUs to the size of the largest one. 186 | Args: 187 | tensor (tensor): tensor to pad. 188 | group (group): pytorch dist group. 189 | Returns: 190 | list[int]: size of the tensor, on each rank 191 | Tensor: padded tensor that has the max size 192 | """ 193 | world_size = dist.get_world_size(group=group) 194 | assert ( 195 | world_size >= 1 196 | ), "comm.gather/all_gather must be called from ranks within the given group!" 197 | local_size = torch.tensor( 198 | [tensor.numel()], dtype=torch.int64, device=tensor.device 199 | ) 200 | size_list = [ 201 | torch.zeros([1], dtype=torch.int64, device=tensor.device) 202 | for _ in range(world_size) 203 | ] 204 | dist.all_gather(size_list, local_size, group=group) 205 | size_list = [int(size.item()) for size in size_list] 206 | 207 | max_size = max(size_list) 208 | 209 | # we pad the tensor because torch all_gather does not support 210 | # gathering tensors of different shapes 211 | if local_size != max_size: 212 | padding = torch.zeros( 213 | (max_size - local_size,), dtype=torch.uint8, device=tensor.device 214 | ) 215 | tensor = torch.cat((tensor, padding), dim=0) 216 | return size_list, tensor 217 | 218 | 219 | def all_gather_unaligned(data, group=None): 220 | """ 221 | Run all_gather on arbitrary picklable data (not necessarily tensors). 222 | 223 | Args: 224 | data: any picklable object 225 | group: a torch process group. By default, will use a group which 226 | contains all ranks on gloo backend. 227 | 228 | Returns: 229 | list[data]: list of data gathered from each rank 230 | """ 231 | if get_world_size() == 1: 232 | return [data] 233 | if group is None: 234 | group = _get_global_gloo_group() 235 | if dist.get_world_size(group) == 1: 236 | return [data] 237 | 238 | tensor = _serialize_to_tensor(data, group) 239 | 240 | size_list, tensor = _pad_to_largest_tensor(tensor, group) 241 | max_size = max(size_list) 242 | 243 | # receiving Tensor from all ranks 244 | tensor_list = [ 245 | torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) 246 | for _ in size_list 247 | ] 248 | dist.all_gather(tensor_list, tensor, group=group) 249 | 250 | data_list = [] 251 | for size, tensor in zip(size_list, tensor_list): 252 | buffer = tensor.cpu().numpy().tobytes()[:size] 253 | data_list.append(pickle.loads(buffer)) 254 | 255 | return data_list 256 | 257 | 258 | def init_distributed_training(cfg): 259 | """ 260 | Initialize variables needed for distributed training.
261 | """ 262 | if cfg.NUM_GPUS == 1: 263 | return 264 | num_gpus_per_machine = cfg.NUM_GPUS 265 | num_machines = dist.get_world_size() // num_gpus_per_machine 266 | for i in range(num_machines): 267 | ranks_on_i = list( 268 | range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine) 269 | ) 270 | pg = dist.new_group(ranks_on_i) 271 | if i == cfg.SHARD_ID: 272 | global _LOCAL_PROCESS_GROUP 273 | _LOCAL_PROCESS_GROUP = pg 274 | 275 | 276 | def get_local_size() -> int: 277 | """ 278 | Returns: 279 | The size of the per-machine process group, 280 | i.e. the number of processes per machine. 281 | """ 282 | if not dist.is_available(): 283 | return 1 284 | if not dist.is_initialized(): 285 | return 1 286 | return dist.get_world_size(group=_LOCAL_PROCESS_GROUP) 287 | 288 | 289 | def get_local_rank() -> int: 290 | """ 291 | Returns: 292 | The rank of the current process within the local (per-machine) process group. 293 | """ 294 | if not dist.is_available(): 295 | return 0 296 | if not dist.is_initialized(): 297 | return 0 298 | assert _LOCAL_PROCESS_GROUP is not None 299 | return dist.get_rank(group=_LOCAL_PROCESS_GROUP) 300 | --------------------------------------------------------------------------------