├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── GETTING_STARTED.md ├── INSTALL.md ├── LICENSE ├── MODEL_ZOO.md ├── README.md ├── configs ├── AVA │ ├── SLOWFAST_32x2_R50_SHORT.yaml │ ├── SLOW_8x8_R50_SHORT.yaml │ └── c2 │ │ ├── SLOWFAST_32x2_R101_50_50.yaml │ │ ├── SLOWFAST_32x2_R101_50_50_v2.1.yaml │ │ ├── SLOWFAST_32x2_R50.yaml │ │ ├── SLOWFAST_64x2_R101_50_50.yaml │ │ └── SLOW_8x8_R50.yaml ├── Charades │ ├── SLOWFAST_16x8_R50.yaml │ └── SLOWFAST_16x8_R50_multigrid.yaml ├── Kinetics │ ├── AVSLOWFAST_4x16_R50.yaml │ ├── AVSLOWFAST_8x8_R50.yaml │ ├── C2D_8x8_R50.yaml │ ├── C2D_8x8_R50_IN1K.yaml │ ├── C2D_NLN_8x8_R50.yaml │ ├── C2D_NLN_8x8_R50_IN1K.yaml │ ├── I3D_8x8_R101.yaml │ ├── I3D_8x8_R50.yaml │ ├── I3D_8x8_R50_IN1K.yaml │ ├── I3D_NLN_8x8_R101.yaml │ ├── I3D_NLN_8x8_R50.yaml │ ├── I3D_NLN_8x8_R50_IN1K.yaml │ ├── SLOWFAST_4x16_R50.yaml │ ├── SLOWFAST_8x8_R50.yaml │ ├── SLOWFAST_8x8_R50_stepwise.yaml │ ├── SLOWFAST_8x8_R50_stepwise_multigrid.yaml │ ├── SLOWFAST_NLN_4x16_R50.yaml │ ├── SLOWFAST_NLN_8x8_R50.yaml │ ├── SLOW_4x16_R50.yaml │ ├── SLOW_8x8_R50.yaml │ ├── SLOW_NLN_4x16_R50.yaml │ ├── SLOW_NLN_8x8_R50.yaml │ └── c2 │ │ ├── C2D_NOPOOL_8x8_R50.yaml │ │ ├── I3D_8x8_R50.yaml │ │ ├── I3D_NLN_8x8_R50.yaml │ │ ├── SLOWFAST_16x8_R101_50_50.yaml │ │ ├── SLOWFAST_4x16_R50.yaml │ │ ├── SLOWFAST_8x8_R101_101_101.yaml │ │ ├── SLOWFAST_8x8_R101_50_101.yaml │ │ ├── SLOWFAST_8x8_R101_50_50.yaml │ │ ├── SLOWFAST_8x8_R50.yaml │ │ ├── SLOWFAST_NLN_16x8_R101_50_50.yaml │ │ ├── SLOW_4x16_R50.yaml │ │ └── SLOW_8x8_R50.yaml └── SSv2 │ ├── SLOWFAST_16x8_R50.yaml │ └── SLOWFAST_16x8_R50_multigrid.yaml ├── demo ├── AVA │ └── SLOWFAST_32x2_R101_50_50.yaml ├── Kinetics │ └── SLOWFAST_8x8_R50.yaml └── ava_demo.gif ├── linter.sh ├── projects ├── avslowfast │ └── README.md └── multigrid │ ├── README.md │ └── multigrid.png ├── setup.cfg ├── setup.py ├── slowfast ├── __init__.py ├── config │ ├── __init__.py │ ├── custom_config.py │ └── defaults.py ├── datasets │ ├── DATASET.md │ ├── __init__.py │ ├── ava_dataset.py │ ├── ava_helper.py │ ├── build.py │ ├── charades.py │ ├── cv2_transform.py │ ├── decoder.py │ ├── kinetics.py │ ├── loader.py │ ├── multigrid_helper.py │ ├── ssv2.py │ ├── transform.py │ ├── utils.py │ └── video_container.py ├── models │ ├── __init__.py │ ├── batchnorm_helper.py │ ├── build.py │ ├── custom_video_model_builder.py │ ├── head_helper.py │ ├── losses.py │ ├── nonlocal_helper.py │ ├── optimizer.py │ ├── resnet_helper.py │ ├── stem_helper.py │ └── video_model_builder.py ├── utils │ ├── __init__.py │ ├── ava_eval_helper.py │ ├── ava_evaluation │ │ ├── README.md │ │ ├── __init__.py │ │ ├── ava_action_list_v2.1_for_activitynet_2018.pbtxt.txt │ │ ├── label_map_util.py │ │ ├── metrics.py │ │ ├── np_box_list.py │ │ ├── np_box_list_ops.py │ │ ├── np_box_mask_list.py │ │ ├── np_box_mask_list_ops.py │ │ ├── np_box_ops.py │ │ ├── np_mask_ops.py │ │ ├── object_detection_evaluation.py │ │ ├── per_image_evaluation.py │ │ └── standard_fields.py │ ├── benchmark.py │ ├── bn_helper.py │ ├── c2_model_loading.py │ ├── checkpoint.py │ ├── distributed.py │ ├── env.py │ ├── logging.py │ ├── lr_policy.py │ ├── meters.py │ ├── metrics.py │ ├── misc.py │ ├── multigrid.py │ ├── multiprocessing.py │ ├── parser.py │ └── weight_init_helper.py └── visualization │ ├── __init__.py │ ├── async_predictor.py │ ├── ava_demo_precomputed_boxes.py │ ├── demo_loader.py │ ├── gradcam_utils.py │ ├── predictor.py │ ├── tensorboard_vis.py │ ├── utils.py │ └── video_visualizer.py └── tools ├── benchmark.py ├── 
demo_net.py ├── run_net.py ├── test_net.py ├── train_net.py └── visualization.py /.gitignore: -------------------------------------------------------------------------------- 1 | # LaTex 2 | main.pdf 3 | supp.pdf 4 | **/*.aux 5 | **/*.log 6 | **/*.synctex.gz 7 | **/*.aux 8 | **/*.bbl 9 | **/*.blg 10 | **/*.brf 11 | **/*.sublime-project 12 | **/*.sublime-workspace 13 | **/*.fdb_latexmk 14 | **/*.fls 15 | **/*.toc 16 | 17 | tools/debug.sh 18 | 19 | # MacOS stuff 20 | .DS_Store 21 | **/.DS_Store 22 | 23 | **/__pycache__ 24 | **/*.pyc 25 | **/.settings 26 | .project 27 | .pydevproject 28 | 29 | # external/* 30 | 31 | # Byte-compiled / optimized / DLL files 32 | __pycache__/ 33 | *.py[cod] 34 | *$py.class 35 | 36 | # C extensions 37 | *.so 38 | 39 | # Distribution / packaging 40 | .Python 41 | build/ 42 | develop-eggs/ 43 | dist/ 44 | downloads/ 45 | eggs/ 46 | .eggs/ 47 | lib/ 48 | lib64/ 49 | parts/ 50 | sdist/ 51 | var/ 52 | wheels/ 53 | *.egg-info/ 54 | .installed.cfg 55 | *.egg 56 | MANIFEST 57 | 58 | # PyInstaller 59 | # Usually these files are written by a python script from a template 60 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 61 | *.manifest 62 | *.spec 63 | 64 | # Installer logs 65 | pip-log.txt 66 | pip-delete-this-directory.txt 67 | 68 | # Unit test / coverage reports 69 | htmlcov/ 70 | .tox/ 71 | .coverage 72 | .coverage.* 73 | .cache 74 | nosetests.xml 75 | coverage.xml 76 | *.cover 77 | .hypothesis/ 78 | .pytest_cache/ 79 | 80 | # Translations 81 | *.mo 82 | *.pot 83 | 84 | # Django stuff: 85 | *.log 86 | local_settings.py 87 | db.sqlite3 88 | 89 | # Flask stuff: 90 | instance/ 91 | .webassets-cache 92 | 93 | # Scrapy stuff: 94 | .scrapy 95 | 96 | # Sphinx documentation 97 | docs/_build/ 98 | 99 | # PyBuilder 100 | target/ 101 | 102 | # Jupyter Notebook 103 | .ipynb_checkpoints 104 | 105 | # pyenv 106 | .python-version 107 | 108 | # celery beat schedule file 109 | celerybeat-schedule 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to PySlowFast 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `master`. 9 | 2. If you've changed APIs, update the documentation. 10 | 3. Ensure the test suite passes. 11 | 4. Make sure your code lints. 12 | 5. Ensure no regressions in baseline model speed and accuracy. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 
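For the linting step (item 4 above), note that the repository ships a `linter.sh` script at its root; a minimal sketch, assuming it is run from the repository root:
```
./linter.sh
```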
14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: <https://code.facebook.com/cla> 20 | 21 | ## Issues 22 | 23 | Please ensure your description is clear and has sufficient instructions to reproduce the issue. The recommended issue format is: 24 | ------ 25 | 26 | #### To Reproduce 27 | ```How to reproduce the issue.``` 28 | #### Expected behavior 29 | ```Expected output.``` 30 | #### Environment 31 | ```Your environment.``` 32 | 33 | ------ 34 | 35 | ## Coding Style 36 | * 4 spaces for indentation rather than tabs 37 | * 80 character line length 38 | * PEP8 formatting 39 | 40 | ## License 41 | By contributing to PySlowFast, you agree that your contributions will be licensed under the LICENSE file in the root directory of this source tree. 42 | -------------------------------------------------------------------------------- /GETTING_STARTED.md: -------------------------------------------------------------------------------- 1 | # Getting Started with PySlowFast 2 | 3 | This document provides a brief introduction to launching training and testing jobs in PySlowFast. Before launching any job, make sure you have properly installed PySlowFast following the instructions in [README.md](README.md) and prepared the dataset in the correct format following [DATASET.md](slowfast/datasets/DATASET.md). 4 | 5 | ## Train a Standard Model from Scratch 6 | 7 | Here we start by training a simple C2D model: 8 | 9 | ``` 10 | python tools/run_net.py \ 11 | --cfg configs/Kinetics/C2D_8x8_R50.yaml \ 12 | DATA.PATH_TO_DATA_DIR path_to_your_dataset \ 13 | NUM_GPUS 2 \ 14 | TRAIN.BATCH_SIZE 16 \ 15 | ``` 16 | You may need to pass the location of your dataset on the command line by adding `DATA.PATH_TO_DATA_DIR path_to_your_dataset`, or you can simply add 17 | 18 | ``` 19 | DATA: 20 | PATH_TO_DATA_DIR: path_to_your_dataset 21 | ``` 22 | to the YAML config file, so you do not need to pass it on the command line every time. 23 | 24 | 25 | If you want to launch a quick job for debugging on your local machine, you may also want to add: 26 | ``` 27 | DATA_LOADER.NUM_WORKERS 0 \ 28 | NUM_GPUS 2 \ 29 | TRAIN.BATCH_SIZE 16 \ 30 | ``` 31 | 32 | 33 | 34 | ## Resume from an Existing Checkpoint 35 | If your checkpoint was trained with PyTorch, add the following option on the command line, or add it to the YAML config: 36 | 37 | ``` 38 | TRAIN.CHECKPOINT_FILE_PATH path_to_your_PyTorch_checkpoint 39 | ``` 40 | 41 | If the checkpoint was trained with Caffe2, then do the following: 42 | 43 | ``` 44 | TRAIN.CHECKPOINT_FILE_PATH path_to_your_Caffe2_checkpoint \ 45 | TRAIN.CHECKPOINT_TYPE caffe2 46 | ``` 47 | 48 | If you need to perform inflation on the checkpoint, remember to set `TRAIN.CHECKPOINT_INFLATE` to True. 49 | 50 | 51 | ## Perform Test 52 | We have `TRAIN.ENABLE` and `TEST.ENABLE` to control whether training or testing is required for the current job. If only testing is needed, set `TRAIN.ENABLE` to False, and do not forget to pass the path of the model you want to test via `TEST.CHECKPOINT_FILE_PATH`.
53 | ``` 54 | python tools/run_net.py \ 55 | --cfg configs/Kinetics/C2D_8x8_R50.yaml \ 56 | DATA.PATH_TO_DATA_DIR path_to_your_dataset \ 57 | TEST.CHECKPOINT_FILE_PATH path_to_your_checkpoint \ 58 | TRAIN.ENABLE False \ 59 | ``` 60 | ## Run the Demo on Videos/Camera 61 | 62 | Currently, the demo is not supported on multiple GPUs. Set the following in your config file: 63 | * `NUM_GPUS: 1` 64 | * `NUM_SHARDS: 1` 65 | * `DEMO.WEBCAM`: Set this to the index of a camera to run the demo from a webcam. Otherwise, set `DEMO.INPUT_VIDEO` to the path of an input video. 66 | * `DEMO.ENABLE: True` 67 | * `DEMO.LABEL_FILE_PATH`: path to a JSON label file that maps {label: label_id}. 68 | * `CHECKPOINT_FILE_PATH: "path/to/the/pre-trained/model.pkl"` (skip this if you decide to place the model in `OUTPUT_DIR`, which defaults to `./checkpoints/`) 69 | 70 | Optional: 71 | * `DEMO.DISPLAY_WIDTH`: custom display window width. 72 | * `DEMO.DISPLAY_HEIGHT`: custom display window height. 73 | * `DEMO.OUTPUT_FILE`: set this to a path if you want to write the output to a video file instead of displaying it in a window. 74 | * `DEMO.BUFFER_SIZE`: number of overlapping frames between two consecutive input clips. Set this to a positive number to make more frequent predictions 75 | (at the expense of slower speed). 76 | 77 | If you want to run only the demo process, set `TRAIN.ENABLE` and `TEST.ENABLE` to False. 78 | 79 | ### Classification 80 | Modify a `.yaml` in `configs/Kinetics/` corresponding to the pretrained model you want to use (you can look at `demo/Kinetics/SLOWFAST_8x8_R50.yaml` for reference). 81 | 82 | ### Detection 83 | Modify a `.yaml` in `configs/AVA/` corresponding to the pretrained model you want to use (you can look at `demo/AVA/SLOWFAST_32x2_R101_50_50.yaml` for reference). 84 | 85 | Optional: 86 | * `DEMO.DETECTRON2_THRESH`: threshold for choosing bounding boxes output by Detectron2 (defaults to 0.9). 87 | * Pick a different [Detectron2](https://github.com/facebookresearch/detectron2/blob/master/MODEL_ZOO.md) object detection model config and weights. Set the parameters `DEMO.DETECTRON2_CFG` and `DEMO.DETECTRON2_WEIGHTS` (defaults to the `faster_rcnn_R_50_FPN_3x.yaml` config and the corresponding weights). 88 | 89 | ### Run command 90 | ``` 91 | python tools/run_net.py --cfg path/to/.yaml 92 | ``` 93 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Requirements 4 | - Python >= 3.6 5 | - Numpy 6 | - PyTorch 1.3 7 | - [fvcore](https://github.com/facebookresearch/fvcore/): `pip install 'git+https://github.com/facebookresearch/fvcore'` 8 | - [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. 9 | You can install them together at [pytorch.org](https://pytorch.org) to make sure of this.
10 | - simplejson: `pip install simplejson` 11 | - GCC >= 4.9 12 | - PyAV: `conda install av -c conda-forge` 13 | - ffmpeg (4.0 is preferred; it will be installed along with PyAV) 14 | - PyYaml: (will be installed along with fvcore) 15 | - tqdm: (will be installed along with fvcore) 16 | - psutil: `pip install psutil` 17 | - OpenCV: `pip install opencv-python` 18 | - torchvision: `pip install torchvision` or `conda install torchvision -c pytorch` 19 | - librosa: `pip install librosa` (if using Audiovisual SlowFast Networks) 20 | - tensorboard: `pip install tensorboard` 21 | - moviepy: (optional, for visualizing video on tensorboard) `conda install -c conda-forge moviepy` or `pip install moviepy` 22 | - [Detectron2](https://github.com/facebookresearch/detectron2): 23 | ``` 24 | pip install -U torch torchvision cython 25 | pip install -U 'git+https://github.com/facebookresearch/fvcore.git' 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI' 26 | git clone https://github.com/facebookresearch/detectron2 detectron2_repo 27 | pip install -e detectron2_repo 28 | # You can find more details at https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md 29 | ``` 30 | 31 | ## PyTorch 32 | Please follow the official PyTorch instructions to install from source: 33 | ``` 34 | git clone --recursive https://github.com/pytorch/pytorch 35 | ``` 36 | 37 | ## PySlowFast 38 | 39 | Clone the PySlowFast Video Understanding repository. 40 | ``` 41 | git clone https://github.com/facebookresearch/slowfast 42 | ``` 43 | 44 | Add this repository to $PYTHONPATH. 45 | ``` 46 | export PYTHONPATH=/path/to/SlowFast/slowfast:$PYTHONPATH 47 | ``` 48 | 49 | ### Build PySlowFast 50 | 51 | After installing the above dependencies, run: 52 | ``` 53 | git clone https://github.com/facebookresearch/slowfast 54 | cd SlowFast 55 | python setup.py build develop 56 | ``` 57 | 58 | Now that the installation is finished, run the pipeline with: 59 | ``` 60 | python tools/run_net.py --cfg configs/Kinetics/C2D_8x8_R50.yaml NUM_GPUS 1 TRAIN.BATCH_SIZE 8 SOLVER.BASE_LR 0.0125 DATA.PATH_TO_DATA_DIR path_to_your_data_folder 61 | ``` 62 | -------------------------------------------------------------------------------- /MODEL_ZOO.md: -------------------------------------------------------------------------------- 1 | # PySlowFast Model Zoo and Baselines 2 | 3 | ## Kinetics 4 | 5 | We provide the original Caffe2 pretrained models for the heavy models (testing a Caffe2 pretrained model in PyTorch may give a small difference in performance): 6 | 7 | | architecture | depth | pretrain | frame length x sample rate | top1 | top5 | model | config | 8 | | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | 9 | | C2D | R50 | Train From Scratch | 8 x 8 | 67.2 | 87.8 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/kinetics400/C2D_NOPOOL_8x8_R50.pkl) | Kinetics/c2/C2D_NOPOOL_8x8_R50 | 10 | | I3D | R50 | Train From Scratch | 8 x 8 | 73.5 | 90.8 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/kinetics400/I3D_8x8_R50.pkl) | Kinetics/c2/I3D_8x8_R50 | 11 | | I3D NLN | R50 | Train From Scratch | 8 x 8 | 74.0 | 91.1 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/kinetics400/I3D_NLN_8x8_R50.pkl) | Kinetics/c2/I3D_NLN_8x8_R50 | 12 | | Slow | R50 | Train From Scratch | 4 x 16 | 72.7 | 90.3 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/kinetics400/SLOWONLY_4x16_R50.pkl) | Kinetics/c2/SLOW_4x16_R50 | 13 | | Slow | R50 |
Train From Scratch | 8 x 8 | 74.8 | 91.6 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/kinetics400/SLOWONLY_8x8_R50.pkl) | Kinetics/c2/SLOW_8x8_R50 | 14 | | SlowFast | R50 | Train From Scratch | 4 x 16 | 75.6 | 92.0 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/kinetics400/SLOWFAST_4x16_R50.pkl) | Kinetics/c2/SLOWFAST_4x16_R50 | 15 | | SlowFast | R50 | Train From Scratch | 8 x 8 | 77.0 | 92.6 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/kinetics400/SLOWFAST_8x8_R50.pkl) | Kinetics/c2/SLOWFAST_8x8_R50 | 16 | | SlowFast | R101 | Train From Scratch | 8 x 8 | 78.0 | 93.3 | [`link`](coming_soon) | Kinetics/c2/SLOWFAST_8x8_R101_101_101 | 17 | | SlowFast | R101 | Train From Scratch | 16 x 8 | 78.9 | 93.5 | [`link`](coming_soon) | Kinetics/c2/SLOWFAST_16x8_R101_50_50 | 18 | 19 | 20 | ## AVA 21 | 22 | | architecture | depth | Pretrain Model | frame length x sample rate | mAP | AVA version | model | 23 | | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |------------- | 24 | | Slow | R50 | [Kinetics 400](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/ava/pretrain/C2D_8x8_R50.pkl) | 4 x 16 | 19.5 | 2.2 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/ava/C2D_8x8_R50.pkl) | 25 | | SlowFast | R101 | [Kinetics 600](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/ava/pretrain/SLOWFAST_32x2_R101_50_50_v2.1.pkl) | 8 x 8 | 28.2 | 2.1 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/ava/SLOWFAST_32x2_R101_50_50_v2.1.pkl) | 26 | | SlowFast | R101 | [Kinetics 600](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/ava/pretrain/SLOWFAST_32x2_R101_50_50.pkl) | 8 x 8 | 29.1 | 2.2 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/ava/SLOWFAST_32x2_R101_50_50.pkl) | 27 | | SlowFast | R101 | [Kinetics 600](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/ava/pretrain/SLOWFAST_64x2_R101_50_50.pkl) | 16 x 8 | 29.4 | 2.2 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/ava/SLOWFAST_64x2_R101_50_50.pkl) | 28 | 29 | ## Multigrid Training 30 | 31 | ***Update June 2020:*** In the following we provide (reimplemented) models from the "[A Multigrid Method for Efficiently Training Video Models](https://arxiv.org/abs/1912.00998)" paper. The multigrid method trains about 3-6x faster than standard training on multiple datasets. See [projects/multigrid](projects/multigrid/README.md) for more information. The following provides models, results, and example config files. 32 | 33 | 34 | #### Kinetics: 35 | | architecture | depth | pretrain | frame length x sample rate | training | top1 | top5 | model | config | 36 | | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | 37 | | SlowFast | R50 | Train From Scratch | 8 x 8 | Standard | 76.8 | 92.7 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/pyslowfast/model_zoo/multigrid/model_zoo/Kinetics/SLOWFAST_8x8_R50_stepwise.pkl) | Kinetics/SLOWFAST_8x8_R50_stepwise | 38 | | SlowFast | R50 | Train From Scratch | 8 x 8 | Multigrid | 76.6 | 92.7 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/pyslowfast/model_zoo/multigrid/model_zoo/Kinetics/SLOWFAST_8x8_R50_stepwise_multigrid.pkl) | Kinetics/SLOWFAST_8x8_R50_stepwise_multigrid | 39 | 40 | (Here we use a stepwise learning rate schedule.)
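A multigrid job is launched the same way as any other config (a sketch following the pattern in GETTING_STARTED.md; `path_to_your_dataset` is a placeholder):
```
python tools/run_net.py \
  --cfg configs/Kinetics/SLOWFAST_8x8_R50_stepwise_multigrid.yaml \
  DATA.PATH_TO_DATA_DIR path_to_your_dataset
```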
41 | 42 | #### Something-Something V2: 43 | | architecture | depth | pretrain | frame length x sample rate | training | top1 | top5 | model | config | 44 | | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | 45 | | SlowFast | R50 | [Kinetics 400](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/kinetics400/SLOWFAST_8x8_R50.pkl) | 16 x 8 | Standard | 63.0 | 88.5 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/pyslowfast/model_zoo/multigrid/model_zoo/SSv2/SLOWFAST_16x8_R50.pkl) | SSv2/SLOWFAST_16x8_R50 | 46 | | SlowFast | R50 | [Kinetics 400](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/kinetics400/SLOWFAST_8x8_R50.pkl) | 16 x 8 | Multigrid | 63.5 | 88.7 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/pyslowfast/model_zoo/multigrid/model_zoo/SSv2/SLOWFAST_16x8_R50_multigrid.pkl) | SSv2/SLOWFAST_16x8_R50_multigrid | 47 | 48 | 49 | #### Charades 50 | | architecture | depth | pretrain | frame length x sample rate | training | mAP | model | config | 51 | | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | 52 | | SlowFast | R50 | [Kinetics 400](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/kinetics400/SLOWFAST_8x8_R50.pkl) | 16 x 8 | Standard | 38.9 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/pyslowfast/model_zoo/multigrid/model_zoo/Charades/SLOWFAST_16x8_R50.pkl) | Charades/SLOWFAST_16x8_R50 | 53 | | SlowFast | R50 | [Kinetics 400](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/kinetics400/SLOWFAST_8x8_R50.pkl) | 16 x 8 | Multigrid | 38.6 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/pyslowfast/model_zoo/multigrid/model_zoo/Charades/SLOWFAST_16x8_R50_multigrid.pkl) | Charades/SLOWFAST_16x8_R50_multigrid | 54 | 55 | 56 | ## ImageNet 57 | 58 | We also release an ImageNet pretrained model in case finetuning from ImageNet pretraining is preferred. The reported numbers are top-1 and top-5 error rates obtained by center-crop testing on the validation set. 59 | 60 | | architecture | depth | Top1 err | Top5 err | model | 61 | | ------------- | ------------- | ------------- | ------------- | ------------- | 62 | | ResNet | R50 | 23.6 | 6.8 | [`link`](https://dl.fbaipublicfiles.com/pyslowfast/model_zoo/kinetics400/R50_IN1K.pyth) | 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PySlowFast 2 | 3 | PySlowFast is an open source video understanding codebase from FAIR that provides state-of-the-art video classification models with efficient training. This repository includes implementations of the following methods: 4 | 5 | - [SlowFast Networks for Video Recognition](https://arxiv.org/abs/1812.03982) 6 | - [Non-local Neural Networks](https://arxiv.org/abs/1711.07971) 7 | - [A Multigrid Method for Efficiently Training Video Models](https://arxiv.org/abs/1912.00998) 8 | 9 |
10 | ![PySlowFast demo](demo/ava_demo.gif) 11 |
12 | 13 | ## Introduction 14 | 15 | The goal of PySlowFast is to provide a high-performance, lightweight PyTorch codebase with state-of-the-art video backbones for video understanding research on different tasks (classification, detection, etc.). It is designed to support rapid implementation and evaluation of novel video research ideas. PySlowFast includes implementations of the following backbone network architectures: 16 | 17 | - SlowFast 18 | - Slow 19 | - C2D 20 | - I3D 21 | - Non-local Network 22 | 23 | ## Updates 24 | - We now support [Multigrid Training](https://arxiv.org/abs/1912.00998) for efficiently training video models. See [`projects/multigrid`](./projects/multigrid/README.md) for more information. 25 | - PySlowFast is released in conjunction with our [ICCV 2019 Tutorial](https://alexander-kirillov.github.io/tutorials/visual-recognition-iccv19/). 26 | 27 | ## License 28 | 29 | PySlowFast is released under the [Apache 2.0 license](LICENSE). 30 | 31 | ## Model Zoo and Baselines 32 | 33 | We provide a large set of baseline results and trained models available for download in the PySlowFast [Model Zoo](MODEL_ZOO.md). 34 | 35 | ## Installation 36 | 37 | Please find installation instructions for PyTorch and PySlowFast in [INSTALL.md](INSTALL.md). You may follow the instructions in [DATASET.md](slowfast/datasets/DATASET.md) to prepare the datasets. 38 | 39 | ## Quick Start 40 | 41 | Follow the example in [GETTING_STARTED.md](GETTING_STARTED.md) to start playing with video models in PySlowFast. 42 | 43 | ## Contributors 44 | PySlowFast is written and maintained by [Haoqi Fan](https://haoqifan.github.io/), [Yanghao Li](https://lyttonhao.github.io/), [Bo Xiong](https://www.cs.utexas.edu/~bxiong/), [Wan-Yen Lo](https://www.linkedin.com/in/wanyenlo/), [Christoph Feichtenhofer](https://feichtenhofer.github.io/). 45 | 46 | ## Citing PySlowFast 47 | If you find PySlowFast useful in your research, please use the following BibTeX entry for citation. 48 | ```BibTeX 49 | @misc{fan2020pyslowfast, 50 | author = {Haoqi Fan and Yanghao Li and Bo Xiong and Wan-Yen Lo and 51 | Christoph Feichtenhofer}, 52 | title = {PySlowFast}, 53 | howpublished = {\url{https://github.com/facebookresearch/slowfast}}, 54 | year = {2020} 55 | } 56 | ``` 57 | -------------------------------------------------------------------------------- /configs/AVA/SLOWFAST_32x2_R50_SHORT.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: ava 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to the pretrain checkpoint file.
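# (A note based on MODEL_ZOO.md, not part of the original config: for AVA training
# this is typically a Kinetics-pretrained checkpoint, e.g. one of the "Pretrain
# Model" links in the AVA table of MODEL_ZOO.md, in Caffe2 .pkl format to match
# CHECKPOINT_TYPE below.)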
9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 32 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | DETECTION: 18 | ENABLE: True 19 | ALIGNED: True 20 | AVA: 21 | DETECTION_SCORE_THRESH: 0.8 22 | TRAIN_PREDICT_BOX_LISTS: [ 23 | "ava_train_v2.2.csv", 24 | "person_box_67091280_iou90/ava_detection_train_boxes_and_labels_include_negative_v2.2.csv", 25 | ] 26 | TEST_PREDICT_BOX_LISTS: ["person_box_67091280_iou90/ava_detection_val_boxes_and_labels.csv"] 27 | SLOWFAST: 28 | ALPHA: 4 29 | BETA_INV: 8 30 | FUSION_CONV_CHANNEL_RATIO: 2 31 | FUSION_KERNEL_SZ: 7 32 | RESNET: 33 | ZERO_INIT_FINAL_BN: True 34 | WIDTH_PER_GROUP: 64 35 | NUM_GROUPS: 1 36 | DEPTH: 50 37 | TRANS_FUNC: bottleneck_transform 38 | STRIDE_1X1: False 39 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 40 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [2, 2]] 41 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [1, 1]] 42 | NONLOCAL: 43 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 44 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 45 | INSTANTIATION: dot_product 46 | POOL: [[[1, 2, 2], [1, 2, 2]], [[1, 2, 2], [1, 2, 2]], [[1, 2, 2], [1, 2, 2]], [[1, 2, 2], [1, 2, 2]]] 47 | BN: 48 | USE_PRECISE_STATS: False 49 | NUM_BATCHES_PRECISE: 200 50 | SOLVER: 51 | BASE_LR: 0.1 52 | LR_POLICY: steps_with_relative_lrs 53 | STEPS: [0, 10, 15, 20] 54 | LRS: [1, 0.1, 0.01, 0.001] 55 | MAX_EPOCH: 20 56 | MOMENTUM: 0.9 57 | WEIGHT_DECAY: 1e-7 58 | WARMUP_EPOCHS: 5.0 59 | WARMUP_START_LR: 0.000125 60 | OPTIMIZING_METHOD: sgd 61 | MODEL: 62 | NUM_CLASSES: 80 63 | ARCH: slowfast 64 | MODEL_NAME: SlowFast 65 | LOSS_FUNC: bce 66 | DROPOUT_RATE: 0.5 67 | HEAD_ACT: sigmoid 68 | TEST: 69 | ENABLE: True 70 | DATASET: ava 71 | BATCH_SIZE: 8 72 | DATA_LOADER: 73 | NUM_WORKERS: 2 74 | PIN_MEMORY: True 75 | NUM_GPUS: 8 76 | NUM_SHARDS: 1 77 | RNG_SEED: 0 78 | OUTPUT_DIR: . 79 | -------------------------------------------------------------------------------- /configs/AVA/SLOW_8x8_R50_SHORT.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: ava 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to the pretrain checkpoint file. 
9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 4 12 | SAMPLING_RATE: 16 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3] 17 | DETECTION: 18 | ENABLE: True 19 | ALIGNED: True 20 | AVA: 21 | DETECTION_SCORE_THRESH: 0.9 22 | TRAIN_PREDICT_BOX_LISTS: [ 23 | "ava_train_v2.2.csv", 24 | "person_box_67091280_iou90/ava_detection_train_boxes_and_labels_include_negative_v2.2.csv", 25 | ] 26 | TEST_PREDICT_BOX_LISTS: ["person_box_67091280_iou90/ava_detection_val_boxes_and_labels.csv"] 27 | RESNET: 28 | ZERO_INIT_FINAL_BN: True 29 | WIDTH_PER_GROUP: 64 30 | NUM_GROUPS: 1 31 | DEPTH: 50 32 | TRANS_FUNC: bottleneck_transform 33 | STRIDE_1X1: False 34 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 35 | SPATIAL_DILATIONS: [[1], [1], [1], [2]] 36 | SPATIAL_STRIDES: [[1], [2], [2], [1]] 37 | NONLOCAL: 38 | LOCATION: [[[]], [[]], [[]], [[]]] 39 | GROUP: [[1], [1], [1], [1]] 40 | INSTANTIATION: softmax 41 | BN: 42 | USE_PRECISE_STATS: False 43 | NUM_BATCHES_PRECISE: 200 44 | SOLVER: 45 | BASE_LR: 0.1 46 | LR_POLICY: steps_with_relative_lrs 47 | STEPS: [0, 10, 15, 20] 48 | LRS: [1, 0.1, 0.01, 0.001] 49 | MAX_EPOCH: 20 50 | MOMENTUM: 0.9 51 | WEIGHT_DECAY: 1e-7 52 | WARMUP_EPOCHS: 5.0 53 | WARMUP_START_LR: 0.000125 54 | OPTIMIZING_METHOD: sgd 55 | MODEL: 56 | NUM_CLASSES: 80 57 | ARCH: slow 58 | MODEL_NAME: ResNet 59 | LOSS_FUNC: bce 60 | DROPOUT_RATE: 0.5 61 | HEAD_ACT: sigmoid 62 | TEST: 63 | ENABLE: True 64 | DATASET: ava 65 | BATCH_SIZE: 8 66 | DATA_LOADER: 67 | NUM_WORKERS: 2 68 | PIN_MEMORY: True 69 | NUM_GPUS: 8 70 | NUM_SHARDS: 1 71 | RNG_SEED: 0 72 | OUTPUT_DIR: . 73 | -------------------------------------------------------------------------------- /configs/AVA/c2/SLOWFAST_32x2_R101_50_50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: ava 4 | BATCH_SIZE: 16 5 | EVAL_PERIOD: 1 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to pretrain model 9 | CHECKPOINT_TYPE: pytorch 10 | DATA: 11 | NUM_FRAMES: 32 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | DETECTION: 18 | ENABLE: True 19 | ALIGNED: False 20 | AVA: 21 | BGR: False 22 | DETECTION_SCORE_THRESH: 0.8 23 | TEST_PREDICT_BOX_LISTS: ["person_box_67091280_iou90/ava_detection_val_boxes_and_labels.csv"] 24 | SLOWFAST: 25 | ALPHA: 4 26 | BETA_INV: 8 27 | FUSION_CONV_CHANNEL_RATIO: 2 28 | FUSION_KERNEL_SZ: 5 29 | RESNET: 30 | ZERO_INIT_FINAL_BN: True 31 | WIDTH_PER_GROUP: 64 32 | NUM_GROUPS: 1 33 | DEPTH: 101 34 | TRANS_FUNC: bottleneck_transform 35 | STRIDE_1X1: False 36 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 37 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [2, 2]] 38 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [1, 1]] 39 | NONLOCAL: 40 | LOCATION: [[[], []], [[], []], [[6, 13, 20], []], [[], []]] 41 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 42 | INSTANTIATION: dot_product 43 | POOL: [[[2, 2, 2], [2, 2, 2]], [[2, 2, 2], [2, 2, 2]], [[2, 2, 2], [2, 2, 2]], [[2, 2, 2], [2, 2, 2]]] 44 | BN: 45 | USE_PRECISE_STATS: False 46 | NUM_BATCHES_PRECISE: 200 47 | SOLVER: 48 | MOMENTUM: 0.9 49 | WEIGHT_DECAY: 1e-7 50 | OPTIMIZING_METHOD: sgd 51 | MODEL: 52 | NUM_CLASSES: 80 53 | ARCH: slowfast 54 | MODEL_NAME: SlowFast 55 | LOSS_FUNC: bce 56 | DROPOUT_RATE: 0.5 57 | HEAD_ACT: sigmoid 58 | TEST: 59 | ENABLE: True 60 | DATASET: ava 61 | BATCH_SIZE: 8 62 | DATA_LOADER: 63 | 
NUM_WORKERS: 2 64 | PIN_MEMORY: True 65 | NUM_GPUS: 8 66 | NUM_SHARDS: 1 67 | RNG_SEED: 0 68 | OUTPUT_DIR: . 69 | -------------------------------------------------------------------------------- /configs/AVA/c2/SLOWFAST_32x2_R101_50_50_v2.1.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: ava 4 | BATCH_SIZE: 16 5 | EVAL_PERIOD: 1 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to pretrain model 9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 32 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | DETECTION: 18 | ENABLE: True 19 | ALIGNED: False 20 | AVA: 21 | BGR: False 22 | DETECTION_SCORE_THRESH: 0.8 23 | TEST_PREDICT_BOX_LISTS: ["person_box_67091280_iou90/ava_detection_val_boxes_and_labels.csv"] 24 | TRAIN_GT_BOX_LISTS: ["ava_train_v2.1.csv"] 25 | LABEL_MAP_FILE: ava_action_list_v2.1_for_activitynet_2018.pbtxt 26 | EXCLUSION_FILE: ava_val_excluded_timestamps_v2.1.csv 27 | GROUNDTRUTH_FILE: ava_val_v2.1.csv 28 | SLOWFAST: 29 | ALPHA: 4 30 | BETA_INV: 8 31 | FUSION_CONV_CHANNEL_RATIO: 2 32 | FUSION_KERNEL_SZ: 5 33 | RESNET: 34 | ZERO_INIT_FINAL_BN: True 35 | WIDTH_PER_GROUP: 64 36 | NUM_GROUPS: 1 37 | DEPTH: 101 38 | TRANS_FUNC: bottleneck_transform 39 | STRIDE_1X1: False 40 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 41 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [2, 2]] 42 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [1, 1]] 43 | NONLOCAL: 44 | LOCATION: [[[], []], [[], []], [[6, 13, 20], []], [[], []]] 45 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 46 | INSTANTIATION: dot_product 47 | POOL: [[[2, 2, 2], [2, 2, 2]], [[2, 2, 2], [2, 2, 2]], [[2, 2, 2], [2, 2, 2]], [[2, 2, 2], [2, 2, 2]]] 48 | BN: 49 | USE_PRECISE_STATS: False 50 | NUM_BATCHES_PRECISE: 200 51 | SOLVER: 52 | MOMENTUM: 0.9 53 | WEIGHT_DECAY: 1e-7 54 | OPTIMIZING_METHOD: sgd 55 | MODEL: 56 | NUM_CLASSES: 80 57 | ARCH: slowfast 58 | MODEL_NAME: SlowFast 59 | LOSS_FUNC: bce 60 | DROPOUT_RATE: 0.5 61 | HEAD_ACT: sigmoid 62 | TEST: 63 | ENABLE: True 64 | DATASET: ava 65 | BATCH_SIZE: 8 66 | DATA_LOADER: 67 | NUM_WORKERS: 2 68 | PIN_MEMORY: True 69 | NUM_GPUS: 8 70 | NUM_SHARDS: 1 71 | RNG_SEED: 0 72 | OUTPUT_DIR: . 
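# Usage sketch (an assumption following GETTING_STARTED.md; the path is a
# placeholder, not a real file): since TRAIN.ENABLE is False, this c2 config is
# meant for evaluation only, e.g.:
#   python tools/run_net.py --cfg configs/AVA/c2/SLOWFAST_32x2_R101_50_50_v2.1.yaml \
#     TEST.CHECKPOINT_FILE_PATH path_to_the_model_zoo_checkpoint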
73 | -------------------------------------------------------------------------------- /configs/AVA/c2/SLOWFAST_32x2_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: ava 4 | BATCH_SIZE: 16 5 | EVAL_PERIOD: 1 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to pretrain model 9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 32 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | DETECTION: 18 | ENABLE: True 19 | ALIGNED: False 20 | AVA: 21 | BGR: False 22 | DETECTION_SCORE_THRESH: 0.8 23 | TEST_PREDICT_BOX_LISTS: ["person_box_67091280_iou90/ava_detection_val_boxes_and_labels.csv"] 24 | SLOWFAST: 25 | ALPHA: 4 26 | BETA_INV: 8 27 | FUSION_CONV_CHANNEL_RATIO: 2 28 | FUSION_KERNEL_SZ: 7 29 | RESNET: 30 | ZERO_INIT_FINAL_BN: True 31 | WIDTH_PER_GROUP: 64 32 | NUM_GROUPS: 1 33 | DEPTH: 50 34 | TRANS_FUNC: bottleneck_transform 35 | STRIDE_1X1: False 36 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 37 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [2, 2]] 38 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [1, 1]] 39 | NONLOCAL: 40 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 41 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 42 | INSTANTIATION: dot_product 43 | POOL: [[[1, 2, 2], [1, 2, 2]], [[1, 2, 2], [1, 2, 2]], [[1, 2, 2], [1, 2, 2]], [[1, 2, 2], [1, 2, 2]]] 44 | BN: 45 | USE_PRECISE_STATS: False 46 | NUM_BATCHES_PRECISE: 200 47 | SOLVER: 48 | MOMENTUM: 0.9 49 | WEIGHT_DECAY: 1e-7 50 | OPTIMIZING_METHOD: sgd 51 | MODEL: 52 | NUM_CLASSES: 80 53 | ARCH: slowfast 54 | MODEL_NAME: SlowFast 55 | LOSS_FUNC: bce 56 | DROPOUT_RATE: 0.5 57 | HEAD_ACT: sigmoid 58 | TEST: 59 | ENABLE: True 60 | DATASET: ava 61 | BATCH_SIZE: 8 62 | DATA_LOADER: 63 | NUM_WORKERS: 2 64 | PIN_MEMORY: True 65 | NUM_GPUS: 8 66 | NUM_SHARDS: 1 67 | RNG_SEED: 0 68 | OUTPUT_DIR: . 
69 | -------------------------------------------------------------------------------- /configs/AVA/c2/SLOWFAST_64x2_R101_50_50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: ava 4 | BATCH_SIZE: 16 5 | EVAL_PERIOD: 1 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to pretrain model 9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 64 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | DETECTION: 18 | ENABLE: True 19 | ALIGNED: False 20 | AVA: 21 | BGR: False 22 | DETECTION_SCORE_THRESH: 0.8 23 | TEST_PREDICT_BOX_LISTS: ["person_box_67091280_iou90/ava_detection_val_boxes_and_labels.csv"] 24 | SLOWFAST: 25 | ALPHA: 4 26 | BETA_INV: 8 27 | FUSION_CONV_CHANNEL_RATIO: 2 28 | FUSION_KERNEL_SZ: 5 29 | RESNET: 30 | ZERO_INIT_FINAL_BN: True 31 | WIDTH_PER_GROUP: 64 32 | NUM_GROUPS: 1 33 | DEPTH: 101 34 | TRANS_FUNC: bottleneck_transform 35 | STRIDE_1X1: False 36 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 37 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [2, 2]] 38 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [1, 1]] 39 | NONLOCAL: 40 | LOCATION: [[[], []], [[], []], [[6, 13, 20], []], [[], []]] 41 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 42 | INSTANTIATION: dot_product 43 | POOL: [[[2, 2, 2], [2, 2, 2]], [[2, 2, 2], [2, 2, 2]], [[2, 2, 2], [2, 2, 2]], [[2, 2, 2], [2, 2, 2]]] 44 | BN: 45 | USE_PRECISE_STATS: False 46 | NUM_BATCHES_PRECISE: 200 47 | SOLVER: 48 | MOMENTUM: 0.9 49 | WEIGHT_DECAY: 1e-7 50 | OPTIMIZING_METHOD: sgd 51 | MODEL: 52 | NUM_CLASSES: 80 53 | ARCH: slowfast 54 | MODEL_NAME: SlowFast 55 | LOSS_FUNC: bce 56 | DROPOUT_RATE: 0.5 57 | HEAD_ACT: sigmoid 58 | TEST: 59 | ENABLE: True 60 | DATASET: ava 61 | BATCH_SIZE: 8 62 | DATA_LOADER: 63 | NUM_WORKERS: 0 64 | PIN_MEMORY: True 65 | NUM_GPUS: 8 66 | NUM_SHARDS: 1 67 | RNG_SEED: 0 68 | OUTPUT_DIR: . 
69 | -------------------------------------------------------------------------------- /configs/AVA/c2/SLOW_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: ava 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 1 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to pretrain model 9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 4 12 | SAMPLING_RATE: 16 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3] 17 | DETECTION: 18 | ENABLE: True 19 | ALIGNED: False 20 | AVA: 21 | BGR: False 22 | DETECTION_SCORE_THRESH: 0.9 23 | TEST_PREDICT_BOX_LISTS: ["person_box_67091280_iou75/ava_detection_val_boxes_and_labels.csv"] 24 | RESNET: 25 | ZERO_INIT_FINAL_BN: True 26 | WIDTH_PER_GROUP: 64 27 | NUM_GROUPS: 1 28 | DEPTH: 50 29 | TRANS_FUNC: bottleneck_transform 30 | STRIDE_1X1: False 31 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 32 | SPATIAL_DILATIONS: [[1], [1], [1], [2]] 33 | SPATIAL_STRIDES: [[1], [2], [2], [1]] 34 | NONLOCAL: 35 | LOCATION: [[[]], [[]], [[]], [[]]] 36 | GROUP: [[1], [1], [1], [1]] 37 | INSTANTIATION: softmax 38 | BN: 39 | USE_PRECISE_STATS: False 40 | NUM_BATCHES_PRECISE: 200 41 | SOLVER: 42 | MOMENTUM: 0.9 43 | WEIGHT_DECAY: 1e-7 44 | OPTIMIZING_METHOD: sgd 45 | MODEL: 46 | NUM_CLASSES: 80 47 | ARCH: slow 48 | MODEL_NAME: ResNet 49 | LOSS_FUNC: bce 50 | DROPOUT_RATE: 0.5 51 | HEAD_ACT: sigmoid 52 | TEST: 53 | ENABLE: True 54 | DATASET: ava 55 | BATCH_SIZE: 8 56 | DATA_LOADER: 57 | NUM_WORKERS: 2 58 | PIN_MEMORY: True 59 | NUM_GPUS: 8 60 | NUM_SHARDS: 1 61 | RNG_SEED: 0 62 | OUTPUT_DIR: . 63 | -------------------------------------------------------------------------------- /configs/Charades/SLOWFAST_16x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: charades 4 | BATCH_SIZE: 16 5 | EVAL_PERIOD: 6 6 | CHECKPOINT_PERIOD: 6 7 | AUTO_RESUME: True 8 | CHECKPOINT_FILE_PATH: SLOWFAST_8x8_R50.pkl # please download from the model zoo. 
9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 64 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 340] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | MULTI_LABEL: True 18 | INV_UNIFORM_SAMPLE: True 19 | ENSEMBLE_METHOD: max 20 | REVERSE_INPUT_CHANNEL: True 21 | SLOWFAST: 22 | ALPHA: 4 23 | BETA_INV: 8 24 | FUSION_CONV_CHANNEL_RATIO: 2 25 | FUSION_KERNEL_SZ: 7 26 | RESNET: 27 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 28 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 29 | ZERO_INIT_FINAL_BN: True 30 | WIDTH_PER_GROUP: 64 31 | NUM_GROUPS: 1 32 | DEPTH: 50 33 | TRANS_FUNC: bottleneck_transform 34 | STRIDE_1X1: False 35 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 36 | NONLOCAL: 37 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 38 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 39 | INSTANTIATION: dot_product 40 | BN: 41 | USE_PRECISE_STATS: True 42 | NUM_BATCHES_PRECISE: 200 43 | NORM_TYPE: sync_batchnorm 44 | NUM_SYNC_DEVICES: 4 45 | SOLVER: 46 | BASE_LR: 0.0375 47 | LR_POLICY: steps_with_relative_lrs 48 | LRS: [1, 0.1, 0.01, 0.001, 0.0001, 0.00001] 49 | STEPS: [0, 41, 49] 50 | MAX_EPOCH: 57 51 | MOMENTUM: 0.9 52 | WEIGHT_DECAY: 1e-4 53 | WARMUP_EPOCHS: 4.0 54 | WARMUP_START_LR: 0.0001 55 | OPTIMIZING_METHOD: sgd 56 | MODEL: 57 | NUM_CLASSES: 157 58 | ARCH: slowfast 59 | LOSS_FUNC: bce_logit 60 | HEAD_ACT: sigmoid 61 | DROPOUT_RATE: 0.5 62 | TEST: 63 | ENABLE: True 64 | DATASET: charades 65 | BATCH_SIZE: 16 66 | NUM_ENSEMBLE_VIEWS: 10 67 | NUM_SPATIAL_CROPS: 3 68 | DATA_LOADER: 69 | NUM_WORKERS: 8 70 | PIN_MEMORY: True 71 | NUM_GPUS: 8 72 | NUM_SHARDS: 1 73 | RNG_SEED: 0 74 | OUTPUT_DIR: . 75 | LOG_MODEL_INFO: False 76 | -------------------------------------------------------------------------------- /configs/Charades/SLOWFAST_16x8_R50_multigrid.yaml: -------------------------------------------------------------------------------- 1 | MULTIGRID: 2 | SHORT_CYCLE: True 3 | LONG_CYCLE: True 4 | TRAIN: 5 | ENABLE: True 6 | DATASET: charades 7 | BATCH_SIZE: 16 8 | EVAL_PERIOD: 6 9 | CHECKPOINT_PERIOD: 6 10 | AUTO_RESUME: True 11 | CHECKPOINT_FILE_PATH: SLOWFAST_8x8_R50.pkl # please download from the model zoo. 
12 | CHECKPOINT_TYPE: caffe2 13 | DATA: 14 | NUM_FRAMES: 64 15 | SAMPLING_RATE: 2 16 | TRAIN_JITTER_SCALES: [256, 340] 17 | TRAIN_CROP_SIZE: 224 18 | TEST_CROP_SIZE: 256 19 | INPUT_CHANNEL_NUM: [3, 3] 20 | MULTI_LABEL: True 21 | INV_UNIFORM_SAMPLE: True 22 | ENSEMBLE_METHOD: max 23 | REVERSE_INPUT_CHANNEL: True 24 | SLOWFAST: 25 | ALPHA: 4 26 | BETA_INV: 8 27 | FUSION_CONV_CHANNEL_RATIO: 2 28 | FUSION_KERNEL_SZ: 7 29 | RESNET: 30 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 31 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 32 | ZERO_INIT_FINAL_BN: True 33 | WIDTH_PER_GROUP: 64 34 | NUM_GROUPS: 1 35 | DEPTH: 50 36 | TRANS_FUNC: bottleneck_transform 37 | STRIDE_1X1: False 38 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 39 | NONLOCAL: 40 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 41 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 42 | INSTANTIATION: dot_product 43 | BN: 44 | USE_PRECISE_STATS: True 45 | NUM_BATCHES_PRECISE: 200 46 | NORM_TYPE: sync_batchnorm 47 | NUM_SYNC_DEVICES: 4 48 | SOLVER: 49 | BASE_LR: 0.0375 50 | LR_POLICY: steps_with_relative_lrs 51 | LRS: [1, 0.1, 0.01, 0.001, 0.0001, 0.00001] 52 | STEPS: [0, 41, 49] 53 | MAX_EPOCH: 57 54 | MOMENTUM: 0.9 55 | WEIGHT_DECAY: 1e-4 56 | WARMUP_EPOCHS: 4.0 57 | WARMUP_START_LR: 0.0001 58 | OPTIMIZING_METHOD: sgd 59 | MODEL: 60 | NUM_CLASSES: 157 61 | ARCH: slowfast 62 | LOSS_FUNC: bce_logit 63 | HEAD_ACT: sigmoid 64 | DROPOUT_RATE: 0.5 65 | TEST: 66 | ENABLE: True 67 | DATASET: charades 68 | BATCH_SIZE: 16 69 | NUM_ENSEMBLE_VIEWS: 10 70 | NUM_SPATIAL_CROPS: 3 71 | DATA_LOADER: 72 | NUM_WORKERS: 8 73 | PIN_MEMORY: True 74 | NUM_GPUS: 8 75 | NUM_SHARDS: 1 76 | RNG_SEED: 0 77 | OUTPUT_DIR: . 78 | LOG_MODEL_INFO: False 79 | -------------------------------------------------------------------------------- /configs/Kinetics/AVSLOWFAST_4x16_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: ../../data/output/checkpoints/avslowfast.pth 9 | # CHECKPOINT_TYPE: pytorch # caffe2 or pytorch 10 | DATA: 11 | USE_BGR_ORDER: False # False 12 | NUM_FRAMES: 32 13 | SAMPLING_RATE: 2 14 | TRAIN_JITTER_SCALES: [256, 320] 15 | TRAIN_CROP_SIZE: 224 16 | TEST_CROP_SIZE: 256 17 | INPUT_CHANNEL_NUM: [3, 3, 1] 18 | USE_AUDIO: True 19 | GET_MISALIGNED_AUDIO: True 20 | AUDIO_SAMPLE_RATE: 16000 21 | AUDIO_WIN_SZ: 32 22 | AUDIO_STEP_SZ: 16 23 | AUDIO_FRAME_NUM: 128 24 | AUDIO_MEL_NUM: 80 25 | AUDIO_MISALIGNED_GAP: 32 # half second 26 | LOGMEL_MEAN: -7.03 # -7.03, -24.227 27 | LOGMEL_STD: 4.66 # 4.66, 1.0 28 | EASY_NEG_RATIO: 0.75 29 | MIX_NEG_EPOCH: 96 30 | SLOWFAST: 31 | ALPHA: 8 32 | BETA_INV: 8 33 | FUSION_CONV_CHANNEL_RATIO: 2 34 | FUSION_KERNEL_SZ: 5 35 | AU_ALPHA: 32 36 | AU_BETA_INV: 2 37 | AU_FUSION_CONV_CHANNEL_MODE: ByDim # ByDim, ByRatio 38 | AU_FUSION_CONV_CHANNEL_RATIO: 0.25 39 | AU_FUSION_CONV_CHANNEL_DIM: 64 40 | AU_FUSION_KERNEL_SZ: 5 41 | AU_FUSION_CONV_NUM: 2 42 | AU_REDUCE_TF_DIM: True 43 | FS_FUSION: [False, False, True, True] 44 | AFS_FUSION: [False, False, True, True] 45 | AVS_FLAG: [False, False, True, True, True] 46 | AVS_PROJ_DIM: 64 47 | AVS_VAR_THRESH: 0.01 48 | AVS_DUPLICATE_THRESH: 0.99999 49 | DROPPATHWAY_RATE: 0.8 # 0.8 50 | RESNET: 51 | ZERO_INIT_FINAL_BN: True 52 | WIDTH_PER_GROUP: 64 53 | NUM_GROUPS: 1 54 | DEPTH: 50 55 | TRANS_FUNC: bottleneck_transform 56 | AUDIO_TRANS_FUNC: tf_bottleneck_transform_v1 57 
| AUDIO_TRANS_NUM: 2 58 | STRIDE_1X1: False 59 | # 18: [[2, 2, 2], [2, 2, 2], [2, 2, 2], [2, 2, 2]] 60 | # 34: [[3, 3, 3], [4, 4, 4], [6, 6, 6], [3, 3, 3]] 61 | # 50: [[3, 3, 3], [4, 4, 4], [6, 6, 6], [3, 3, 3]] 62 | # 101: [[3, 3, 3], [4, 4, 4], [23, 23, 23], [3, 3, 3]] 63 | # 152: [[3, 3, 3], [8, 8, 8], [36, 36, 36], [3, 3, 3]] 64 | NUM_BLOCK_TEMP_KERNEL: [[3, 3, 3], [4, 4, 4], [6, 6, 6], [3, 3, 3]] 65 | SPATIAL_DILATIONS: [[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]] 66 | NONLOCAL: 67 | LOCATION: [[[], [], []], [[], [], []], [[], [], []], [[], [], []]] 68 | GROUP: [[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]] 69 | POOL: [ 70 | [[1, 2, 2], [1, 2, 2], [1, 2, 2]], 71 | [[1, 2, 2], [1, 2, 2], [1, 2, 2]], 72 | [[1, 2, 2], [1, 2, 2], [1, 2, 2]], 73 | [[1, 2, 2], [1, 2, 2], [1, 2, 2]], 74 | ] 75 | INSTANTIATION: dot_product 76 | BN: 77 | USE_PRECISE_STATS: True 78 | NUM_BATCHES_PRECISE: 200 79 | MOMENTUM: 0.1 80 | WEIGHT_DECAY: 0.0 81 | SOLVER: 82 | BASE_LR: 0.1 # 0.1 83 | LR_POLICY: cosine 84 | MAX_EPOCH: 196 85 | MOMENTUM: 0.9 86 | WEIGHT_DECAY: 1e-4 87 | WARMUP_EPOCHS: 34.0 # 34.0 88 | WARMUP_START_LR: 0.01 # 0.01 89 | OPTIMIZING_METHOD: sgd 90 | MODEL: 91 | NUM_CLASSES: 400 92 | MODEL_NAME: AVSlowFast 93 | ARCH: avslowfast 94 | LOSS_FUNC: cross_entropy 95 | DROPOUT_RATE: 0.5 96 | TEST: 97 | ENABLE: True 98 | DATASET: kinetics 99 | BATCH_SIZE: 64 100 | # CHECKPOINT_FILE_PATH: ../../data/output/checkpoints/avslowfast.pth 101 | # CHECKPOINT_TYPE: pytorch # caffe2 or pytorch 102 | DATA_LOADER: 103 | NUM_WORKERS: 8 # 8 104 | PIN_MEMORY: True 105 | NUM_GPUS: 8 106 | NUM_SHARDS: 1 107 | RNG_SEED: 0 108 | OUTPUT_DIR: ./output/AVSlowFast-R50-4x16 109 | -------------------------------------------------------------------------------- /configs/Kinetics/AVSLOWFAST_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: ../../data/output/checkpoints/avslowfast.pth 9 | # CHECKPOINT_TYPE: pytorch # caffe2 or pytorch 10 | DATA: 11 | USE_BGR_ORDER: False # False 12 | NUM_FRAMES: 32 13 | SAMPLING_RATE: 2 14 | TRAIN_JITTER_SCALES: [256, 320] 15 | TRAIN_CROP_SIZE: 224 16 | TEST_CROP_SIZE: 256 17 | INPUT_CHANNEL_NUM: [3, 3, 1] 18 | USE_AUDIO: True 19 | GET_MISALIGNED_AUDIO: True 20 | AUDIO_SAMPLE_RATE: 16000 21 | AUDIO_WIN_SZ: 32 22 | AUDIO_STEP_SZ: 16 23 | AUDIO_FRAME_NUM: 128 24 | AUDIO_MEL_NUM: 80 25 | AUDIO_MISALIGNED_GAP: 32 # half second 26 | LOGMEL_MEAN: -7.03 # -7.03, -24.227 27 | LOGMEL_STD: 4.66 # 4.66, 1.0 28 | EASY_NEG_RATIO: 0.75 29 | MIX_NEG_EPOCH: 96 30 | SLOWFAST: 31 | ALPHA: 4 32 | BETA_INV: 8 33 | FUSION_CONV_CHANNEL_RATIO: 2 34 | FUSION_KERNEL_SZ: 7 35 | AU_ALPHA: 16 36 | AU_BETA_INV: 2 37 | AU_FUSION_CONV_CHANNEL_MODE: ByDim # ByDim, ByRatio 38 | AU_FUSION_CONV_CHANNEL_RATIO: 0.25 39 | AU_FUSION_CONV_CHANNEL_DIM: 64 40 | AU_FUSION_KERNEL_SZ: 5 41 | AU_FUSION_CONV_NUM: 2 42 | AU_REDUCE_TF_DIM: True 43 | FS_FUSION: [False, False, True, True] 44 | AFS_FUSION: [False, False, True, True] 45 | AVS_FLAG: [False, False, True, True, True] 46 | AVS_PROJ_DIM: 64 47 | AVS_VAR_THRESH: 0.01 48 | AVS_DUPLICATE_THRESH: 0.99999 49 | DROPPATHWAY_RATE: 0.8 # 0.8 50 | RESNET: 51 | ZERO_INIT_FINAL_BN: True 52 | WIDTH_PER_GROUP: 64 53 | NUM_GROUPS: 1 54 | DEPTH: 50 55 | TRANS_FUNC: bottleneck_transform 56 | AUDIO_TRANS_FUNC: tf_bottleneck_transform_v1 57 | AUDIO_TRANS_NUM: 2 58 | STRIDE_1X1: False 59 | # 18: [[2, 2, 
2], [2, 2, 2], [2, 2, 2], [2, 2, 2]] 60 | # 34: [[3, 3, 3], [4, 4, 4], [6, 6, 6], [3, 3, 3]] 61 | # 50: [[3, 3, 3], [4, 4, 4], [6, 6, 6], [3, 3, 3]] 62 | # 101: [[3, 3, 3], [4, 4, 4], [23, 23, 23], [3, 3, 3]] 63 | # 152: [[3, 3, 3], [8, 8, 8], [36, 36, 36], [3, 3, 3]] 64 | NUM_BLOCK_TEMP_KERNEL: [[3, 3, 3], [4, 4, 4], [6, 6, 6], [3, 3, 3]] 65 | SPATIAL_DILATIONS: [[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]] 66 | NONLOCAL: 67 | LOCATION: [[[], [], []], [[], [], []], [[], [], []], [[], [], []]] 68 | GROUP: [[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]] 69 | POOL: [ 70 | [[1, 2, 2], [1, 2, 2], [1, 2, 2]], 71 | [[1, 2, 2], [1, 2, 2], [1, 2, 2]], 72 | [[1, 2, 2], [1, 2, 2], [1, 2, 2]], 73 | [[1, 2, 2], [1, 2, 2], [1, 2, 2]], 74 | ] 75 | INSTANTIATION: dot_product 76 | BN: 77 | USE_PRECISE_STATS: True 78 | NUM_BATCHES_PRECISE: 400 79 | MOMENTUM: 0.1 80 | WEIGHT_DECAY: 0.0 81 | SOLVER: 82 | BASE_LR: 0.1 # 0.1 83 | LR_POLICY: cosine 84 | MAX_EPOCH: 196 85 | MOMENTUM: 0.9 86 | WEIGHT_DECAY: 1e-4 87 | WARMUP_EPOCHS: 34.0 # 34.0 88 | WARMUP_START_LR: 0.01 # 0.01 89 | OPTIMIZING_METHOD: sgd 90 | MODEL: 91 | NUM_CLASSES: 400 92 | MODEL_NAME: AVSlowFast 93 | ARCH: avslowfast 94 | LOSS_FUNC: cross_entropy 95 | DROPOUT_RATE: 0.5 96 | TEST: 97 | ENABLE: True 98 | DATASET: kinetics 99 | BATCH_SIZE: 32 100 | # CHECKPOINT_FILE_PATH: ../../data/output/checkpoints/avslowfast.pth 101 | # CHECKPOINT_TYPE: pytorch # caffe2 or pytorch 102 | DATA_LOADER: 103 | NUM_WORKERS: 8 # 8 104 | PIN_MEMORY: True 105 | NUM_GPUS: 8 106 | NUM_SHARDS: 1 107 | RNG_SEED: 0 108 | OUTPUT_DIR: ./output/AVSlowFast-R50-8x8 109 | -------------------------------------------------------------------------------- /configs/Kinetics/C2D_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 8 10 | SAMPLING_RATE: 8 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 256 14 | INPUT_CHANNEL_NUM: [3] 15 | RESNET: 16 | ZERO_INIT_FINAL_BN: True 17 | WIDTH_PER_GROUP: 64 18 | NUM_GROUPS: 1 19 | DEPTH: 50 20 | TRANS_FUNC: bottleneck_transform 21 | STRIDE_1X1: False 22 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 23 | NONLOCAL: 24 | LOCATION: [[[]], [[]], [[]], [[]]] 25 | GROUP: [[1], [1], [1], [1]] 26 | INSTANTIATION: softmax 27 | BN: 28 | USE_PRECISE_STATS: True 29 | NUM_BATCHES_PRECISE: 200 30 | SOLVER: 31 | BASE_LR: 0.1 32 | LR_POLICY: cosine 33 | MAX_EPOCH: 196 34 | MOMENTUM: 0.9 35 | WEIGHT_DECAY: 1e-4 36 | WARMUP_EPOCHS: 34.0 37 | WARMUP_START_LR: 0.01 38 | OPTIMIZING_METHOD: sgd 39 | MODEL: 40 | NUM_CLASSES: 400 41 | ARCH: c2d 42 | MODEL_NAME: ResNet 43 | LOSS_FUNC: cross_entropy 44 | DROPOUT_RATE: 0.5 45 | TEST: 46 | ENABLE: True 47 | DATASET: kinetics 48 | BATCH_SIZE: 64 49 | DATA_LOADER: 50 | NUM_WORKERS: 8 51 | PIN_MEMORY: True 52 | NUM_GPUS: 8 53 | NUM_SHARDS: 1 54 | RNG_SEED: 0 55 | OUTPUT_DIR: . 
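# Naming note: the "8x8" in this config's name is frame length x sample rate,
# i.e. DATA.NUM_FRAMES x DATA.SAMPLING_RATE above (see the "frame length x
# sample rate" column in MODEL_ZOO.md); the same convention applies to the
# other Kinetics configs.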
56 | -------------------------------------------------------------------------------- /configs/Kinetics/C2D_8x8_R50_IN1K.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | CHECKPOINT_FILE_PATH: /mnt/vol/gfsai-bistro2-east/ai-group/bistro/gpu/haoqifan/pySlowFastModelZoo/imagenet50_pretrain.pyth 9 | CHECKPOINT_INFLATE: True 10 | DATA: 11 | NUM_FRAMES: 8 12 | SAMPLING_RATE: 8 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3] 17 | PATH_TO_DATA_DIR: /mnt/vol/gfsai-east/ai-group/users/haoqifan/kinetics/alllist/py_slowfast 18 | RESNET: 19 | ZERO_INIT_FINAL_BN: True 20 | WIDTH_PER_GROUP: 64 21 | NUM_GROUPS: 1 22 | DEPTH: 50 23 | TRANS_FUNC: bottleneck_transform 24 | STRIDE_1X1: False 25 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 26 | NONLOCAL: 27 | LOCATION: [[[]], [[]], [[]], [[]]] 28 | GROUP: [[1], [1], [1], [1]] 29 | INSTANTIATION: softmax 30 | BN: 31 | USE_PRECISE_STATS: True 32 | NUM_BATCHES_PRECISE: 200 33 | SOLVER: 34 | BASE_LR: 0.01 35 | LR_POLICY: steps_with_relative_lrs 36 | STEPS: [0, 44, 88, 118] 37 | LRS: [1, 0.1, 0.01, 0.001] 38 | MAX_EPOCH: 118 39 | MOMENTUM: 0.9 40 | WEIGHT_DECAY: 1e-4 41 | OPTIMIZING_METHOD: sgd 42 | MODEL: 43 | NUM_CLASSES: 400 44 | ARCH: c2d 45 | MODEL_NAME: ResNet 46 | LOSS_FUNC: cross_entropy 47 | DROPOUT_RATE: 0.5 48 | TEST: 49 | ENABLE: True 50 | DATASET: kinetics 51 | BATCH_SIZE: 64 52 | DATA_LOADER: 53 | NUM_WORKERS: 8 54 | PIN_MEMORY: True 55 | NUM_GPUS: 8 56 | NUM_SHARDS: 1 57 | RNG_SEED: 0 58 | OUTPUT_DIR: . 59 | -------------------------------------------------------------------------------- /configs/Kinetics/C2D_NLN_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 8 10 | SAMPLING_RATE: 8 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 256 14 | INPUT_CHANNEL_NUM: [3] 15 | RESNET: 16 | ZERO_INIT_FINAL_BN: True 17 | WIDTH_PER_GROUP: 64 18 | NUM_GROUPS: 1 19 | DEPTH: 50 20 | TRANS_FUNC: bottleneck_transform 21 | STRIDE_1X1: False 22 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 23 | NONLOCAL: 24 | LOCATION: [[[]], [[1, 3]], [[1, 3, 5]], [[]]] 25 | GROUP: [[1], [1], [1], [1]] 26 | INSTANTIATION: softmax 27 | BN: 28 | USE_PRECISE_STATS: True 29 | NUM_BATCHES_PRECISE: 200 30 | SOLVER: 31 | BASE_LR: 0.1 32 | LR_POLICY: cosine 33 | MAX_EPOCH: 196 34 | MOMENTUM: 0.9 35 | WEIGHT_DECAY: 1e-4 36 | WARMUP_EPOCHS: 34.0 37 | WARMUP_START_LR: 0.01 38 | OPTIMIZING_METHOD: sgd 39 | MODEL: 40 | NUM_CLASSES: 400 41 | ARCH: c2d 42 | MODEL_NAME: ResNet 43 | LOSS_FUNC: cross_entropy 44 | DROPOUT_RATE: 0.5 45 | TEST: 46 | ENABLE: True 47 | DATASET: kinetics 48 | BATCH_SIZE: 64 49 | DATA_LOADER: 50 | NUM_WORKERS: 8 51 | PIN_MEMORY: True 52 | NUM_GPUS: 8 53 | NUM_SHARDS: 1 54 | RNG_SEED: 0 55 | OUTPUT_DIR: . 
56 | -------------------------------------------------------------------------------- /configs/Kinetics/C2D_NLN_8x8_R50_IN1K.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | CHECKPOINT_FILE_PATH: /mnt/vol/gfsai-bistro2-east/ai-group/bistro/gpu/haoqifan/pySlowFastModelZoo/imagenet50_pretrain.pyth 9 | CHECKPOINT_INFLATE: True 10 | DATA: 11 | NUM_FRAMES: 8 12 | SAMPLING_RATE: 8 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3] 17 | PATH_TO_DATA_DIR: /mnt/vol/gfsai-east/ai-group/users/haoqifan/kinetics/alllist/py_slowfast 18 | RESNET: 19 | ZERO_INIT_FINAL_BN: True 20 | WIDTH_PER_GROUP: 64 21 | NUM_GROUPS: 1 22 | DEPTH: 50 23 | TRANS_FUNC: bottleneck_transform 24 | STRIDE_1X1: False 25 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 26 | NONLOCAL: 27 | LOCATION: [[[]], [[1, 3]], [[1, 3, 5]], [[]]] 28 | GROUP: [[1], [1], [1], [1]] 29 | INSTANTIATION: softmax 30 | BN: 31 | USE_PRECISE_STATS: True 32 | NUM_BATCHES_PRECISE: 200 33 | SOLVER: 34 | BASE_LR: 0.01 35 | LR_POLICY: steps_with_relative_lrs 36 | STEPS: [0, 44, 88, 118] 37 | LRS: [1, 0.1, 0.01, 0.001] 38 | MAX_EPOCH: 118 39 | MOMENTUM: 0.9 40 | WEIGHT_DECAY: 1e-4 41 | OPTIMIZING_METHOD: sgd 42 | MODEL: 43 | NUM_CLASSES: 400 44 | ARCH: c2d 45 | MODEL_NAME: ResNet 46 | LOSS_FUNC: cross_entropy 47 | DROPOUT_RATE: 0.5 48 | TEST: 49 | ENABLE: True 50 | DATASET: kinetics 51 | BATCH_SIZE: 64 52 | DATA_LOADER: 53 | NUM_WORKERS: 8 54 | PIN_MEMORY: True 55 | NUM_GPUS: 8 56 | NUM_SHARDS: 1 57 | RNG_SEED: 0 58 | OUTPUT_DIR: . 59 | -------------------------------------------------------------------------------- /configs/Kinetics/I3D_8x8_R101.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 8 10 | SAMPLING_RATE: 8 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 256 14 | INPUT_CHANNEL_NUM: [3] 15 | RESNET: 16 | ZERO_INIT_FINAL_BN: True 17 | WIDTH_PER_GROUP: 64 18 | NUM_GROUPS: 1 19 | DEPTH: 101 20 | TRANS_FUNC: bottleneck_transform 21 | STRIDE_1X1: False 22 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [23], [3]] 23 | NONLOCAL: 24 | LOCATION: [[[]], [[]], [[]], [[]]] 25 | GROUP: [[1], [1], [1], [1]] 26 | INSTANTIATION: softmax 27 | BN: 28 | USE_PRECISE_STATS: True 29 | NUM_BATCHES_PRECISE: 200 30 | SOLVER: 31 | BASE_LR: 0.1 32 | LR_POLICY: cosine 33 | MAX_EPOCH: 196 34 | MOMENTUM: 0.9 35 | WEIGHT_DECAY: 1e-4 36 | WARMUP_EPOCHS: 34.0 37 | WARMUP_START_LR: 0.01 38 | OPTIMIZING_METHOD: sgd 39 | MODEL: 40 | NUM_CLASSES: 400 41 | ARCH: i3d 42 | MODEL_NAME: ResNet 43 | LOSS_FUNC: cross_entropy 44 | DROPOUT_RATE: 0.5 45 | TEST: 46 | ENABLE: True 47 | DATASET: kinetics 48 | BATCH_SIZE: 64 49 | DATA_LOADER: 50 | NUM_WORKERS: 8 51 | PIN_MEMORY: True 52 | NUM_GPUS: 8 53 | NUM_SHARDS: 1 54 | RNG_SEED: 0 55 | OUTPUT_DIR: . 
56 | -------------------------------------------------------------------------------- /configs/Kinetics/I3D_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 8 10 | SAMPLING_RATE: 8 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 256 14 | INPUT_CHANNEL_NUM: [3] 15 | RESNET: 16 | ZERO_INIT_FINAL_BN: True 17 | WIDTH_PER_GROUP: 64 18 | NUM_GROUPS: 1 19 | DEPTH: 50 20 | TRANS_FUNC: bottleneck_transform 21 | STRIDE_1X1: False 22 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 23 | NONLOCAL: 24 | LOCATION: [[[]], [[]], [[]], [[]]] 25 | GROUP: [[1], [1], [1], [1]] 26 | INSTANTIATION: softmax 27 | BN: 28 | USE_PRECISE_STATS: True 29 | NUM_BATCHES_PRECISE: 200 30 | SOLVER: 31 | BASE_LR: 0.1 32 | LR_POLICY: cosine 33 | MAX_EPOCH: 196 34 | MOMENTUM: 0.9 35 | WEIGHT_DECAY: 1e-4 36 | WARMUP_EPOCHS: 34.0 37 | WARMUP_START_LR: 0.01 38 | OPTIMIZING_METHOD: sgd 39 | MODEL: 40 | NUM_CLASSES: 400 41 | ARCH: i3d 42 | MODEL_NAME: ResNet 43 | LOSS_FUNC: cross_entropy 44 | DROPOUT_RATE: 0.5 45 | TEST: 46 | ENABLE: True 47 | DATASET: kinetics 48 | BATCH_SIZE: 64 49 | DATA_LOADER: 50 | NUM_WORKERS: 8 51 | PIN_MEMORY: True 52 | NUM_GPUS: 8 53 | NUM_SHARDS: 1 54 | RNG_SEED: 0 55 | OUTPUT_DIR: . 56 | -------------------------------------------------------------------------------- /configs/Kinetics/I3D_8x8_R50_IN1K.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | CHECKPOINT_FILE_PATH: /mnt/vol/gfsai-bistro2-east/ai-group/bistro/gpu/haoqifan/pySlowFastModelZoo/imagenet50_pretrain.pyth 9 | CHECKPOINT_INFLATE: True 10 | DATA: 11 | NUM_FRAMES: 8 12 | SAMPLING_RATE: 8 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3] 17 | PATH_TO_DATA_DIR: /mnt/vol/gfsai-east/ai-group/users/haoqifan/kinetics/alllist/py_slowfast 18 | RESNET: 19 | ZERO_INIT_FINAL_BN: True 20 | WIDTH_PER_GROUP: 64 21 | NUM_GROUPS: 1 22 | DEPTH: 50 23 | TRANS_FUNC: bottleneck_transform 24 | STRIDE_1X1: False 25 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 26 | NONLOCAL: 27 | LOCATION: [[[]], [[]], [[]], [[]]] 28 | GROUP: [[1], [1], [1], [1]] 29 | INSTANTIATION: softmax 30 | BN: 31 | USE_PRECISE_STATS: True 32 | NUM_BATCHES_PRECISE: 200 33 | SOLVER: 34 | BASE_LR: 0.01 35 | LR_POLICY: steps_with_relative_lrs 36 | STEPS: [0, 44, 88, 118] 37 | LRS: [1, 0.1, 0.01, 0.001] 38 | MAX_EPOCH: 118 39 | MOMENTUM: 0.9 40 | WEIGHT_DECAY: 1e-4 41 | OPTIMIZING_METHOD: sgd 42 | MODEL: 43 | NUM_CLASSES: 400 44 | ARCH: i3d 45 | MODEL_NAME: ResNet 46 | LOSS_FUNC: cross_entropy 47 | DROPOUT_RATE: 0.5 48 | TEST: 49 | ENABLE: True 50 | DATASET: kinetics 51 | BATCH_SIZE: 64 52 | DATA_LOADER: 53 | NUM_WORKERS: 8 54 | PIN_MEMORY: True 55 | NUM_GPUS: 8 56 | NUM_SHARDS: 1 57 | RNG_SEED: 0 58 | OUTPUT_DIR: . 
59 | -------------------------------------------------------------------------------- /configs/Kinetics/I3D_NLN_8x8_R101.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 8 10 | SAMPLING_RATE: 8 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 256 14 | INPUT_CHANNEL_NUM: [3] 15 | RESNET: 16 | ZERO_INIT_FINAL_BN: True 17 | WIDTH_PER_GROUP: 64 18 | NUM_GROUPS: 1 19 | DEPTH: 101 20 | TRANS_FUNC: bottleneck_transform 21 | STRIDE_1X1: False 22 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [23], [3]] 23 | NONLOCAL: 24 | LOCATION: [[[]], [[1, 3]], [[1, 3, 5]], [[]]] 25 | GROUP: [[1], [1], [1], [1]] 26 | INSTANTIATION: softmax 27 | BN: 28 | USE_PRECISE_STATS: True 29 | NUM_BATCHES_PRECISE: 200 30 | SOLVER: 31 | BASE_LR: 0.1 32 | LR_POLICY: cosine 33 | MAX_EPOCH: 196 34 | MOMENTUM: 0.9 35 | WEIGHT_DECAY: 1e-4 36 | WARMUP_EPOCHS: 34.0 37 | WARMUP_START_LR: 0.01 38 | OPTIMIZING_METHOD: sgd 39 | MODEL: 40 | NUM_CLASSES: 400 41 | ARCH: i3d 42 | MODEL_NAME: ResNet 43 | LOSS_FUNC: cross_entropy 44 | DROPOUT_RATE: 0.5 45 | TEST: 46 | ENABLE: True 47 | DATASET: kinetics 48 | BATCH_SIZE: 64 49 | DATA_LOADER: 50 | NUM_WORKERS: 8 51 | PIN_MEMORY: True 52 | NUM_GPUS: 8 53 | NUM_SHARDS: 1 54 | RNG_SEED: 0 55 | OUTPUT_DIR: . 56 | -------------------------------------------------------------------------------- /configs/Kinetics/I3D_NLN_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 8 10 | SAMPLING_RATE: 8 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 256 14 | INPUT_CHANNEL_NUM: [3] 15 | RESNET: 16 | ZERO_INIT_FINAL_BN: True 17 | WIDTH_PER_GROUP: 64 18 | NUM_GROUPS: 1 19 | DEPTH: 50 20 | TRANS_FUNC: bottleneck_transform 21 | STRIDE_1X1: False 22 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 23 | NONLOCAL: 24 | LOCATION: [[[]], [[1, 3]], [[1, 3, 5]], [[]]] 25 | GROUP: [[1], [1], [1], [1]] 26 | INSTANTIATION: softmax 27 | BN: 28 | USE_PRECISE_STATS: True 29 | NUM_BATCHES_PRECISE: 200 30 | SOLVER: 31 | BASE_LR: 0.1 32 | LR_POLICY: cosine 33 | MAX_EPOCH: 196 34 | MOMENTUM: 0.9 35 | WEIGHT_DECAY: 1e-4 36 | WARMUP_EPOCHS: 34.0 37 | WARMUP_START_LR: 0.01 38 | OPTIMIZING_METHOD: sgd 39 | MODEL: 40 | NUM_CLASSES: 400 41 | ARCH: i3d 42 | MODEL_NAME: ResNet 43 | LOSS_FUNC: cross_entropy 44 | DROPOUT_RATE: 0.5 45 | TEST: 46 | ENABLE: True 47 | DATASET: kinetics 48 | BATCH_SIZE: 64 49 | DATA_LOADER: 50 | NUM_WORKERS: 8 51 | PIN_MEMORY: True 52 | NUM_GPUS: 8 53 | NUM_SHARDS: 1 54 | RNG_SEED: 0 55 | OUTPUT_DIR: . 
56 | -------------------------------------------------------------------------------- /configs/Kinetics/I3D_NLN_8x8_R50_IN1K.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | CHECKPOINT_FILE_PATH: /mnt/vol/gfsai-bistro2-east/ai-group/bistro/gpu/haoqifan/pySlowFastModelZoo/imagenet50_pretrain.pyth 9 | CHECKPOINT_INFLATE: True 10 | DATA: 11 | NUM_FRAMES: 8 12 | SAMPLING_RATE: 8 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3] 17 | PATH_TO_DATA_DIR: /mnt/vol/gfsai-east/ai-group/users/haoqifan/kinetics/alllist/py_slowfast 18 | RESNET: 19 | ZERO_INIT_FINAL_BN: True 20 | WIDTH_PER_GROUP: 64 21 | NUM_GROUPS: 1 22 | DEPTH: 50 23 | TRANS_FUNC: bottleneck_transform 24 | STRIDE_1X1: False 25 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 26 | NONLOCAL: 27 | LOCATION: [[[]], [[1, 3]], [[1, 3, 5]], [[]]] 28 | GROUP: [[1], [1], [1], [1]] 29 | INSTANTIATION: softmax 30 | BN: 31 | USE_PRECISE_STATS: True 32 | NUM_BATCHES_PRECISE: 200 33 | SOLVER: 34 | BASE_LR: 0.01 35 | LR_POLICY: steps_with_relative_lrs 36 | STEPS: [0, 44, 88, 118] 37 | LRS: [1, 0.1, 0.01, 0.001] 38 | MAX_EPOCH: 118 39 | MOMENTUM: 0.9 40 | WEIGHT_DECAY: 1e-4 41 | OPTIMIZING_METHOD: sgd 42 | MODEL: 43 | NUM_CLASSES: 400 44 | ARCH: i3d 45 | MODEL_NAME: ResNet 46 | LOSS_FUNC: cross_entropy 47 | DROPOUT_RATE: 0.5 48 | TEST: 49 | ENABLE: True 50 | DATASET: kinetics 51 | BATCH_SIZE: 64 52 | DATA_LOADER: 53 | NUM_WORKERS: 8 54 | PIN_MEMORY: True 55 | NUM_GPUS: 8 56 | NUM_SHARDS: 1 57 | RNG_SEED: 0 58 | OUTPUT_DIR: . 59 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOWFAST_4x16_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 32 10 | SAMPLING_RATE: 2 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 256 14 | INPUT_CHANNEL_NUM: [3, 3] 15 | SLOWFAST: 16 | ALPHA: 8 17 | BETA_INV: 8 18 | FUSION_CONV_CHANNEL_RATIO: 2 19 | FUSION_KERNEL_SZ: 5 20 | RESNET: 21 | ZERO_INIT_FINAL_BN: True 22 | WIDTH_PER_GROUP: 64 23 | NUM_GROUPS: 1 24 | DEPTH: 50 25 | TRANS_FUNC: bottleneck_transform 26 | STRIDE_1X1: False 27 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 28 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 29 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 30 | NONLOCAL: 31 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 32 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 33 | INSTANTIATION: dot_product 34 | BN: 35 | USE_PRECISE_STATS: True 36 | NUM_BATCHES_PRECISE: 200 37 | SOLVER: 38 | BASE_LR: 0.1 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 196 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 1e-4 43 | WARMUP_EPOCHS: 34.0 44 | WARMUP_START_LR: 0.01 45 | OPTIMIZING_METHOD: sgd 46 | MODEL: 47 | NUM_CLASSES: 400 48 | ARCH: slowfast 49 | MODEL_NAME: SlowFast 50 | LOSS_FUNC: cross_entropy 51 | DROPOUT_RATE: 0.5 52 | TEST: 53 | ENABLE: True 54 | DATASET: kinetics 55 | BATCH_SIZE: 64 56 | DATA_LOADER: 57 | NUM_WORKERS: 8 58 | PIN_MEMORY: True 59 | NUM_GPUS: 8 60 | NUM_SHARDS: 1 61 | RNG_SEED: 0 62 | OUTPUT_DIR: . 
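A note on the naming, inferred from the fields in these configs rather than from any comment in them: in `SLOWFAST_TxS_R50`, T is the number of frames the slow pathway sees and S its temporal stride in raw frames. Decoding `SLOWFAST_4x16_R50` above:

```
# NUM_FRAMES = 32, SAMPLING_RATE = 2, ALPHA = 8
# fast pathway : 32 frames, one every 2 raw frames
# slow pathway : keeps every ALPHA-th frame  -> 32 / 8 = 4 frames
# slow stride  : 2 * 8 = 16 raw frames       -> hence "4x16"
```

The same arithmetic matches `SLOWFAST_8x8_R50` next, where ALPHA is 4: 32 / 4 = 8 frames at stride 2 * 4 = 8.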
63 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOWFAST_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 32 10 | SAMPLING_RATE: 2 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 256 14 | INPUT_CHANNEL_NUM: [3, 3] 15 | SLOWFAST: 16 | ALPHA: 4 17 | BETA_INV: 8 18 | FUSION_CONV_CHANNEL_RATIO: 2 19 | FUSION_KERNEL_SZ: 7 20 | RESNET: 21 | ZERO_INIT_FINAL_BN: True 22 | WIDTH_PER_GROUP: 64 23 | NUM_GROUPS: 1 24 | DEPTH: 50 25 | TRANS_FUNC: bottleneck_transform 26 | STRIDE_1X1: False 27 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 28 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 29 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 30 | NONLOCAL: 31 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 32 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 33 | INSTANTIATION: dot_product 34 | BN: 35 | USE_PRECISE_STATS: True 36 | NUM_BATCHES_PRECISE: 200 37 | SOLVER: 38 | BASE_LR: 0.1 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 196 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 1e-4 43 | WARMUP_EPOCHS: 34.0 44 | WARMUP_START_LR: 0.01 45 | OPTIMIZING_METHOD: sgd 46 | MODEL: 47 | NUM_CLASSES: 400 48 | ARCH: slowfast 49 | MODEL_NAME: SlowFast 50 | LOSS_FUNC: cross_entropy 51 | DROPOUT_RATE: 0.5 52 | TEST: 53 | ENABLE: True 54 | DATASET: kinetics 55 | BATCH_SIZE: 64 56 | DATA_LOADER: 57 | NUM_WORKERS: 8 58 | PIN_MEMORY: True 59 | NUM_GPUS: 8 60 | NUM_SHARDS: 1 61 | RNG_SEED: 0 62 | OUTPUT_DIR: . 63 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOWFAST_8x8_R50_stepwise.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 32 10 | SAMPLING_RATE: 2 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 256 14 | INPUT_CHANNEL_NUM: [3, 3] 15 | SLOWFAST: 16 | ALPHA: 4 17 | BETA_INV: 8 18 | FUSION_CONV_CHANNEL_RATIO: 2 19 | FUSION_KERNEL_SZ: 7 20 | RESNET: 21 | ZERO_INIT_FINAL_BN: True 22 | WIDTH_PER_GROUP: 64 23 | NUM_GROUPS: 1 24 | DEPTH: 50 25 | TRANS_FUNC: bottleneck_transform 26 | STRIDE_1X1: False 27 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 28 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 29 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 30 | NONLOCAL: 31 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 32 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 33 | INSTANTIATION: dot_product 34 | BN: 35 | USE_PRECISE_STATS: True 36 | NUM_BATCHES_PRECISE: 200 37 | SOLVER: 38 | BASE_LR: 0.1 39 | LR_POLICY: steps_with_relative_lrs 40 | LRS: [1, 0.1, 0.01, 0.001, 0.0001, 0.00001] 41 | STEPS: [0, 94, 154, 196] 42 | MAX_EPOCH: 239 43 | MOMENTUM: 0.9 44 | WEIGHT_DECAY: 1e-4 45 | WARMUP_EPOCHS: 34.0 46 | WARMUP_START_LR: 0.01 47 | OPTIMIZING_METHOD: sgd 48 | MODEL: 49 | NUM_CLASSES: 400 50 | ARCH: slowfast 51 | MODEL_NAME: SlowFast 52 | LOSS_FUNC: cross_entropy 53 | DROPOUT_RATE: 0.5 54 | TEST: 55 | ENABLE: True 56 | DATASET: kinetics 57 | BATCH_SIZE: 64 58 | DATA_LOADER: 59 | NUM_WORKERS: 8 60 | PIN_MEMORY: True 61 | NUM_GPUS: 8 62 | NUM_SHARDS: 1 63 | RNG_SEED: 0 64 | OUTPUT_DIR: . 
65 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOWFAST_8x8_R50_stepwise_multigrid.yaml: -------------------------------------------------------------------------------- 1 | MULTIGRID: 2 | SHORT_CYCLE: True 3 | LONG_CYCLE: True 4 | TRAIN: 5 | ENABLE: True 6 | DATASET: kinetics 7 | BATCH_SIZE: 64 8 | EVAL_PERIOD: 10 9 | CHECKPOINT_PERIOD: 1 10 | AUTO_RESUME: True 11 | DATA: 12 | NUM_FRAMES: 32 13 | SAMPLING_RATE: 2 14 | TRAIN_JITTER_SCALES: [256, 320] 15 | TRAIN_CROP_SIZE: 224 16 | TEST_CROP_SIZE: 224 17 | INPUT_CHANNEL_NUM: [3, 3] 18 | SLOWFAST: 19 | ALPHA: 4 20 | BETA_INV: 8 21 | FUSION_CONV_CHANNEL_RATIO: 2 22 | FUSION_KERNEL_SZ: 7 23 | RESNET: 24 | ZERO_INIT_FINAL_BN: True 25 | WIDTH_PER_GROUP: 64 26 | NUM_GROUPS: 1 27 | DEPTH: 50 28 | TRANS_FUNC: bottleneck_transform 29 | STRIDE_1X1: False 30 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 31 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 32 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 33 | NONLOCAL: 34 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 35 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 36 | INSTANTIATION: dot_product 37 | BN: 38 | USE_PRECISE_STATS: True 39 | NUM_BATCHES_PRECISE: 200 40 | SOLVER: 41 | BASE_LR: 0.1 42 | LR_POLICY: steps_with_relative_lrs 43 | LRS: [1, 0.1, 0.01, 0.001, 0.0001, 0.00001] 44 | STEPS: [0, 94, 154, 196] 45 | MAX_EPOCH: 239 46 | MOMENTUM: 0.9 47 | WEIGHT_DECAY: 1e-4 48 | WARMUP_EPOCHS: 34.0 49 | WARMUP_START_LR: 0.01 50 | OPTIMIZING_METHOD: sgd 51 | MODEL: 52 | NUM_CLASSES: 400 53 | ARCH: slowfast 54 | MODEL_NAME: SlowFast 55 | LOSS_FUNC: cross_entropy 56 | DROPOUT_RATE: 0.5 57 | TEST: 58 | ENABLE: True 59 | DATASET: kinetics 60 | BATCH_SIZE: 64 61 | DATA_LOADER: 62 | NUM_WORKERS: 8 63 | PIN_MEMORY: True 64 | NUM_GPUS: 8 65 | NUM_SHARDS: 1 66 | RNG_SEED: 0 67 | OUTPUT_DIR: . 
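`SLOWFAST_8x8_R50_stepwise_multigrid` above differs from the plain `_stepwise` config only in the leading `MULTIGRID` block and in `TEST_CROP_SIZE` (224 vs. 256), so the same training can also be launched by overriding the stepwise config on the command line -- a sketch, assuming the dataset paths are already configured:

```
python tools/run_net.py \
  --cfg configs/Kinetics/SLOWFAST_8x8_R50_stepwise.yaml \
  MULTIGRID.LONG_CYCLE True \
  MULTIGRID.SHORT_CYCLE True \
  DATA.TEST_CROP_SIZE 224
```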
68 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOWFAST_NLN_4x16_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 32 10 | SAMPLING_RATE: 2 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 256 14 | INPUT_CHANNEL_NUM: [3, 3] 15 | SLOWFAST: 16 | ALPHA: 8 17 | BETA_INV: 8 18 | FUSION_CONV_CHANNEL_RATIO: 2 19 | FUSION_KERNEL_SZ: 5 20 | RESNET: 21 | ZERO_INIT_FINAL_BN: True 22 | WIDTH_PER_GROUP: 64 23 | NUM_GROUPS: 1 24 | DEPTH: 50 25 | TRANS_FUNC: bottleneck_transform 26 | STRIDE_1X1: False 27 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 28 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 29 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 30 | NONLOCAL: 31 | LOCATION: [[[], []], [[1, 3], []], [[1, 3, 5], []], [[], []]] 32 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 33 | INSTANTIATION: dot_product 34 | BN: 35 | USE_PRECISE_STATS: True 36 | NUM_BATCHES_PRECISE: 200 37 | SOLVER: 38 | BASE_LR: 0.1 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 196 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 1e-4 43 | WARMUP_EPOCHS: 34.0 44 | WARMUP_START_LR: 0.01 45 | OPTIMIZING_METHOD: sgd 46 | MODEL: 47 | NUM_CLASSES: 400 48 | ARCH: slowfast 49 | MODEL_NAME: SlowFast 50 | LOSS_FUNC: cross_entropy 51 | DROPOUT_RATE: 0.5 52 | TEST: 53 | ENABLE: True 54 | DATASET: kinetics 55 | BATCH_SIZE: 64 56 | DATA_LOADER: 57 | NUM_WORKERS: 8 58 | PIN_MEMORY: True 59 | NUM_GPUS: 8 60 | NUM_SHARDS: 1 61 | RNG_SEED: 0 62 | OUTPUT_DIR: . 63 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOWFAST_NLN_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 32 10 | SAMPLING_RATE: 2 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 256 14 | INPUT_CHANNEL_NUM: [3, 3] 15 | SLOWFAST: 16 | ALPHA: 4 17 | BETA_INV: 8 18 | FUSION_CONV_CHANNEL_RATIO: 2 19 | FUSION_KERNEL_SZ: 5 20 | RESNET: 21 | ZERO_INIT_FINAL_BN: True 22 | WIDTH_PER_GROUP: 64 23 | NUM_GROUPS: 1 24 | DEPTH: 50 25 | TRANS_FUNC: bottleneck_transform 26 | STRIDE_1X1: False 27 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 28 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 29 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 30 | NONLOCAL: 31 | LOCATION: [[[], []], [[1, 3], []], [[1, 3, 5], []], [[], []]] 32 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 33 | INSTANTIATION: dot_product 34 | BN: 35 | USE_PRECISE_STATS: True 36 | NUM_BATCHES_PRECISE: 200 37 | SOLVER: 38 | BASE_LR: 0.1 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 196 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 1e-4 43 | WARMUP_EPOCHS: 34.0 44 | WARMUP_START_LR: 0.01 45 | OPTIMIZING_METHOD: sgd 46 | MODEL: 47 | NUM_CLASSES: 400 48 | ARCH: slowfast 49 | MODEL_NAME: SlowFast 50 | LOSS_FUNC: cross_entropy 51 | DROPOUT_RATE: 0.5 52 | TEST: 53 | ENABLE: True 54 | DATASET: kinetics 55 | BATCH_SIZE: 64 56 | DATA_LOADER: 57 | NUM_WORKERS: 8 58 | PIN_MEMORY: True 59 | NUM_GPUS: 8 60 | NUM_SHARDS: 1 61 | RNG_SEED: 0 62 | OUTPUT_DIR: . 
63 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOW_4x16_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 4 10 | SAMPLING_RATE: 16 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 256 14 | INPUT_CHANNEL_NUM: [3] 15 | RESNET: 16 | ZERO_INIT_FINAL_BN: True 17 | WIDTH_PER_GROUP: 64 18 | NUM_GROUPS: 1 19 | DEPTH: 50 20 | TRANS_FUNC: bottleneck_transform 21 | STRIDE_1X1: False 22 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 23 | NONLOCAL: 24 | LOCATION: [[[]], [[]], [[]], [[]]] 25 | GROUP: [[1], [1], [1], [1]] 26 | INSTANTIATION: dot_product 27 | BN: 28 | USE_PRECISE_STATS: True 29 | NUM_BATCHES_PRECISE: 200 30 | SOLVER: 31 | BASE_LR: 0.1 32 | LR_POLICY: cosine 33 | MAX_EPOCH: 196 34 | MOMENTUM: 0.9 35 | WEIGHT_DECAY: 1e-4 36 | WARMUP_EPOCHS: 34.0 37 | WARMUP_START_LR: 0.01 38 | OPTIMIZING_METHOD: sgd 39 | MODEL: 40 | NUM_CLASSES: 400 41 | ARCH: slow 42 | MODEL_NAME: ResNet 43 | LOSS_FUNC: cross_entropy 44 | DROPOUT_RATE: 0.5 45 | TEST: 46 | ENABLE: True 47 | DATASET: kinetics 48 | BATCH_SIZE: 64 49 | DATA_LOADER: 50 | NUM_WORKERS: 8 51 | PIN_MEMORY: True 52 | NUM_GPUS: 8 53 | NUM_SHARDS: 1 54 | RNG_SEED: 0 55 | OUTPUT_DIR: . 56 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOW_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 8 10 | SAMPLING_RATE: 8 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 256 14 | INPUT_CHANNEL_NUM: [3] 15 | RESNET: 16 | ZERO_INIT_FINAL_BN: True 17 | WIDTH_PER_GROUP: 64 18 | NUM_GROUPS: 1 19 | DEPTH: 50 20 | TRANS_FUNC: bottleneck_transform 21 | STRIDE_1X1: False 22 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 23 | NONLOCAL: 24 | LOCATION: [[[]], [[]], [[]], [[]]] 25 | GROUP: [[1], [1], [1], [1]] 26 | INSTANTIATION: dot_product 27 | BN: 28 | USE_PRECISE_STATS: True 29 | NUM_BATCHES_PRECISE: 200 30 | SOLVER: 31 | BASE_LR: 0.1 32 | LR_POLICY: cosine 33 | MAX_EPOCH: 196 34 | MOMENTUM: 0.9 35 | WEIGHT_DECAY: 1e-4 36 | WARMUP_EPOCHS: 34.0 37 | WARMUP_START_LR: 0.01 38 | OPTIMIZING_METHOD: sgd 39 | MODEL: 40 | NUM_CLASSES: 400 41 | ARCH: slow 42 | MODEL_NAME: ResNet 43 | LOSS_FUNC: cross_entropy 44 | DROPOUT_RATE: 0.5 45 | TEST: 46 | ENABLE: True 47 | DATASET: kinetics 48 | BATCH_SIZE: 64 49 | DATA_LOADER: 50 | NUM_WORKERS: 8 51 | PIN_MEMORY: True 52 | NUM_GPUS: 8 53 | NUM_SHARDS: 1 54 | RNG_SEED: 0 55 | OUTPUT_DIR: . 
56 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOW_NLN_4x16_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 4 10 | SAMPLING_RATE: 16 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 256 14 | INPUT_CHANNEL_NUM: [3] 15 | RESNET: 16 | ZERO_INIT_FINAL_BN: True 17 | WIDTH_PER_GROUP: 64 18 | NUM_GROUPS: 1 19 | DEPTH: 50 20 | TRANS_FUNC: bottleneck_transform 21 | STRIDE_1X1: False 22 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 23 | NONLOCAL: 24 | LOCATION: [[[]], [[1, 3]], [[1, 3, 5]], [[]]] 25 | GROUP: [[1], [1], [1], [1]] 26 | INSTANTIATION: dot_product 27 | BN: 28 | USE_PRECISE_STATS: True 29 | NUM_BATCHES_PRECISE: 200 30 | SOLVER: 31 | BASE_LR: 0.1 32 | LR_POLICY: cosine 33 | MAX_EPOCH: 196 34 | MOMENTUM: 0.9 35 | WEIGHT_DECAY: 1e-4 36 | WARMUP_EPOCHS: 34.0 37 | WARMUP_START_LR: 0.01 38 | OPTIMIZING_METHOD: sgd 39 | MODEL: 40 | NUM_CLASSES: 400 41 | ARCH: slow 42 | MODEL_NAME: ResNet 43 | LOSS_FUNC: cross_entropy 44 | DROPOUT_RATE: 0.5 45 | TEST: 46 | ENABLE: True 47 | DATASET: kinetics 48 | BATCH_SIZE: 64 49 | DATA_LOADER: 50 | NUM_WORKERS: 8 51 | PIN_MEMORY: True 52 | NUM_GPUS: 8 53 | NUM_SHARDS: 1 54 | RNG_SEED: 0 55 | OUTPUT_DIR: . 56 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOW_NLN_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 8 10 | SAMPLING_RATE: 8 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 256 14 | INPUT_CHANNEL_NUM: [3] 15 | RESNET: 16 | ZERO_INIT_FINAL_BN: True 17 | WIDTH_PER_GROUP: 64 18 | NUM_GROUPS: 1 19 | DEPTH: 50 20 | TRANS_FUNC: bottleneck_transform 21 | STRIDE_1X1: False 22 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 23 | NONLOCAL: 24 | LOCATION: [[[]], [[1, 3]], [[1, 3, 5]], [[]]] 25 | GROUP: [[1], [1], [1], [1]] 26 | INSTANTIATION: dot_product 27 | BN: 28 | USE_PRECISE_STATS: True 29 | NUM_BATCHES_PRECISE: 200 30 | SOLVER: 31 | BASE_LR: 0.1 32 | LR_POLICY: cosine 33 | MAX_EPOCH: 196 34 | MOMENTUM: 0.9 35 | WEIGHT_DECAY: 1e-4 36 | WARMUP_EPOCHS: 34.0 37 | WARMUP_START_LR: 0.01 38 | OPTIMIZING_METHOD: sgd 39 | MODEL: 40 | NUM_CLASSES: 400 41 | ARCH: slow 42 | MODEL_NAME: ResNet 43 | LOSS_FUNC: cross_entropy 44 | DROPOUT_RATE: 0.5 45 | TEST: 46 | ENABLE: True 47 | DATASET: kinetics 48 | BATCH_SIZE: 64 49 | DATA_LOADER: 50 | NUM_WORKERS: 8 51 | PIN_MEMORY: True 52 | NUM_GPUS: 8 53 | NUM_SHARDS: 1 54 | RNG_SEED: 0 55 | OUTPUT_DIR: . 
56 | -------------------------------------------------------------------------------- /configs/Kinetics/c2/C2D_NOPOOL_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to the model to test 9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 8 12 | SAMPLING_RATE: 8 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3] 17 | RESNET: 18 | ZERO_INIT_FINAL_BN: True 19 | WIDTH_PER_GROUP: 64 20 | NUM_GROUPS: 1 21 | DEPTH: 50 22 | TRANS_FUNC: bottleneck_transform 23 | STRIDE_1X1: False 24 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 25 | NONLOCAL: 26 | LOCATION: [[[]], [[]], [[]], [[]]] 27 | GROUP: [[1], [1], [1], [1]] 28 | INSTANTIATION: softmax 29 | BN: 30 | USE_PRECISE_STATS: True 31 | NUM_BATCHES_PRECISE: 200 32 | SOLVER: 33 | BASE_LR: 0.1 34 | LR_POLICY: cosine 35 | MAX_EPOCH: 196 36 | MOMENTUM: 0.9 37 | WEIGHT_DECAY: 1e-4 38 | WARMUP_EPOCHS: 34.0 39 | WARMUP_START_LR: 0.01 40 | OPTIMIZING_METHOD: sgd 41 | MODEL: 42 | NUM_CLASSES: 400 43 | ARCH: c2d 44 | MODEL_NAME: ResNet_nopool 45 | LOSS_FUNC: cross_entropy 46 | DROPOUT_RATE: 0.5 47 | TEST: 48 | ENABLE: True 49 | DATASET: kinetics 50 | BATCH_SIZE: 64 51 | DATA_LOADER: 52 | NUM_WORKERS: 8 53 | PIN_MEMORY: True 54 | NUM_GPUS: 8 55 | NUM_SHARDS: 1 56 | RNG_SEED: 0 57 | OUTPUT_DIR: . 58 | -------------------------------------------------------------------------------- /configs/Kinetics/c2/I3D_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to the model to test 9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 8 12 | SAMPLING_RATE: 8 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3] 17 | RESNET: 18 | ZERO_INIT_FINAL_BN: True 19 | WIDTH_PER_GROUP: 64 20 | NUM_GROUPS: 1 21 | DEPTH: 50 22 | TRANS_FUNC: bottleneck_transform 23 | STRIDE_1X1: False 24 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 25 | NONLOCAL: 26 | LOCATION: [[[]], [[]], [[]], [[]]] 27 | GROUP: [[1], [1], [1], [1]] 28 | INSTANTIATION: softmax 29 | BN: 30 | USE_PRECISE_STATS: True 31 | NUM_BATCHES_PRECISE: 200 32 | SOLVER: 33 | BASE_LR: 0.1 34 | LR_POLICY: cosine 35 | MAX_EPOCH: 196 36 | MOMENTUM: 0.9 37 | WEIGHT_DECAY: 1e-4 38 | WARMUP_EPOCHS: 34.0 39 | WARMUP_START_LR: 0.01 40 | OPTIMIZING_METHOD: sgd 41 | MODEL: 42 | NUM_CLASSES: 400 43 | ARCH: i3d 44 | MODEL_NAME: ResNet 45 | LOSS_FUNC: cross_entropy 46 | DROPOUT_RATE: 0.5 47 | TEST: 48 | ENABLE: True 49 | DATASET: kinetics 50 | BATCH_SIZE: 64 51 | DATA_LOADER: 52 | NUM_WORKERS: 8 53 | PIN_MEMORY: True 54 | NUM_GPUS: 8 55 | NUM_SHARDS: 1 56 | RNG_SEED: 0 57 | OUTPUT_DIR: . 
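The `configs/Kinetics/c2/*` files in this group are evaluation-only recipes for Caffe2-converted model-zoo weights: note `TRAIN.ENABLE: False`, `CHECKPOINT_TYPE: caffe2`, and the commented-out `CHECKPOINT_FILE_PATH`. A minimal test invocation, following the pattern the configs themselves suggest (the `.pkl` path is a placeholder for a checkpoint downloaded from the model zoo):

```
python tools/run_net.py \
  --cfg configs/Kinetics/c2/I3D_8x8_R50.yaml \
  TRAIN.CHECKPOINT_FILE_PATH /path/to/I3D_8x8_R50.pkl
```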
58 | -------------------------------------------------------------------------------- /configs/Kinetics/c2/I3D_NLN_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to the model to test 9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 8 12 | SAMPLING_RATE: 8 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3] 17 | RESNET: 18 | ZERO_INIT_FINAL_BN: True 19 | WIDTH_PER_GROUP: 64 20 | NUM_GROUPS: 1 21 | DEPTH: 50 22 | TRANS_FUNC: bottleneck_transform 23 | STRIDE_1X1: False 24 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 25 | NONLOCAL: 26 | LOCATION: [[[]], [[1, 3]], [[1, 3, 5]], [[]]] 27 | GROUP: [[1], [1], [1], [1]] 28 | INSTANTIATION: softmax 29 | BN: 30 | USE_PRECISE_STATS: True 31 | NUM_BATCHES_PRECISE: 200 32 | SOLVER: 33 | BASE_LR: 0.1 34 | LR_POLICY: cosine 35 | MAX_EPOCH: 196 36 | MOMENTUM: 0.9 37 | WEIGHT_DECAY: 1e-4 38 | WARMUP_EPOCHS: 34.0 39 | WARMUP_START_LR: 0.01 40 | OPTIMIZING_METHOD: sgd 41 | MODEL: 42 | NUM_CLASSES: 400 43 | ARCH: i3d 44 | MODEL_NAME: ResNet 45 | LOSS_FUNC: cross_entropy 46 | DROPOUT_RATE: 0.5 47 | TEST: 48 | ENABLE: True 49 | DATASET: kinetics 50 | BATCH_SIZE: 64 51 | DATA_LOADER: 52 | NUM_WORKERS: 8 53 | PIN_MEMORY: True 54 | NUM_GPUS: 8 55 | NUM_SHARDS: 1 56 | RNG_SEED: 0 57 | OUTPUT_DIR: . 58 | -------------------------------------------------------------------------------- /configs/Kinetics/c2/SLOWFAST_16x8_R101_50_50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to the model to test 9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 64 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | SLOWFAST: 18 | ALPHA: 4 19 | BETA_INV: 8 20 | FUSION_CONV_CHANNEL_RATIO: 2 21 | FUSION_KERNEL_SZ: 5 22 | RESNET: 23 | ZERO_INIT_FINAL_BN: True 24 | WIDTH_PER_GROUP: 64 25 | NUM_GROUPS: 1 26 | DEPTH: 101 27 | TRANS_FUNC: bottleneck_transform 28 | STRIDE_1X1: False 29 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 30 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 31 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 32 | NONLOCAL: 33 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 34 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 35 | INSTANTIATION: dot_product 36 | BN: 37 | USE_PRECISE_STATS: True 38 | NUM_BATCHES_PRECISE: 200 39 | SOLVER: 40 | BASE_LR: 0.1 41 | LR_POLICY: cosine 42 | MAX_EPOCH: 196 43 | MOMENTUM: 0.9 44 | WEIGHT_DECAY: 1e-4 45 | WARMUP_EPOCHS: 34.0 46 | WARMUP_START_LR: 0.01 47 | OPTIMIZING_METHOD: sgd 48 | MODEL: 49 | NUM_CLASSES: 400 50 | ARCH: slowfast 51 | MODEL_NAME: SlowFast 52 | LOSS_FUNC: cross_entropy 53 | DROPOUT_RATE: 0.5 54 | TEST: 55 | ENABLE: True 56 | DATASET: kinetics 57 | BATCH_SIZE: 64 58 | DATA_LOADER: 59 | NUM_WORKERS: 8 60 | PIN_MEMORY: True 61 | NUM_GPUS: 8 62 | NUM_SHARDS: 1 63 | RNG_SEED: 0 64 | OUTPUT_DIR: . 
65 | -------------------------------------------------------------------------------- /configs/Kinetics/c2/SLOWFAST_4x16_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to the model to test 9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 32 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | SLOWFAST: 18 | ALPHA: 8 19 | BETA_INV: 8 20 | FUSION_CONV_CHANNEL_RATIO: 2 21 | FUSION_KERNEL_SZ: 5 22 | RESNET: 23 | ZERO_INIT_FINAL_BN: True 24 | WIDTH_PER_GROUP: 64 25 | NUM_GROUPS: 1 26 | DEPTH: 50 27 | TRANS_FUNC: bottleneck_transform 28 | STRIDE_1X1: False 29 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 30 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 31 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 32 | NONLOCAL: 33 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 34 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 35 | INSTANTIATION: dot_product 36 | BN: 37 | USE_PRECISE_STATS: True 38 | NUM_BATCHES_PRECISE: 200 39 | SOLVER: 40 | BASE_LR: 0.1 41 | LR_POLICY: cosine 42 | MAX_EPOCH: 196 43 | MOMENTUM: 0.9 44 | WEIGHT_DECAY: 1e-4 45 | WARMUP_EPOCHS: 34.0 46 | WARMUP_START_LR: 0.01 47 | OPTIMIZING_METHOD: sgd 48 | MODEL: 49 | NUM_CLASSES: 400 50 | ARCH: slowfast 51 | MODEL_NAME: SlowFast 52 | LOSS_FUNC: cross_entropy 53 | DROPOUT_RATE: 0.5 54 | TEST: 55 | ENABLE: True 56 | DATASET: kinetics 57 | BATCH_SIZE: 64 58 | DATA_LOADER: 59 | NUM_WORKERS: 8 60 | PIN_MEMORY: True 61 | NUM_GPUS: 8 62 | NUM_SHARDS: 1 63 | RNG_SEED: 0 64 | OUTPUT_DIR: . 
65 | -------------------------------------------------------------------------------- /configs/Kinetics/c2/SLOWFAST_8x8_R101_101_101.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to the model to test 9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 32 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | SLOWFAST: 18 | ALPHA: 4 19 | BETA_INV: 8 20 | FUSION_CONV_CHANNEL_RATIO: 2 21 | FUSION_KERNEL_SZ: 5 22 | RESNET: 23 | ZERO_INIT_FINAL_BN: True 24 | WIDTH_PER_GROUP: 64 25 | NUM_GROUPS: 1 26 | DEPTH: 101 27 | TRANS_FUNC: bottleneck_transform 28 | STRIDE_1X1: False 29 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [23, 23], [3, 3]] 30 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 31 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 32 | NONLOCAL: 33 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 34 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 35 | INSTANTIATION: dot_product 36 | BN: 37 | USE_PRECISE_STATS: True 38 | NUM_BATCHES_PRECISE: 200 39 | SOLVER: 40 | BASE_LR: 0.1 41 | LR_POLICY: cosine 42 | MAX_EPOCH: 196 43 | MOMENTUM: 0.9 44 | WEIGHT_DECAY: 1e-4 45 | WARMUP_EPOCHS: 34.0 46 | WARMUP_START_LR: 0.01 47 | OPTIMIZING_METHOD: sgd 48 | MODEL: 49 | NUM_CLASSES: 400 50 | ARCH: slowfast 51 | MODEL_NAME: SlowFast 52 | LOSS_FUNC: cross_entropy 53 | DROPOUT_RATE: 0.5 54 | TEST: 55 | ENABLE: True 56 | DATASET: kinetics 57 | BATCH_SIZE: 64 58 | DATA_LOADER: 59 | NUM_WORKERS: 8 60 | PIN_MEMORY: True 61 | NUM_GPUS: 8 62 | NUM_SHARDS: 1 63 | RNG_SEED: 0 64 | OUTPUT_DIR: . 
65 | -------------------------------------------------------------------------------- /configs/Kinetics/c2/SLOWFAST_8x8_R101_50_101.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to the model to test 9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 32 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | SLOWFAST: 18 | ALPHA: 4 19 | BETA_INV: 8 20 | FUSION_CONV_CHANNEL_RATIO: 2 21 | FUSION_KERNEL_SZ: 5 22 | RESNET: 23 | ZERO_INIT_FINAL_BN: True 24 | WIDTH_PER_GROUP: 64 25 | NUM_GROUPS: 1 26 | DEPTH: 101 27 | TRANS_FUNC: bottleneck_transform 28 | STRIDE_1X1: False 29 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 23], [3, 3]] 30 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 31 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 32 | NONLOCAL: 33 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 34 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 35 | INSTANTIATION: dot_product 36 | BN: 37 | USE_PRECISE_STATS: True 38 | NUM_BATCHES_PRECISE: 200 39 | SOLVER: 40 | BASE_LR: 0.1 41 | LR_POLICY: cosine 42 | MAX_EPOCH: 196 43 | MOMENTUM: 0.9 44 | WEIGHT_DECAY: 1e-4 45 | WARMUP_EPOCHS: 34.0 46 | WARMUP_START_LR: 0.01 47 | OPTIMIZING_METHOD: sgd 48 | MODEL: 49 | NUM_CLASSES: 400 50 | ARCH: slowfast 51 | MODEL_NAME: SlowFast 52 | LOSS_FUNC: cross_entropy 53 | DROPOUT_RATE: 0.5 54 | TEST: 55 | ENABLE: True 56 | DATASET: kinetics 57 | BATCH_SIZE: 64 58 | DATA_LOADER: 59 | NUM_WORKERS: 8 60 | PIN_MEMORY: True 61 | NUM_GPUS: 8 62 | NUM_SHARDS: 1 63 | RNG_SEED: 0 64 | OUTPUT_DIR: . 
65 | -------------------------------------------------------------------------------- /configs/Kinetics/c2/SLOWFAST_8x8_R101_50_50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to the model to test 9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 32 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | SLOWFAST: 18 | ALPHA: 4 19 | BETA_INV: 8 20 | FUSION_CONV_CHANNEL_RATIO: 2 21 | FUSION_KERNEL_SZ: 5 22 | RESNET: 23 | ZERO_INIT_FINAL_BN: True 24 | WIDTH_PER_GROUP: 64 25 | NUM_GROUPS: 1 26 | DEPTH: 101 27 | TRANS_FUNC: bottleneck_transform 28 | STRIDE_1X1: False 29 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 30 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 31 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 32 | NONLOCAL: 33 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 34 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 35 | INSTANTIATION: dot_product 36 | BN: 37 | USE_PRECISE_STATS: True 38 | NUM_BATCHES_PRECISE: 200 39 | SOLVER: 40 | BASE_LR: 0.1 41 | LR_POLICY: cosine 42 | MAX_EPOCH: 196 43 | MOMENTUM: 0.9 44 | WEIGHT_DECAY: 1e-4 45 | WARMUP_EPOCHS: 34.0 46 | WARMUP_START_LR: 0.01 47 | OPTIMIZING_METHOD: sgd 48 | MODEL: 49 | NUM_CLASSES: 400 50 | ARCH: slowfast 51 | MODEL_NAME: SlowFast 52 | LOSS_FUNC: cross_entropy 53 | DROPOUT_RATE: 0.5 54 | TEST: 55 | ENABLE: True 56 | DATASET: kinetics 57 | BATCH_SIZE: 64 58 | DATA_LOADER: 59 | NUM_WORKERS: 8 60 | PIN_MEMORY: True 61 | NUM_GPUS: 8 62 | NUM_SHARDS: 1 63 | RNG_SEED: 0 64 | OUTPUT_DIR: . 
65 | -------------------------------------------------------------------------------- /configs/Kinetics/c2/SLOWFAST_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to the model to test 9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 32 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | SLOWFAST: 18 | ALPHA: 4 19 | BETA_INV: 8 20 | FUSION_CONV_CHANNEL_RATIO: 2 21 | FUSION_KERNEL_SZ: 7 22 | RESNET: 23 | ZERO_INIT_FINAL_BN: True 24 | WIDTH_PER_GROUP: 64 25 | NUM_GROUPS: 1 26 | DEPTH: 50 27 | TRANS_FUNC: bottleneck_transform 28 | STRIDE_1X1: False 29 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 30 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 31 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 32 | NONLOCAL: 33 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 34 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 35 | INSTANTIATION: dot_product 36 | BN: 37 | USE_PRECISE_STATS: True 38 | NUM_BATCHES_PRECISE: 200 39 | SOLVER: 40 | BASE_LR: 0.1 41 | LR_POLICY: cosine 42 | MAX_EPOCH: 196 43 | MOMENTUM: 0.9 44 | WEIGHT_DECAY: 1e-4 45 | WARMUP_EPOCHS: 34.0 46 | WARMUP_START_LR: 0.01 47 | OPTIMIZING_METHOD: sgd 48 | MODEL: 49 | NUM_CLASSES: 400 50 | ARCH: slowfast 51 | MODEL_NAME: SlowFast 52 | LOSS_FUNC: cross_entropy 53 | DROPOUT_RATE: 0.5 54 | TEST: 55 | ENABLE: True 56 | DATASET: kinetics 57 | BATCH_SIZE: 64 58 | DATA_LOADER: 59 | NUM_WORKERS: 8 60 | PIN_MEMORY: True 61 | NUM_GPUS: 8 62 | NUM_SHARDS: 1 63 | RNG_SEED: 0 64 | OUTPUT_DIR: . 
65 | -------------------------------------------------------------------------------- /configs/Kinetics/c2/SLOWFAST_NLN_16x8_R101_50_50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to the model to test 9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 64 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | SLOWFAST: 18 | ALPHA: 4 19 | BETA_INV: 8 20 | FUSION_CONV_CHANNEL_RATIO: 2 21 | FUSION_KERNEL_SZ: 5 22 | RESNET: 23 | ZERO_INIT_FINAL_BN: True 24 | WIDTH_PER_GROUP: 64 25 | NUM_GROUPS: 1 26 | DEPTH: 101 27 | TRANS_FUNC: bottleneck_transform 28 | STRIDE_1X1: False 29 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 30 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 31 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 32 | NONLOCAL: 33 | LOCATION: [[[], []], [[], []], [[6, 13, 20], []], [[], []]] 34 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 35 | INSTANTIATION: dot_product 36 | BN: 37 | USE_PRECISE_STATS: True 38 | NUM_BATCHES_PRECISE: 200 39 | SOLVER: 40 | BASE_LR: 0.1 41 | LR_POLICY: cosine 42 | MAX_EPOCH: 196 43 | MOMENTUM: 0.9 44 | WEIGHT_DECAY: 1e-4 45 | WARMUP_EPOCHS: 34.0 46 | WARMUP_START_LR: 0.01 47 | OPTIMIZING_METHOD: sgd 48 | MODEL: 49 | NUM_CLASSES: 400 50 | ARCH: slowfast 51 | MODEL_NAME: SlowFast 52 | LOSS_FUNC: cross_entropy 53 | DROPOUT_RATE: 0.5 54 | TEST: 55 | ENABLE: True 56 | DATASET: kinetics 57 | BATCH_SIZE: 64 58 | DATA_LOADER: 59 | NUM_WORKERS: 8 60 | PIN_MEMORY: True 61 | NUM_GPUS: 8 62 | NUM_SHARDS: 1 63 | RNG_SEED: 0 64 | OUTPUT_DIR: . 65 | -------------------------------------------------------------------------------- /configs/Kinetics/c2/SLOW_4x16_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to the model to test 9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 4 12 | SAMPLING_RATE: 16 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3] 17 | RESNET: 18 | ZERO_INIT_FINAL_BN: True 19 | WIDTH_PER_GROUP: 64 20 | NUM_GROUPS: 1 21 | DEPTH: 50 22 | TRANS_FUNC: bottleneck_transform 23 | STRIDE_1X1: False 24 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 25 | NONLOCAL: 26 | LOCATION: [[[]], [[]], [[]], [[]]] 27 | GROUP: [[1], [1], [1], [1]] 28 | INSTANTIATION: dot_product 29 | BN: 30 | USE_PRECISE_STATS: True 31 | NUM_BATCHES_PRECISE: 200 32 | SOLVER: 33 | BASE_LR: 0.1 34 | LR_POLICY: cosine 35 | MAX_EPOCH: 196 36 | MOMENTUM: 0.9 37 | WEIGHT_DECAY: 1e-4 38 | WARMUP_EPOCHS: 34.0 39 | WARMUP_START_LR: 0.01 40 | OPTIMIZING_METHOD: sgd 41 | MODEL: 42 | NUM_CLASSES: 400 43 | ARCH: slow 44 | MODEL_NAME: ResNet 45 | LOSS_FUNC: cross_entropy 46 | DROPOUT_RATE: 0.5 47 | TEST: 48 | ENABLE: True 49 | DATASET: kinetics 50 | BATCH_SIZE: 64 51 | DATA_LOADER: 52 | NUM_WORKERS: 8 53 | PIN_MEMORY: True 54 | NUM_GPUS: 8 55 | NUM_SHARDS: 1 56 | RNG_SEED: 0 57 | OUTPUT_DIR: . 
58 | -------------------------------------------------------------------------------- /configs/Kinetics/c2/SLOW_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | # CHECKPOINT_FILE_PATH: path to the model to test 9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 8 12 | SAMPLING_RATE: 8 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3] 17 | RESNET: 18 | ZERO_INIT_FINAL_BN: True 19 | WIDTH_PER_GROUP: 64 20 | NUM_GROUPS: 1 21 | DEPTH: 50 22 | TRANS_FUNC: bottleneck_transform 23 | STRIDE_1X1: False 24 | NUM_BLOCK_TEMP_KERNEL: [[3], [4], [6], [3]] 25 | NONLOCAL: 26 | LOCATION: [[[]], [[]], [[]], [[]]] 27 | GROUP: [[1], [1], [1], [1]] 28 | INSTANTIATION: dot_product 29 | BN: 30 | USE_PRECISE_STATS: True 31 | NUM_BATCHES_PRECISE: 200 32 | SOLVER: 33 | BASE_LR: 0.1 34 | LR_POLICY: cosine 35 | MAX_EPOCH: 196 36 | MOMENTUM: 0.9 37 | WEIGHT_DECAY: 1e-4 38 | WARMUP_EPOCHS: 34.0 39 | WARMUP_START_LR: 0.01 40 | OPTIMIZING_METHOD: sgd 41 | MODEL: 42 | NUM_CLASSES: 400 43 | ARCH: slow 44 | MODEL_NAME: ResNet 45 | LOSS_FUNC: cross_entropy 46 | DROPOUT_RATE: 0.5 47 | TEST: 48 | ENABLE: True 49 | DATASET: kinetics 50 | BATCH_SIZE: 64 51 | DATA_LOADER: 52 | NUM_WORKERS: 8 53 | PIN_MEMORY: True 54 | NUM_GPUS: 8 55 | NUM_SHARDS: 1 56 | RNG_SEED: 0 57 | OUTPUT_DIR: . 58 | -------------------------------------------------------------------------------- /configs/SSv2/SLOWFAST_16x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: ssv2 4 | BATCH_SIZE: 16 5 | EVAL_PERIOD: 2 6 | CHECKPOINT_PERIOD: 2 7 | AUTO_RESUME: True 8 | CHECKPOINT_FILE_PATH: SLOWFAST_8x8_R50.pkl # please download from the model zoo. 
9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 64 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | INV_UNIFORM_SAMPLE: True 18 | RANDOM_FLIP: False 19 | REVERSE_INPUT_CHANNEL: True 20 | SLOWFAST: 21 | ALPHA: 4 22 | BETA_INV: 8 23 | FUSION_CONV_CHANNEL_RATIO: 2 24 | FUSION_KERNEL_SZ: 7 25 | RESNET: 26 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 27 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 28 | ZERO_INIT_FINAL_BN: True 29 | WIDTH_PER_GROUP: 64 30 | NUM_GROUPS: 1 31 | DEPTH: 50 32 | TRANS_FUNC: bottleneck_transform 33 | STRIDE_1X1: False 34 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 35 | NONLOCAL: 36 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 37 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 38 | INSTANTIATION: dot_product 39 | BN: 40 | USE_PRECISE_STATS: True 41 | NUM_BATCHES_PRECISE: 200 42 | NORM_TYPE: sync_batchnorm 43 | NUM_SYNC_DEVICES: 4 44 | SOLVER: 45 | BASE_LR: 0.03 46 | LR_POLICY: steps_with_relative_lrs 47 | LRS: [1, 0.1, 0.01, 0.001, 0.0001, 0.00001] 48 | STEPS: [0, 14, 18] 49 | MAX_EPOCH: 22 50 | MOMENTUM: 0.9 51 | WEIGHT_DECAY: 1e-6 52 | WARMUP_EPOCHS: 0.19 53 | WARMUP_START_LR: 0.0001 54 | OPTIMIZING_METHOD: sgd 55 | MODEL: 56 | NUM_CLASSES: 174 57 | ARCH: slowfast 58 | LOSS_FUNC: cross_entropy 59 | DROPOUT_RATE: 0.5 60 | TEST: 61 | ENABLE: True 62 | DATASET: ssv2 63 | BATCH_SIZE: 16 64 | NUM_ENSEMBLE_VIEWS: 1 65 | NUM_SPATIAL_CROPS: 1 66 | DATA_LOADER: 67 | NUM_WORKERS: 4 68 | PIN_MEMORY: True 69 | NUM_GPUS: 8 70 | NUM_SHARDS: 1 71 | RNG_SEED: 0 72 | OUTPUT_DIR: . 73 | LOG_MODEL_INFO: False 74 | -------------------------------------------------------------------------------- /configs/SSv2/SLOWFAST_16x8_R50_multigrid.yaml: -------------------------------------------------------------------------------- 1 | MULTIGRID: 2 | SHORT_CYCLE: True 3 | LONG_CYCLE: True 4 | TRAIN: 5 | ENABLE: True 6 | DATASET: ssv2 7 | BATCH_SIZE: 16 8 | EVAL_PERIOD: 2 9 | CHECKPOINT_PERIOD: 2 10 | AUTO_RESUME: True 11 | CHECKPOINT_FILE_PATH: SLOWFAST_8x8_R50.pkl # please download from the model zoo. 
12 | CHECKPOINT_TYPE: caffe2 13 | DATA: 14 | NUM_FRAMES: 64 15 | SAMPLING_RATE: 2 16 | TRAIN_JITTER_SCALES: [256, 320] 17 | TRAIN_CROP_SIZE: 224 18 | TEST_CROP_SIZE: 224 19 | INPUT_CHANNEL_NUM: [3, 3] 20 | INV_UNIFORM_SAMPLE: True 21 | RANDOM_FLIP: False 22 | REVERSE_INPUT_CHANNEL: True 23 | SLOWFAST: 24 | ALPHA: 4 25 | BETA_INV: 8 26 | FUSION_CONV_CHANNEL_RATIO: 2 27 | FUSION_KERNEL_SZ: 7 28 | RESNET: 29 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 30 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 31 | ZERO_INIT_FINAL_BN: True 32 | WIDTH_PER_GROUP: 64 33 | NUM_GROUPS: 1 34 | DEPTH: 50 35 | TRANS_FUNC: bottleneck_transform 36 | STRIDE_1X1: False 37 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 38 | NONLOCAL: 39 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 40 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 41 | INSTANTIATION: dot_product 42 | BN: 43 | USE_PRECISE_STATS: True 44 | NUM_BATCHES_PRECISE: 200 45 | NORM_TYPE: sync_batchnorm 46 | NUM_SYNC_DEVICES: 4 47 | SOLVER: 48 | BASE_LR: 0.03 49 | LR_POLICY: steps_with_relative_lrs 50 | LRS: [1, 0.1, 0.01, 0.001, 0.0001, 0.00001] 51 | STEPS: [0, 14, 18] 52 | MAX_EPOCH: 22 53 | MOMENTUM: 0.9 54 | WEIGHT_DECAY: 1e-6 55 | WARMUP_EPOCHS: 0.19 56 | WARMUP_START_LR: 0.0001 57 | OPTIMIZING_METHOD: sgd 58 | MODEL: 59 | NUM_CLASSES: 174 60 | ARCH: slowfast 61 | LOSS_FUNC: cross_entropy 62 | DROPOUT_RATE: 0.5 63 | TEST: 64 | ENABLE: True 65 | DATASET: ssv2 66 | BATCH_SIZE: 16 67 | NUM_ENSEMBLE_VIEWS: 1 68 | NUM_SPATIAL_CROPS: 1 69 | DATA_LOADER: 70 | NUM_WORKERS: 4 71 | PIN_MEMORY: True 72 | NUM_GPUS: 8 73 | NUM_SHARDS: 1 74 | RNG_SEED: 0 75 | OUTPUT_DIR: . 76 | LOG_MODEL_INFO: False 77 | -------------------------------------------------------------------------------- /demo/AVA/SLOWFAST_32x2_R101_50_50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: ava 4 | BATCH_SIZE: 16 5 | EVAL_PERIOD: 1 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | CHECKPOINT_FILE_PATH: ./SLOWFAST_32x2_R101_50_50.pkl # path to pretrained model 9 | CHECKPOINT_TYPE: pytorch 10 | DATA: 11 | NUM_FRAMES: 32 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | DETECTION: 18 | ENABLE: True 19 | ALIGNED: False 20 | AVA: 21 | BGR: False 22 | DETECTION_SCORE_THRESH: 0.8 23 | TEST_PREDICT_BOX_LISTS: ["person_box_67091280_iou90/ava_detection_val_boxes_and_labels.csv"] 24 | SLOWFAST: 25 | ALPHA: 4 26 | BETA_INV: 8 27 | FUSION_CONV_CHANNEL_RATIO: 2 28 | FUSION_KERNEL_SZ: 5 29 | RESNET: 30 | ZERO_INIT_FINAL_BN: True 31 | WIDTH_PER_GROUP: 64 32 | NUM_GROUPS: 1 33 | DEPTH: 101 34 | TRANS_FUNC: bottleneck_transform 35 | STRIDE_1X1: False 36 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 37 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [2, 2]] 38 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [1, 1]] 39 | NONLOCAL: 40 | LOCATION: [[[], []], [[], []], [[6, 13, 20], []], [[], []]] 41 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 42 | INSTANTIATION: dot_product 43 | POOL: [[[2, 2, 2], [2, 2, 2]], [[2, 2, 2], [2, 2, 2]], [[2, 2, 2], [2, 2, 2]], [[2, 2, 2], [2, 2, 2]]] 44 | BN: 45 | USE_PRECISE_STATS: False 46 | NUM_BATCHES_PRECISE: 200 47 | SOLVER: 48 | MOMENTUM: 0.9 49 | WEIGHT_DECAY: 1e-7 50 | OPTIMIZING_METHOD: sgd 51 | MODEL: 52 | NUM_CLASSES: 80 53 | ARCH: slowfast 54 | MODEL_NAME: SlowFast 55 | LOSS_FUNC: bce 56 | DROPOUT_RATE: 0.5 57 | HEAD_ACT: sigmoid 58 | TEST: 59 | ENABLE: False 60 |
DATASET: ava 61 | BATCH_SIZE: 8 62 | DATA_LOADER: 63 | NUM_WORKERS: 2 64 | PIN_MEMORY: True 65 | 66 | NUM_GPUS: 1 67 | NUM_SHARDS: 1 68 | RNG_SEED: 0 69 | OUTPUT_DIR: . 70 | TENSORBOARD: 71 | MODEL_VIS: 72 | TOPK: 2 73 | DEMO: 74 | ENABLE: True 75 | LABEL_FILE_PATH: # Add local label file path here. 76 | WEBCAM: 0 77 | DETECTRON2_CFG: "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml" 78 | DETECTRON2_WEIGHTS: detectron2://COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/model_final_280758.pkl 79 | -------------------------------------------------------------------------------- /demo/Kinetics/SLOWFAST_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 1 7 | AUTO_RESUME: True 8 | CHECKPOINT_FILE_PATH: "./SLOWFAST_8x8_R50.pkl" # path to pretrained model to run the demo 9 | CHECKPOINT_TYPE: caffe2 10 | DATA: 11 | NUM_FRAMES: 32 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | SLOWFAST: 18 | ALPHA: 4 19 | BETA_INV: 8 20 | FUSION_CONV_CHANNEL_RATIO: 2 21 | FUSION_KERNEL_SZ: 7 22 | RESNET: 23 | ZERO_INIT_FINAL_BN: True 24 | WIDTH_PER_GROUP: 64 25 | NUM_GROUPS: 1 26 | DEPTH: 50 27 | TRANS_FUNC: bottleneck_transform 28 | STRIDE_1X1: False 29 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 30 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 31 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 32 | NONLOCAL: 33 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 34 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 35 | INSTANTIATION: dot_product 36 | BN: 37 | USE_PRECISE_STATS: True 38 | NUM_BATCHES_PRECISE: 200 39 | SOLVER: 40 | BASE_LR: 0.1 41 | LR_POLICY: cosine 42 | MAX_EPOCH: 196 43 | MOMENTUM: 0.9 44 | WEIGHT_DECAY: 1e-4 45 | WARMUP_EPOCHS: 34 46 | WARMUP_START_LR: 0.01 47 | OPTIMIZING_METHOD: sgd 48 | MODEL: 49 | NUM_CLASSES: 400 50 | ARCH: slowfast 51 | LOSS_FUNC: cross_entropy 52 | DROPOUT_RATE: 0.5 53 | TEST: 54 | ENABLE: False 55 | DATASET: kinetics 56 | BATCH_SIZE: 64 57 | DATA_LOADER: 58 | NUM_WORKERS: 8 59 | PIN_MEMORY: True 60 | DEMO: 61 | ENABLE: True 62 | LABEL_FILE_PATH: # Add local label file path here. 63 | WEBCAM: 0 64 | NUM_GPUS: 1 65 | NUM_SHARDS: 1 66 | RNG_SEED: 0 67 | OUTPUT_DIR: . 68 | -------------------------------------------------------------------------------- /demo/ava_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fanyix/SlowFast/629fd1bf00e2d3b320b6e46c652331819fe9d4e7/demo/ava_demo.gif -------------------------------------------------------------------------------- /linter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | # Run this script at project root by "./linter.sh" before you commit. 4 | echo "Running isort..." 5 | isort -y -sp . 6 | 7 | echo "Running black..." 8 | black -l 80 . 9 | 10 | echo "Running flake..." 11 | flake8 . 12 | 13 | command -v arc > /dev/null && { 14 | echo "Running arc lint ..."
15 | arc lint 16 | } 17 | -------------------------------------------------------------------------------- /projects/avslowfast/README.md: -------------------------------------------------------------------------------- 1 | # Getting Started with PyAVSlowFast 2 | 3 | This section supplements the original doc in PySlowFast (attached below) and provides instructions on how to start training an AVSlowFast model with this codebase. 4 | 5 | First, note that `DATA.PATH_TO_DATA_DIR` points to the directory where the annotation csv files reside and `DATA.PATH_PREFIX` to the root of the data directory. 6 | 7 | Then, issue the following training command: 8 | ``` 9 | python tools/run_net.py \ 10 | --cfg configs/Kinetics/AVSLOWFAST_4x16_R50.yaml \ 11 | DATA.PATH_TO_DATA_DIR path_to_your_annotation \ 12 | DATA.PATH_PREFIX path_to_your_dataset_root \ 13 | NUM_GPUS 8 \ 14 | DATA_LOADER.NUM_WORKERS 8 \ 15 | TRAIN.BATCH_SIZE 64 16 | ``` 17 | 18 | For testing, run the following: 19 | ``` 20 | python tools/run_net.py \ 21 | --cfg configs/Kinetics/AVSLOWFAST_4x16_R50.yaml \ 22 | DATA.PATH_TO_DATA_DIR path_to_your_annotation \ 23 | DATA.PATH_PREFIX path_to_your_dataset_root \ 24 | TEST.BATCH_SIZE 32 \ 25 | TEST.CHECKPOINT_FILE_PATH path_to_your_checkpoint \ 26 | TRAIN.ENABLE False 27 | ``` 28 | 29 | ## Citing AVSlowFast 30 | Please cite AVSlowFast if you use it in your research; you can use the following BibTeX entry. 31 | ```BibTeX 32 | @article{xiao-avslowfast2020, 33 | author = {Xiao, Fanyi and Lee, Yong Jae and Grauman, Kristen and Malik, Jitendra and Feichtenhofer, Christoph}, 34 | title = {{Audiovisual SlowFast Networks for Video Recognition}}, 35 | journal = {arXiv preprint arXiv:2001.08740}, 36 | year = {2020}} 37 | ``` 38 | -------------------------------------------------------------------------------- /projects/multigrid/README.md: -------------------------------------------------------------------------------- 1 | # A Multigrid Method for Efficiently Training Video Models 2 | [Chao-Yuan Wu](https://www.cs.utexas.edu/~cywu/), 3 | [Ross Girshick](http://rossgirshick.info), 4 | [Kaiming He](http://kaiminghe.com), 5 | [Christoph Feichtenhofer](http://feichtenhofer.github.io/), 6 | [Philipp Krähenbühl](http://www.philkr.net/) 7 |
8 | In CVPR, 2020. [[Paper](https://arxiv.org/abs/1912.00998)] 9 |
10 | ![multigrid](multigrid.png) 11 | 12 | 13 |
14 | 15 | 16 | ## Getting started 17 | To enable multigrid training, add `MULTIGRID.LONG_CYCLE True` and/or `MULTIGRID.SHORT_CYCLE True` when training your model. (Default multigrid training uses both long and short cycles; see the [paper](https://arxiv.org/abs/1912.00998) for details.) For example, 18 | 19 | ``` 20 | python tools/run_net.py \ 21 | --cfg configs/Charades/SLOWFAST_16x8_R50.yaml \ 22 | DATA.PATH_TO_DATA_DIR path_to_your_dataset \ 23 | MULTIGRID.LONG_CYCLE True \ 24 | MULTIGRID.SHORT_CYCLE True 25 | ``` 26 | This should train several times faster than training *without* multigrid. 27 | Note that multigrid training can incur higher IO overhead, 28 | so systems with faster IO (e.g., an efficient local disk) may see a larger speedup. 29 | Please see [MODEL_ZOO.md](https://github.com/facebookresearch/SlowFast/blob/master/MODEL_ZOO.md) for more examples of multigrid training. 30 | 31 | ## Citing Multigrid Training 32 | If you use multigrid training or the models from MODEL_ZOO in your research, please use the following BibTeX entry. 33 | ```BibTeX 34 | @inproceedings{multigrid2020, 35 | Author = {Chao-Yuan Wu and Ross Girshick and Kaiming He and Christoph Feichtenhofer 36 | and Philipp Kr\"{a}henb\"{u}hl}, 37 | Title = {{A Multigrid Method for Efficiently Training Video Models}}, 38 | Booktitle = {{CVPR}}, 39 | Year = {2020}} 40 | ``` 41 | -------------------------------------------------------------------------------- /projects/multigrid/multigrid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fanyix/SlowFast/629fd1bf00e2d3b320b6e46c652331819fe9d4e7/projects/multigrid/multigrid.png -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=4 4 | known_standard_library=numpy,setuptools 5 | known_myself=slowfast 6 | known_third_party=fvcore,av,torch,pycocotools,yacs,termcolor,scipy,simplejson,matplotlib,detectron2,torchvision,yaml,tqdm,psutil,opencv-python,pandas,tensorboard,moviepy 7 | no_lines_before=STDLIB,THIRDPARTY 8 | sections=FUTURE,STDLIB,THIRDPARTY,myself,FIRSTPARTY,LOCALFOLDER 9 | default_section=FIRSTPARTY 10 | 11 | [mypy] 12 | python_version=3.6 13 | ignore_missing_imports = True 14 | warn_unused_configs = True 15 | disallow_untyped_defs = True 16 | check_untyped_defs = True 17 | warn_unused_ignores = True 18 | warn_redundant_casts = True 19 | show_column_numbers = True 20 | follow_imports = silent 21 | allow_redefinition = True 22 | ; Require all functions to be annotated 23 | disallow_incomplete_defs = True 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 | 4 | from setuptools import find_packages, setup 5 | 6 | setup( 7 | name="slowfast", 8 | version="1.0", 9 | author="FAIR", 10 | url="unknown", 11 | description="SlowFast Video Understanding", 12 | install_requires=[ 13 | "yacs>=0.1.6", 14 | "pyyaml>=5.1", 15 | "av", 16 | "matplotlib", 17 | "termcolor>=1.1", 18 | "simplejson", 19 | "tqdm", 20 | "psutil", 21 | "detectron2", 22 | "opencv-python", 23 | "pandas", 24 | "torchvision>=0.4.2", 25 | "sklearn", 26 | ], 27 | extras_require={"tensorboard_video_visualization": ["moviepy"]}, 28 | packages=find_packages(exclude=("configs", "tests")), 29 | ) 30 | -------------------------------------------------------------------------------- /slowfast/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from slowfast.utils.env import setup_environment 5 | 6 | setup_environment() 7 | -------------------------------------------------------------------------------- /slowfast/config/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /slowfast/config/custom_config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Add custom configs and default values""" 5 | 6 | 7 | def add_custom_config(_C): 8 | # Add your own customized configs. 9 | pass 10 | -------------------------------------------------------------------------------- /slowfast/datasets/DATASET.md: -------------------------------------------------------------------------------- 1 | # Dataset Preparation 2 | 3 | ## Kinetics 4 | 5 | The Kinetics Dataset can be downloaded via the code released by ActivityNet: 6 | 7 | 1. Download the videos via the official [scripts](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). 8 | 9 | 2. After all the videos have been downloaded, resize them so that the short edge is 256 pixels, then prepare the csv files for the training, validation, and testing sets as `train.csv`, `val.csv`, `test.csv`. The format of the csv file is: 10 | 11 | ``` 12 | path_to_video_1 label_1 13 | path_to_video_2 label_2 14 | path_to_video_3 label_3 15 | ... 16 | path_to_video_N label_N 17 | ``` 18 | 19 | All the Kinetics models in the Model Zoo are trained and tested with the same data as [Non-local Network](https://github.com/facebookresearch/video-nonlocal-net/blob/master/DATASET.md). For dataset-specific issues, please reach out to the [dataset provider](https://deepmind.com/research/open-source/kinetics). 20 | 21 | ## AVA 22 | 23 | The AVA Dataset can be downloaded from the [official site](https://research.google.com/ava/download.html#ava_actions_download). 24 | 25 | We follow the same [downloading and preprocessing procedure](https://github.com/facebookresearch/video-long-term-feature-banks/blob/master/DATASET.md) as [Long-Term Feature Banks for Detailed Video Understanding](https://arxiv.org/abs/1812.05038). 26 | 27 | You can follow these steps to download and preprocess the data: 28 | 29 | 1. Download videos 30 | 31 | ``` 32 | DATA_DIR="../../data/ava/videos" 33 | 34 | if [[ !
-d "${DATA_DIR}" ]]; then 35 | echo "${DATA_DIR} doesn't exist. Creating it."; 36 | mkdir -p ${DATA_DIR} 37 | fi 38 | 39 | wget https://s3.amazonaws.com/ava-dataset/annotations/ava_file_names_trainval_v2.1.txt 40 | 41 | for line in $(cat ava_file_names_trainval_v2.1.txt) 42 | do 43 | wget https://s3.amazonaws.com/ava-dataset/trainval/$line -P ${DATA_DIR} 44 | done 45 | ``` 46 | 47 | 2. Cut each video from its 15th to 30th minute 48 | 49 | ``` 50 | IN_DATA_DIR="../../data/ava/videos" 51 | OUT_DATA_DIR="../../data/ava/videos_15min" 52 | 53 | if [[ ! -d "${OUT_DATA_DIR}" ]]; then 54 | echo "${OUT_DATA_DIR} doesn't exist. Creating it."; 55 | mkdir -p ${OUT_DATA_DIR} 56 | fi 57 | 58 | for video in $(ls -A1 -U ${IN_DATA_DIR}/*) 59 | do 60 | out_name="${OUT_DATA_DIR}/${video##*/}" 61 | if [ ! -f "${out_name}" ]; then 62 | ffmpeg -ss 900 -t 901 -i "${video}" "${out_name}" 63 | fi 64 | done 65 | ``` 66 | 67 | 3. Extract frames 68 | 69 | ``` 70 | IN_DATA_DIR="../../data/ava/videos_15min" 71 | OUT_DATA_DIR="../../data/ava/frames" 72 | 73 | if [[ ! -d "${OUT_DATA_DIR}" ]]; then 74 | echo "${OUT_DATA_DIR} doesn't exist. Creating it."; 75 | mkdir -p ${OUT_DATA_DIR} 76 | fi 77 | 78 | for video in $(ls -A1 -U ${IN_DATA_DIR}/*) 79 | do 80 | video_name=${video##*/} 81 | 82 | if [[ $video_name = *".webm" ]]; then 83 | video_name=${video_name::-5} 84 | else 85 | video_name=${video_name::-4} 86 | fi 87 | 88 | out_video_dir=${OUT_DATA_DIR}/${video_name}/ 89 | mkdir -p "${out_video_dir}" 90 | 91 | out_name="${out_video_dir}/${video_name}_%06d.jpg" 92 | 93 | ffmpeg -i "${video}" -r 30 -q:v 1 "${out_name}" 94 | done 95 | ``` 96 | 97 | 4. Download annotations 98 | 99 | ``` 100 | DATA_DIR="../../data/ava/annotations" 101 | 102 | if [[ ! -d "${DATA_DIR}" ]]; then 103 | echo "${DATA_DIR} doesn't exist. Creating it."; 104 | mkdir -p ${DATA_DIR} 105 | fi 106 | 107 | wget https://research.google.com/ava/download/ava_train_v2.1.csv -P ${DATA_DIR} 108 | wget https://research.google.com/ava/download/ava_val_v2.1.csv -P ${DATA_DIR} 109 | wget https://research.google.com/ava/download/ava_action_list_v2.1_for_activitynet_2018.pbtxt -P ${DATA_DIR} 110 | wget https://research.google.com/ava/download/ava_train_excluded_timestamps_v2.1.csv -P ${DATA_DIR} 111 | wget https://research.google.com/ava/download/ava_val_excluded_timestamps_v2.1.csv -P ${DATA_DIR} 112 | ``` 113 | 114 | 5. Download "frame lists" ([train](https://dl.fbaipublicfiles.com/video-long-term-feature-banks/data/ava/frame_lists/train.csv), [val](https://dl.fbaipublicfiles.com/video-long-term-feature-banks/data/ava/frame_lists/val.csv)) and put them in 115 | the `frame_lists` folder (see structure above). 116 | 117 | 6. Download person boxes ([train](https://dl.fbaipublicfiles.com/video-long-term-feature-banks/data/ava/annotations/ava_train_predicted_boxes.csv), [val](https://dl.fbaipublicfiles.com/video-long-term-feature-banks/data/ava/annotations/ava_val_predicted_boxes.csv), [test](https://dl.fbaipublicfiles.com/video-long-term-feature-banks/data/ava/annotations/ava_test_predicted_boxes.csv)) and put them in the `annotations` folder (see structure above). 118 | If you prefer to use your own person detector, please see details 119 | in [here](https://github.com/facebookresearch/video-long-term-feature-banks/blob/master/GETTING_STARTED.md#ava-person-detector). 
120 | 121 | 122 | After these steps, the ava data directory should have the following structure: 123 | 124 | ``` 125 | ava 126 | |_ frames 127 | | |_ [video name 0] 128 | | | |_ [video name 0]_000001.jpg 129 | | | |_ [video name 0]_000002.jpg 130 | | | |_ ... 131 | | |_ [video name 1] 132 | | |_ [video name 1]_000001.jpg 133 | | |_ [video name 1]_000002.jpg 134 | | |_ ... 135 | |_ frame_lists 136 | | |_ train.csv 137 | | |_ val.csv 138 | |_ annotations 139 | |_ [official AVA annotation files] 140 | |_ ava_train_predicted_boxes.csv 141 | |_ ava_val_predicted_boxes.csv 142 | ``` 143 | 144 | You can also replace `v2.1` with `v2.2` if you need the AVA v2.2 annotations. You can also download some pre-prepared annotations from [here](https://dl.fbaipublicfiles.com/pyslowfast/annotation/ava/ava_annotations.tar). 145 | 146 | 147 | ## Charades 148 | 1. Please download the Charades RGB frames from the [dataset provider](http://ai2-website.s3.amazonaws.com/data/Charades_v1_rgb.tar). 149 | 150 | 2. Download the *frame list* from the following links: ([train](https://dl.fbaipublicfiles.com/pyslowfast/dataset/charades/frame_lists/train.csv), [val](https://dl.fbaipublicfiles.com/pyslowfast/dataset/charades/frame_lists/val.csv)). 151 | 152 | Please set `DATA.PATH_TO_DATA_DIR` to point to the folder containing the frame lists, and `DATA.PATH_PREFIX` to the folder containing the RGB frames. 153 | 154 | 155 | ## Something-Something V2 156 | 1. Please download the dataset and annotations from the [dataset provider](https://20bn.com/datasets/something-something). 157 | 158 | 2. Download the *frame list* from the following links: ([train](https://dl.fbaipublicfiles.com/pyslowfast/dataset/ssv2/frame_lists/train.csv), [val](https://dl.fbaipublicfiles.com/pyslowfast/dataset/ssv2/frame_lists/val.csv)). 159 | 160 | 3. Extract the frames at 30 FPS. (We used ffmpeg-4.1.3 with the command 161 | `ffmpeg -i "${video}" -r 30 -q:v 1 "${out_name}"` 162 | in our experiments.) Please put the frames in a structure consistent with the frame lists. 163 | 164 | 165 | Please put all annotation json files and the frame lists in the same folder, and set `DATA.PATH_TO_DATA_DIR` to that path. Set `DATA.PATH_PREFIX` to the path of the folder containing the extracted frames. 166 | -------------------------------------------------------------------------------- /slowfast/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from .ava_dataset import Ava # noqa 5 | from .build import DATASET_REGISTRY, build_dataset # noqa 6 | from .charades import Charades # noqa 7 | from .kinetics import Kinetics # noqa 8 | from .ssv2 import Ssv2 # noqa 9 | -------------------------------------------------------------------------------- /slowfast/datasets/build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from fvcore.common.registry import Registry 5 | 6 | DATASET_REGISTRY = Registry("DATASET") 7 | DATASET_REGISTRY.__doc__ = """ 8 | Registry for dataset. 9 | 10 | The registered object will be called with `obj(cfg, split)`. 11 | The call should return a `torch.utils.data.Dataset` object. 12 | """ 13 | 14 | 15 | def build_dataset(dataset_name, cfg, split): 16 | """ 17 | Build a dataset, defined by `dataset_name`. 18 | Args: 19 | dataset_name (str): the name of the dataset to be constructed.
20 | cfg (CfgNode): configs. Details can be found in 21 | slowfast/config/defaults.py 22 | split (str): the split of the data loader. Options include `train`, 23 | `val`, and `test`. 24 | Returns: 25 | Dataset: a constructed dataset specified by dataset_name. 26 | """ 27 | # Capitalize the first letter of dataset_name, since the dataset_name 28 | # in configs may be in lowercase but the name of a dataset class should always 29 | # start with an uppercase letter. 30 | name = dataset_name.capitalize() 31 | return DATASET_REGISTRY.get(name)(cfg, split) 32 | -------------------------------------------------------------------------------- /slowfast/datasets/loader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Data loader.""" 5 | 6 | import itertools 7 | import numpy as np 8 | import torch 9 | from torch.utils.data._utils.collate import default_collate 10 | from torch.utils.data.distributed import DistributedSampler 11 | from torch.utils.data.sampler import RandomSampler 12 | 13 | from slowfast.datasets.multigrid_helper import ShortCycleBatchSampler 14 | 15 | from .build import build_dataset 16 | 17 | 18 | def detection_collate(batch): 19 | """ 20 | Collate function for the detection task. Concatenate bboxes, labels and 21 | metadata from different samples in the first dimension instead of 22 | stacking them to have a batch-size dimension. 23 | Args: 24 | batch (tuple or list): data batch to collate. 25 | Returns: 26 | (tuple): collated detection data batch. 27 | """ 28 | inputs, labels, video_idx, extra_data = zip(*batch) 29 | inputs, video_idx = default_collate(inputs), default_collate(video_idx) 30 | labels = torch.tensor(np.concatenate(labels, axis=0)).float() 31 | 32 | collated_extra_data = {} 33 | for key in extra_data[0].keys(): 34 | data = [d[key] for d in extra_data] 35 | if key == "boxes" or key == "ori_boxes": 36 | # Append idx info to the bboxes before concatenating them. 37 | bboxes = [ 38 | np.concatenate( 39 | [np.full((data[i].shape[0], 1), float(i)), data[i]], axis=1 40 | ) 41 | for i in range(len(data)) 42 | ] 43 | bboxes = np.concatenate(bboxes, axis=0) 44 | collated_extra_data[key] = torch.tensor(bboxes).float() 45 | elif key == "metadata": 46 | collated_extra_data[key] = torch.tensor( 47 | list(itertools.chain(*data)) 48 | ).view(-1, 2) 49 | else: 50 | collated_extra_data[key] = default_collate(data) 51 | 52 | return inputs, labels, video_idx, collated_extra_data 53 | 54 | 55 | def shuffle_misaligned_audio(epoch, inputs, cfg): 56 | """ 57 | Shuffle the misaligned (negative) input audio clips 58 | so that positive/negative pairs are created 59 | from different videos. 60 | 61 | Args: 62 | epoch (int): the current epoch number. 63 | inputs (list of tensors): inputs to model, 64 | inputs[2] corresponds to audio inputs. 65 | cfg (CfgNode): configs. Details can be found in 66 | slowfast/config/defaults.py 67 | """ 68 | 69 | if len(inputs) > 2 and cfg.DATA.GET_MISALIGNED_AUDIO: 70 | N = inputs[2].size(0) 71 | # Leave only "hard negatives" after 72 | # cfg.DATA.MIX_NEG_EPOCH epochs 73 | SN = max(int(cfg.DATA.EASY_NEG_RATIO * N), 1) if \ 74 | epoch >= cfg.DATA.MIX_NEG_EPOCH else N 75 | with torch.no_grad(): 76 | idx = torch.arange(N) 77 | idx[:SN] = torch.arange(1, SN+1) % SN 78 | inputs[2][:, 1, ...] = inputs[2][idx, 1, ...]
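# (Illustrative note, added) With N = 4 and SN = 4, idx = [1, 2, 3, 0]: the
# misaligned audio (index 1 along dim 1) of clip i is replaced by that of
# clip (i + 1) % 4, so every negative pair now mixes two different videos.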
79 | return inputs 80 | 81 | 82 | def construct_loader(cfg, split, is_precise_bn=False): 83 | """ 84 | Constructs the data loader for the given dataset. 85 | Args: 86 | cfg (CfgNode): configs. Details can be found in 87 | slowfast/config/defaults.py 88 | split (str): the split of the data loader. Options include `train`, 89 | `val`, and `test`. 90 | """ 91 | assert split in ["train", "val", "test"] 92 | if split in ["train"]: 93 | dataset_name = cfg.TRAIN.DATASET 94 | batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 95 | shuffle = True 96 | drop_last = True 97 | elif split in ["val"]: 98 | dataset_name = cfg.TRAIN.DATASET 99 | batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 100 | shuffle = False 101 | drop_last = False 102 | elif split in ["test"]: 103 | dataset_name = cfg.TEST.DATASET 104 | batch_size = int(cfg.TEST.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 105 | shuffle = False 106 | drop_last = False 107 | 108 | # Construct the dataset 109 | dataset = build_dataset(dataset_name, cfg, split) 110 | 111 | if cfg.MULTIGRID.SHORT_CYCLE and split in ["train"] and not is_precise_bn: 112 | # Create a sampler for multi-process training 113 | sampler = ( 114 | DistributedSampler(dataset) 115 | if cfg.NUM_GPUS > 1 116 | else RandomSampler(dataset) 117 | ) 118 | batch_sampler = ShortCycleBatchSampler( 119 | sampler, batch_size=batch_size, drop_last=drop_last, cfg=cfg 120 | ) 121 | # Create a loader 122 | loader = torch.utils.data.DataLoader( 123 | dataset, 124 | batch_sampler=batch_sampler, 125 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 126 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 127 | ) 128 | else: 129 | # Create a sampler for multi-process training 130 | sampler = DistributedSampler(dataset) if cfg.NUM_GPUS > 1 else None 131 | # Create a loader 132 | loader = torch.utils.data.DataLoader( 133 | dataset, 134 | batch_size=batch_size, 135 | shuffle=(False if sampler else shuffle), 136 | sampler=sampler, 137 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 138 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 139 | drop_last=drop_last, 140 | collate_fn=detection_collate if cfg.DETECTION.ENABLE else None, 141 | ) 142 | return loader 143 | 144 | 145 | def shuffle_dataset(loader, cur_epoch): 146 | """ 147 | Shuffles the data. 148 | Args: 149 | loader (loader): the data loader to shuffle. 150 | cur_epoch (int): number of the current epoch. 151 | """ 152 | sampler = ( 153 | loader.batch_sampler.sampler 154 | if isinstance(loader.batch_sampler, ShortCycleBatchSampler) 155 | else loader.sampler 156 | ) 157 | assert isinstance( 158 | sampler, (RandomSampler, DistributedSampler) 159 | ), "Sampler type '{}' not supported".format(type(sampler)) 160 | # RandomSampler handles shuffling automatically 161 | if isinstance(sampler, DistributedSampler): 162 | # DistributedSampler shuffles data based on epoch 163 | sampler.set_epoch(cur_epoch) 164 | -------------------------------------------------------------------------------- /slowfast/datasets/multigrid_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Helper functions for multigrid training.""" 5 | 6 | import numpy as np 7 | from torch._six import int_classes as _int_classes 8 | from torch.utils.data.sampler import Sampler 9 | 10 | 11 | class ShortCycleBatchSampler(Sampler): 12 | """ 13 | Extend Sampler to support "short cycle" sampling.
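Consecutive batches rotate through three spatial shapes, and the batch size
for each shape is rescaled in __init__ below so that the total number of
pixels per batch stays roughly constant. For example, with short-cycle
factors of 1/2 and 1/sqrt(2) (the paper's defaults), batches cycle through
roughly 4x, 2x, and 1x the base batch size.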
14 | See paper "A Multigrid Method for Efficiently Training Video Models", 15 | Wu et al., 2019 (https://arxiv.org/abs/1912.00998) for details. 16 | """ 17 | 18 | def __init__(self, sampler, batch_size, drop_last, cfg): 19 | if not isinstance(sampler, Sampler): 20 | raise ValueError( 21 | "sampler should be an instance of " 22 | "torch.utils.data.Sampler, but got sampler={}".format(sampler) 23 | ) 24 | if ( 25 | not isinstance(batch_size, _int_classes) 26 | or isinstance(batch_size, bool) 27 | or batch_size <= 0 28 | ): 29 | raise ValueError( 30 | "batch_size should be a positive integer value, " 31 | "but got batch_size={}".format(batch_size) 32 | ) 33 | if not isinstance(drop_last, bool): 34 | raise ValueError( 35 | "drop_last should be a boolean value, but got " 36 | "drop_last={}".format(drop_last) 37 | ) 38 | self.sampler = sampler 39 | self.drop_last = drop_last 40 | 41 | bs_factor = [ 42 | int( 43 | round( 44 | ( 45 | float(cfg.DATA.TRAIN_CROP_SIZE) 46 | / (s * cfg.MULTIGRID.DEFAULT_S) 47 | ) 48 | ** 2 49 | ) 50 | ) 51 | for s in cfg.MULTIGRID.SHORT_CYCLE_FACTORS 52 | ] 53 | 54 | self.batch_sizes = [ 55 | batch_size * bs_factor[0], 56 | batch_size * bs_factor[1], 57 | batch_size, 58 | ] 59 | 60 | def __iter__(self): 61 | counter = 0 62 | batch_size = self.batch_sizes[0] 63 | batch = [] 64 | for idx in self.sampler: 65 | batch.append((idx, counter % 3)) 66 | if len(batch) == batch_size: 67 | yield batch 68 | counter += 1 69 | batch_size = self.batch_sizes[counter % 3] 70 | batch = [] 71 | if len(batch) > 0 and not self.drop_last: 72 | yield batch 73 | 74 | def __len__(self): 75 | avg_batch_size = sum(self.batch_sizes) / 3.0 76 | if self.drop_last: 77 | return int(np.floor(len(self.sampler) / avg_batch_size)) 78 | else: 79 | return int(np.ceil(len(self.sampler) / avg_batch_size)) 80 | -------------------------------------------------------------------------------- /slowfast/datasets/video_container.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | import av 5 | 6 | 7 | def get_video_container(path_to_vid, multi_thread_decode=False, backend="pyav"): 8 | """ 9 | Given the path to the video, return the pyav video container. 10 | Args: 11 | path_to_vid (str): path to the video. 12 | multi_thread_decode (bool): if True, perform multi-thread decoding. 13 | backend (str): decoder backend, options include `pyav` and 14 | `torchvision`, default is `pyav`. 15 | Returns: 16 | container (container): video container. 17 | """ 18 | if backend == "torchvision": 19 | with open(path_to_vid, "rb") as fp: 20 | container = fp.read() 21 | return container 22 | elif backend == "pyav": 23 | container = av.open(path_to_vid) 24 | if multi_thread_decode: 25 | # Enable multiple threads for decoding. 26 | container.streams.video[0].thread_type = "AUTO" 27 | return container 28 | else: 29 | raise NotImplementedError("Unknown backend {}".format(backend)) 30 | -------------------------------------------------------------------------------- /slowfast/models/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
3 | 4 | from .build import MODEL_REGISTRY, build_model # noqa 5 | from .custom_video_model_builder import * # noqa 6 | from .video_model_builder import ResNet, SlowFast # noqa 7 | -------------------------------------------------------------------------------- /slowfast/models/build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Model construction functions.""" 5 | 6 | import torch 7 | from fvcore.common.registry import Registry 8 | 9 | MODEL_REGISTRY = Registry("MODEL") 10 | MODEL_REGISTRY.__doc__ = """ 11 | Registry for video model. 12 | 13 | The registered object will be called with `obj(cfg)`. 14 | The call should return a `torch.nn.Module` object. 15 | """ 16 | 17 | 18 | def build_model(cfg, gpu_id=None):  19 | """ 20 | Builds the video model. 21 | Args: 22 | cfg (configs): configs that contain the hyper-parameters to build the 23 | backbone. Details can be seen in slowfast/config/defaults.py. 24 | gpu_id (Optional[int]): specify the gpu index to build the model on. 25 | """ 26 | if torch.cuda.is_available(): 27 | assert ( 28 | cfg.NUM_GPUS <= torch.cuda.device_count() 29 | ), "Cannot use more GPU devices than available" 30 | else: 31 | assert ( 32 | cfg.NUM_GPUS == 0 33 | ), "CUDA is not available. Please set `NUM_GPUS: 0` to run on CPUs." 34 | 35 | # Construct the model 36 | name = cfg.MODEL.MODEL_NAME 37 | model = MODEL_REGISTRY.get(name)(cfg) 38 | 39 | if cfg.NUM_GPUS: 40 | if gpu_id is None: 41 | # Determine the GPU used by the current process 42 | cur_device = torch.cuda.current_device() 43 | else: 44 | cur_device = gpu_id 45 | # Transfer the model to the current GPU device 46 | model = model.cuda(device=cur_device) 47 | # Use multi-process data parallel model in the multi-gpu setting 48 | if cfg.NUM_GPUS > 1: 49 | # Make model replica operate on the current device 50 | model = torch.nn.parallel.DistributedDataParallel( 51 | module=model, device_ids=[cur_device], output_device=cur_device 52 | ) 53 | return model 54 | -------------------------------------------------------------------------------- /slowfast/models/custom_video_model_builder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | 5 | """A more flexible video model builder.""" 6 | -------------------------------------------------------------------------------- /slowfast/models/losses.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Loss functions.""" 5 | 6 | import torch.nn as nn 7 | 8 | _LOSSES = { 9 | "cross_entropy": nn.CrossEntropyLoss, 10 | "bce": nn.BCELoss, 11 | "bce_logit": nn.BCEWithLogitsLoss, 12 | } 13 | 14 | 15 | def get_loss_func(loss_name): 16 | """ 17 | Retrieve the loss given the loss name. 18 | Args: 19 | loss_name (str): the name of the loss to use. 20 | """ 21 | if loss_name not in _LOSSES.keys(): 22 | raise NotImplementedError("Loss {} is not supported".format(loss_name)) 23 | return _LOSSES[loss_name] 24 | -------------------------------------------------------------------------------- /slowfast/models/nonlocal_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc.
and its affiliates. All Rights Reserved. 3 | 4 | """Non-local helper""" 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class Nonlocal(nn.Module): 11 | """ 12 | Builds Non-local Neural Networks as a generic family of building 13 | blocks for capturing long-range dependencies. Non-local Network 14 | computes the response at a position as a weighted sum of the 15 | features at all positions. This building block can be plugged into 16 | many computer vision architectures. 17 | More details in the paper: https://arxiv.org/pdf/1711.07971.pdf 18 | """ 19 | 20 | def __init__( 21 | self, 22 | dim, 23 | dim_inner, 24 | pool_size=None, 25 | instantiation="softmax", 26 | zero_init_final_conv=False, 27 | zero_init_final_norm=True, 28 | norm_eps=1e-5, 29 | norm_momentum=0.1, 30 | norm_module=nn.BatchNorm3d, 31 | ): 32 | """ 33 | Args: 34 | dim (int): number of input channels. 35 | dim_inner (int): number of channels inside of the Non-local block. 36 | pool_size (list): the kernel sizes of the spatial-temporal pooling, 37 | given as [temporal pool kernel size, spatial pool kernel size, 38 | spatial pool kernel size], in order. By default pool_size is 39 | None, in which case no pooling is used. 40 | instantiation (string): supports two different instantiation methods: 41 | "dot_product": normalizing correlation matrix with L2. 42 | "softmax": normalizing correlation matrix with Softmax. 43 | zero_init_final_conv (bool): If true, zero-initialize the final 44 | convolution of the Non-local block. 45 | zero_init_final_norm (bool): 46 | If true, zero-initialize the final batch norm of the Non-local 47 | block. 48 | norm_module (nn.Module): nn.Module for the normalization layer. The 49 | default is nn.BatchNorm3d. 50 | """ 51 | super(Nonlocal, self).__init__() 52 | self.dim = dim 53 | self.dim_inner = dim_inner 54 | self.pool_size = pool_size 55 | self.instantiation = instantiation 56 | self.use_pool = ( 57 | False 58 | if pool_size is None 59 | else any((size > 1 for size in pool_size)) 60 | ) 61 | self.norm_eps = norm_eps 62 | self.norm_momentum = norm_momentum 63 | self._construct_nonlocal( 64 | zero_init_final_conv, zero_init_final_norm, norm_module 65 | ) 66 | 67 | def _construct_nonlocal( 68 | self, zero_init_final_conv, zero_init_final_norm, norm_module 69 | ): 70 | # Three convolution heads: theta, phi, and g. 71 | self.conv_theta = nn.Conv3d( 72 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 73 | ) 74 | self.conv_phi = nn.Conv3d( 75 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 76 | ) 77 | self.conv_g = nn.Conv3d( 78 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 79 | ) 80 | 81 | # Final convolution output. 82 | self.conv_out = nn.Conv3d( 83 | self.dim_inner, self.dim, kernel_size=1, stride=1, padding=0 84 | ) 85 | # Zero initializing the final convolution output. 86 | self.conv_out.zero_init = zero_init_final_conv 87 | 88 | # TODO: change the name to `norm` 89 | self.bn = norm_module( 90 | num_features=self.dim, 91 | eps=self.norm_eps, 92 | momentum=self.norm_momentum, 93 | ) 94 | # Zero initializing the final bn. 95 | self.bn.transform_final_bn = zero_init_final_norm 96 | 97 | # Optionally add spatial-temporal pooling.
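# (Added note) The pooling below is applied to the input of phi and g in
# forward(), so the affinity matrix shrinks from (TxHxW) x (TxHxW) to
# (TxHxW) x (pooled TxHxW), reducing computation.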
98 | if self.use_pool: 99 | self.pool = nn.MaxPool3d( 100 | kernel_size=self.pool_size, 101 | stride=self.pool_size, 102 | padding=[0, 0, 0], 103 | ) 104 | 105 | def forward(self, x): 106 | x_identity = x 107 | N, C, T, H, W = x.size() 108 | 109 | theta = self.conv_theta(x) 110 | 111 | # Perform temporal-spatial pooling to reduce the computation. 112 | if self.use_pool: 113 | x = self.pool(x) 114 | 115 | phi = self.conv_phi(x) 116 | g = self.conv_g(x) 117 | 118 | theta = theta.view(N, self.dim_inner, -1) 119 | phi = phi.view(N, self.dim_inner, -1) 120 | g = g.view(N, self.dim_inner, -1) 121 | 122 | # (N, C, TxHxW) * (N, C, TxHxW) => (N, TxHxW, TxHxW). 123 | theta_phi = torch.einsum("nct,ncp->ntp", (theta, phi)) 124 | # In the original Non-local paper, there are two main ways to normalize 125 | # the affinity tensor: 126 | # 1) Softmax normalization (norm on exp). 127 | # 2) dot_product normalization. 128 | if self.instantiation == "softmax": 129 | # Normalizing the affinity tensor theta_phi before softmax. 130 | theta_phi = theta_phi * (self.dim_inner ** -0.5) 131 | theta_phi = nn.functional.softmax(theta_phi, dim=2) 132 | elif self.instantiation == "dot_product": 133 | spatial_temporal_dim = theta_phi.shape[2] 134 | theta_phi = theta_phi / spatial_temporal_dim 135 | else: 136 | raise NotImplementedError( 137 | "Unknown norm type {}".format(self.instantiation) 138 | ) 139 | 140 | # (N, TxHxW, TxHxW) * (N, C, TxHxW) => (N, C, TxHxW). 141 | theta_phi_g = torch.einsum("ntg,ncg->nct", (theta_phi, g)) 142 | 143 | # (N, C, TxHxW) => (N, C, T, H, W). 144 | theta_phi_g = theta_phi_g.view(N, self.dim_inner, T, H, W) 145 | 146 | p = self.conv_out(theta_phi_g) 147 | p = self.bn(p) 148 | return x_identity + p 149 | -------------------------------------------------------------------------------- /slowfast/models/optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Optimizer.""" 5 | 6 | import torch 7 | 8 | import slowfast.utils.lr_policy as lr_policy 9 | 10 | 11 | def construct_optimizer(model, cfg): 12 | """ 13 | Construct an SGD or Adam optimizer. Details can be found in: 14 | Herbert Robbins and Sutton Monro, "A stochastic approximation method." 15 | and 16 | Diederik P. Kingma and Jimmy Ba, 17 | "Adam: A Method for Stochastic Optimization." 18 | 19 | Args: 20 | model (model): the model to perform stochastic gradient descent 21 | or Adam optimization on. 22 | cfg (config): configs of hyper-parameters of SGD or Adam, including the 23 | base learning rate, momentum, weight decay, dampening, etc. 24 | """ 25 | # Batchnorm parameters. 26 | bn_params = [] 27 | # Non-batchnorm parameters. 28 | non_bn_parameters = [] 29 | for name, p in model.named_parameters(): 30 | if "bn" in name: 31 | bn_params.append(p) 32 | else: 33 | non_bn_parameters.append(p) 34 | # Apply different weight decay to Batchnorm and non-batchnorm parameters. 35 | # In Caffe2 classification codebase the weight decay for batchnorm is 0.0. 36 | # Having a different weight decay on batchnorm might cause a performance 37 | # drop. 38 | optim_params = [ 39 | {"params": bn_params, "weight_decay": cfg.BN.WEIGHT_DECAY}, 40 | {"params": non_bn_parameters, "weight_decay": cfg.SOLVER.WEIGHT_DECAY}, 41 | ] 42 | # Check that all parameters will be passed into the optimizer.
44 | assert len(list(model.parameters())) == len(non_bn_parameters) + len( 45 | bn_params 46 | ), "parameter size does not match: {} + {} != {}".format( 47 | len(non_bn_parameters), len(bn_params), len(list(model.parameters())) 48 | ) 49 | 50 | if cfg.SOLVER.OPTIMIZING_METHOD == "sgd": 51 | return torch.optim.SGD( 52 | optim_params, 53 | lr=cfg.SOLVER.BASE_LR, 54 | momentum=cfg.SOLVER.MOMENTUM, 55 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 56 | dampening=cfg.SOLVER.DAMPENING, 57 | nesterov=cfg.SOLVER.NESTEROV, 58 | ) 59 | elif cfg.SOLVER.OPTIMIZING_METHOD == "adam": 60 | return torch.optim.Adam( 61 | optim_params, 62 | lr=cfg.SOLVER.BASE_LR, 63 | betas=(0.9, 0.999), 64 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 65 | ) 66 | else: 67 | raise NotImplementedError( 68 | "Does not support {} optimizer".format(cfg.SOLVER.OPTIMIZING_METHOD) 69 | ) 70 | 71 | 72 | def get_epoch_lr(cur_epoch, cfg): 73 | """ 74 | Retrieves the lr for the given epoch (as specified by the lr policy). 75 | Args: 76 | cfg (config): configs including the hyper-parameters of the lr policy 77 | (see slowfast/utils/lr_policy.py). 78 | cur_epoch (float): the epoch number in the current training stage. 79 | """ 80 | return lr_policy.get_lr_at_epoch(cfg, cur_epoch) 81 | 82 | 83 | def set_lr(optimizer, new_lr): 84 | """ 85 | Sets the optimizer lr to the specified value. 86 | Args: 87 | optimizer (optim): the optimizer used to optimize the current network. 88 | new_lr (float): the new learning rate to set. 89 | """ 90 | for param_group in optimizer.param_groups: 91 | param_group["lr"] = new_lr 92 | -------------------------------------------------------------------------------- /slowfast/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/README.md: -------------------------------------------------------------------------------- 1 | The code under this folder is from the official [ActivityNet repo](https://github.com/activitynet/ActivityNet).
2 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fanyix/SlowFast/629fd1bf00e2d3b320b6e46c652331819fe9d4e7/slowfast/utils/ava_evaluation/__init__.py -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/ava_action_list_v2.1_for_activitynet_2018.pbtxt.txt: -------------------------------------------------------------------------------- 1 | item { 2 | name: "bend/bow (at the waist)" 3 | id: 1 4 | } 5 | item { 6 | name: "crouch/kneel" 7 | id: 3 8 | } 9 | item { 10 | name: "dance" 11 | id: 4 12 | } 13 | item { 14 | name: "fall down" 15 | id: 5 16 | } 17 | item { 18 | name: "get up" 19 | id: 6 20 | } 21 | item { 22 | name: "jump/leap" 23 | id: 7 24 | } 25 | item { 26 | name: "lie/sleep" 27 | id: 8 28 | } 29 | item { 30 | name: "martial art" 31 | id: 9 32 | } 33 | item { 34 | name: "run/jog" 35 | id: 10 36 | } 37 | item { 38 | name: "sit" 39 | id: 11 40 | } 41 | item { 42 | name: "stand" 43 | id: 12 44 | } 45 | item { 46 | name: "swim" 47 | id: 13 48 | } 49 | item { 50 | name: "walk" 51 | id: 14 52 | } 53 | item { 54 | name: "answer phone" 55 | id: 15 56 | } 57 | item { 58 | name: "carry/hold (an object)" 59 | id: 17 60 | } 61 | item { 62 | name: "climb (e.g., a mountain)" 63 | id: 20 64 | } 65 | item { 66 | name: "close (e.g., a door, a box)" 67 | id: 22 68 | } 69 | item { 70 | name: "cut" 71 | id: 24 72 | } 73 | item { 74 | name: "dress/put on clothing" 75 | id: 26 76 | } 77 | item { 78 | name: "drink" 79 | id: 27 80 | } 81 | item { 82 | name: "drive (e.g., a car, a truck)" 83 | id: 28 84 | } 85 | item { 86 | name: "eat" 87 | id: 29 88 | } 89 | item { 90 | name: "enter" 91 | id: 30 92 | } 93 | item { 94 | name: "hit (an object)" 95 | id: 34 96 | } 97 | item { 98 | name: "lift/pick up" 99 | id: 36 100 | } 101 | item { 102 | name: "listen (e.g., to music)" 103 | id: 37 104 | } 105 | item { 106 | name: "open (e.g., a window, a car door)" 107 | id: 38 108 | } 109 | item { 110 | name: "play musical instrument" 111 | id: 41 112 | } 113 | item { 114 | name: "point to (an object)" 115 | id: 43 116 | } 117 | item { 118 | name: "pull (an object)" 119 | id: 45 120 | } 121 | item { 122 | name: "push (an object)" 123 | id: 46 124 | } 125 | item { 126 | name: "put down" 127 | id: 47 128 | } 129 | item { 130 | name: "read" 131 | id: 48 132 | } 133 | item { 134 | name: "ride (e.g., a bike, a car, a horse)" 135 | id: 49 136 | } 137 | item { 138 | name: "sail boat" 139 | id: 51 140 | } 141 | item { 142 | name: "shoot" 143 | id: 52 144 | } 145 | item { 146 | name: "smoke" 147 | id: 54 148 | } 149 | item { 150 | name: "take a photo" 151 | id: 56 152 | } 153 | item { 154 | name: "text on/look at a cellphone" 155 | id: 57 156 | } 157 | item { 158 | name: "throw" 159 | id: 58 160 | } 161 | item { 162 | name: "touch (an object)" 163 | id: 59 164 | } 165 | item { 166 | name: "turn (e.g., a screwdriver)" 167 | id: 60 168 | } 169 | item { 170 | name: "watch (e.g., TV)" 171 | id: 61 172 | } 173 | item { 174 | name: "work on a computer" 175 | id: 62 176 | } 177 | item { 178 | name: "write" 179 | id: 63 180 | } 181 | item { 182 | name: "fight/hit (a person)" 183 | id: 64 184 | } 185 | item { 186 | name: "give/serve (an object) to (a person)" 187 | id: 65 188 | } 189 | item { 190 | name: "grab (a person)" 191 | id: 66 192 | } 193 | item { 194 | name: "hand clap" 195 | id: 67 
196 | } 197 | item { 198 | name: "hand shake" 199 | id: 68 200 | } 201 | item { 202 | name: "hand wave" 203 | id: 69 204 | } 205 | item { 206 | name: "hug (a person)" 207 | id: 70 208 | } 209 | item { 210 | name: "kiss (a person)" 211 | id: 72 212 | } 213 | item { 214 | name: "lift (a person)" 215 | id: 73 216 | } 217 | item { 218 | name: "listen to (a person)" 219 | id: 74 220 | } 221 | item { 222 | name: "push (another person)" 223 | id: 76 224 | } 225 | item { 226 | name: "sing to (e.g., self, a person, a group)" 227 | id: 77 228 | } 229 | item { 230 | name: "take (an object) from (a person)" 231 | id: 78 232 | } 233 | item { 234 | name: "talk to (e.g., self, a person, a group)" 235 | id: 79 236 | } 237 | item { 238 | name: "watch (a person)" 239 | id: 80 240 | } 241 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/label_map_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Label map utility functions.""" 16 | 17 | from __future__ import ( 18 | absolute_import, 19 | division, 20 | print_function, 21 | unicode_literals, 22 | ) 23 | import logging 24 | 25 | # from google.protobuf import text_format 26 | # from google3.third_party.tensorflow_models.object_detection.protos import string_int_label_map_pb2 27 | 28 | 29 | def _validate_label_map(label_map): 30 | """Checks if a label map is valid. 31 | 32 | Args: 33 | label_map: StringIntLabelMap to validate. 34 | 35 | Raises: 36 | ValueError: if label map is invalid. 37 | """ 38 | for item in label_map.item: 39 | if item.id < 1: 40 | raise ValueError("Label map ids should be >= 1.") 41 | 42 | 43 | def create_category_index(categories): 44 | """Creates dictionary of COCO compatible categories keyed by category id. 45 | 46 | Args: 47 | categories: a list of dicts, each of which has the following keys: 48 | 'id': (required) an integer id uniquely identifying this category. 49 | 'name': (required) string representing category name 50 | e.g., 'cat', 'dog', 'pizza'. 51 | 52 | Returns: 53 | category_index: a dict containing the same entries as categories, but keyed 54 | by the 'id' field of each category. 55 | """ 56 | category_index = {} 57 | for cat in categories: 58 | category_index[cat["id"]] = cat 59 | return category_index 60 | 61 | 62 | def get_max_label_map_index(label_map): 63 | """Get maximum index in label map. 64 | 65 | Args: 66 | label_map: a StringIntLabelMapProto 67 | 68 | Returns: 69 | an integer 70 | """ 71 | return max([item.id for item in label_map.item]) 72 | 73 | 74 | def convert_label_map_to_categories( 75 | label_map, max_num_classes, use_display_name=True 76 | ): 77 | """Loads label map proto and returns categories list compatible with eval. 
78 | 79 | This function loads a label map and returns a list of dicts, each of which 80 | has the following keys: 81 | 'id': (required) an integer id uniquely identifying this category. 82 | 'name': (required) string representing category name 83 | e.g., 'cat', 'dog', 'pizza'. 84 | We only allow class into the list if its id-label_id_offset is 85 | between 0 (inclusive) and max_num_classes (exclusive). 86 | If there are several items mapping to the same id in the label map, 87 | we will only keep the first one in the categories list. 88 | 89 | Args: 90 | label_map: a StringIntLabelMapProto or None. If None, a default categories 91 | list is created with max_num_classes categories. 92 | max_num_classes: maximum number of (consecutive) label indices to include. 93 | use_display_name: (boolean) choose whether to load 'display_name' field 94 | as category name. If False or if the display_name field does not exist, 95 | uses 'name' field as category names instead. 96 | Returns: 97 | categories: a list of dictionaries representing all possible categories. 98 | """ 99 | categories = [] 100 | list_of_ids_already_added = [] 101 | if not label_map: 102 | label_id_offset = 1 103 | for class_id in range(max_num_classes): 104 | categories.append( 105 | { 106 | "id": class_id + label_id_offset, 107 | "name": "category_{}".format(class_id + label_id_offset), 108 | } 109 | ) 110 | return categories 111 | for item in label_map.item: 112 | if not 0 < item.id <= max_num_classes: 113 | logging.info( 114 | "Ignore item %d since it falls outside of requested " 115 | "label range.", 116 | item.id, 117 | ) 118 | continue 119 | if use_display_name and item.HasField("display_name"): 120 | name = item.display_name 121 | else: 122 | name = item.name 123 | if item.id not in list_of_ids_already_added: 124 | list_of_ids_already_added.append(item.id) 125 | categories.append({"id": item.id, "name": name}) 126 | return categories 127 | 128 | 129 | def load_labelmap(path): 130 | """Loads label map proto. 131 | 132 | Args: 133 | path: path to StringIntLabelMap proto text file. 134 | Returns: 135 | a StringIntLabelMapProto 136 | """ 137 | with open(path, "r") as fid: 138 | label_map_string = fid.read() 139 | label_map = string_int_label_map_pb2.StringIntLabelMap() 140 | try: 141 | text_format.Merge(label_map_string, label_map) 142 | except text_format.ParseError: 143 | label_map.ParseFromString(label_map_string) 144 | _validate_label_map(label_map) 145 | return label_map 146 | 147 | 148 | def get_label_map_dict(label_map_path, use_display_name=False): 149 | """Reads a label map and returns a dictionary of label names to id. 150 | 151 | Args: 152 | label_map_path: path to label_map. 153 | use_display_name: whether to use the label map items' display names as keys. 154 | 155 | Returns: 156 | A dictionary mapping label names to id. 157 | """ 158 | label_map = load_labelmap(label_map_path) 159 | label_map_dict = {} 160 | for item in label_map.item: 161 | if use_display_name: 162 | label_map_dict[item.display_name] = item.id 163 | else: 164 | label_map_dict[item.name] = item.id 165 | return label_map_dict 166 | 167 | 168 | def create_category_index_from_labelmap(label_map_path): 169 | """Reads a label map and returns a category index. 170 | 171 | Args: 172 | label_map_path: Path to `StringIntLabelMap` proto text file. 173 | 174 | Returns: 175 | A category index, which is a dictionary that maps integer ids to dicts 176 | containing categories, e.g. 
177 | {1: {'id': 1, 'name': 'dog'}, 2: {'id': 2, 'name': 'cat'}, ...} 178 | """ 179 | label_map = load_labelmap(label_map_path) 180 | max_num_classes = max(item.id for item in label_map.item) 181 | categories = convert_label_map_to_categories(label_map, max_num_classes) 182 | return create_category_index(categories) 183 | 184 | 185 | def create_class_agnostic_category_index(): 186 | """Creates a category index with a single `object` class.""" 187 | return {1: {"id": 1, "name": "object"}} 188 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Functions for computing metrics like precision, recall, CorLoc and etc.""" 17 | from __future__ import division 18 | import numpy as np 19 | 20 | 21 | def compute_precision_recall(scores, labels, num_gt): 22 | """Compute precision and recall. 23 | 24 | Args: 25 | scores: A float numpy array representing detection score 26 | labels: A boolean numpy array representing true/false positive labels 27 | num_gt: Number of ground truth instances 28 | 29 | Raises: 30 | ValueError: if the input is not of the correct format 31 | 32 | Returns: 33 | precision: Fraction of positive instances over detected ones. This value is 34 | None if no ground truth labels are present. 35 | recall: Fraction of detected positive instance over all positive instances. 36 | This value is None if no ground truth labels are present. 37 | 38 | """ 39 | if ( 40 | not isinstance(labels, np.ndarray) 41 | or labels.dtype != np.bool 42 | or len(labels.shape) != 1 43 | ): 44 | raise ValueError("labels must be single dimension bool numpy array") 45 | 46 | if not isinstance(scores, np.ndarray) or len(scores.shape) != 1: 47 | raise ValueError("scores must be single dimension numpy array") 48 | 49 | if num_gt < np.sum(labels): 50 | raise ValueError( 51 | "Number of true positives must be smaller than num_gt." 
52 | ) 53 | 54 | if len(scores) != len(labels): 55 | raise ValueError("scores and labels must be of the same size.") 56 | 57 | if num_gt == 0: 58 | return None, None 59 | 60 | sorted_indices = np.argsort(scores) 61 | sorted_indices = sorted_indices[::-1] 62 | labels = labels.astype(int) 63 | true_positive_labels = labels[sorted_indices] 64 | false_positive_labels = 1 - true_positive_labels 65 | cum_true_positives = np.cumsum(true_positive_labels) 66 | cum_false_positives = np.cumsum(false_positive_labels) 67 | precision = cum_true_positives.astype(float) / ( 68 | cum_true_positives + cum_false_positives 69 | ) 70 | recall = cum_true_positives.astype(float) / num_gt 71 | return precision, recall 72 | 73 | 74 | def compute_average_precision(precision, recall): 75 | """Compute Average Precision according to the definition in VOCdevkit. 76 | 77 | Precision is modified to ensure that it does not decrease as recall 78 | decreases. 79 | 80 | Args: 81 | precision: A float [N, 1] numpy array of precisions 82 | recall: A float [N, 1] numpy array of recalls 83 | 84 | Raises: 85 | ValueError: if the input is not of the correct format 86 | 87 | Returns: 88 | average_precision: The area under the precision recall curve. NaN if 89 | precision and recall are None. 90 | 91 | """ 92 | if precision is None: 93 | if recall is not None: 94 | raise ValueError("If precision is None, recall must also be None") 95 | return np.NAN 96 | 97 | if not isinstance(precision, np.ndarray) or not isinstance( 98 | recall, np.ndarray 99 | ): 100 | raise ValueError("precision and recall must be numpy array") 101 | if precision.dtype != np.float or recall.dtype != np.float: 102 | raise ValueError("input must be float numpy array.") 103 | if len(precision) != len(recall): 104 | raise ValueError("precision and recall must be of the same size.") 105 | if not precision.size: 106 | return 0.0 107 | if np.amin(precision) < 0 or np.amax(precision) > 1: 108 | raise ValueError("Precision must be in the range of [0, 1].") 109 | if np.amin(recall) < 0 or np.amax(recall) > 1: 110 | raise ValueError("recall must be in the range of [0, 1].") 111 | if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)): 112 | raise ValueError("recall must be a non-decreasing array") 113 | 114 | recall = np.concatenate([[0], recall, [1]]) 115 | precision = np.concatenate([[0], precision, [0]]) 116 | 117 | # Preprocess precision to be a non-decreasing array 118 | for i in range(len(precision) - 2, -1, -1): 119 | precision[i] = np.maximum(precision[i], precision[i + 1]) 120 | 121 | indices = np.where(recall[1:] != recall[:-1])[0] + 1 122 | average_precision = np.sum( 123 | (recall[indices] - recall[indices - 1]) * precision[indices] 124 | ) 125 | return average_precision 126 | 127 | 128 | def compute_cor_loc( 129 | num_gt_imgs_per_class, num_images_correctly_detected_per_class 130 | ): 131 | """Compute CorLoc according to the definition in the following paper. 132 | 133 | https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf 134 | 135 | Returns nans if there are no ground truth images for a class.
136 | 137 | Args: 138 | num_gt_imgs_per_class: 1D array, representing number of images containing 139 | at least one object instance of a particular class 140 | num_images_correctly_detected_per_class: 1D array, representing number of 141 | images that are correctly detected at least one object instance of a 142 | particular class 143 | 144 | Returns: 145 | corloc_per_class: A float numpy array represents the corloc score of each 146 | class 147 | """ 148 | # Divide by zero expected for classes with no gt examples. 149 | with np.errstate(divide="ignore", invalid="ignore"): 150 | return np.where( 151 | num_gt_imgs_per_class == 0, 152 | np.nan, 153 | num_images_correctly_detected_per_class / num_gt_imgs_per_class, 154 | ) 155 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/np_box_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Numpy BoxList classes and functions.""" 17 | 18 | from __future__ import ( 19 | absolute_import, 20 | division, 21 | print_function, 22 | unicode_literals, 23 | ) 24 | import numpy as np 25 | 26 | 27 | class BoxList(object): 28 | """Box collection. 29 | 30 | BoxList represents a list of bounding boxes as numpy array, where each 31 | bounding box is represented as a row of 4 numbers, 32 | [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within a 33 | given list correspond to a single image. 34 | 35 | Optionally, users can add additional related fields (such as 36 | objectness/classification scores). 37 | """ 38 | 39 | def __init__(self, data): 40 | """Constructs box collection. 41 | 42 | Args: 43 | data: a numpy array of shape [N, 4] representing box coordinates 44 | 45 | Raises: 46 | ValueError: if bbox data is not a numpy array 47 | ValueError: if invalid dimensions for bbox data 48 | """ 49 | if not isinstance(data, np.ndarray): 50 | raise ValueError("data must be a numpy array.") 51 | if len(data.shape) != 2 or data.shape[1] != 4: 52 | raise ValueError("Invalid dimensions for box data.") 53 | if data.dtype != np.float32 and data.dtype != np.float64: 54 | raise ValueError( 55 | "Invalid data type for box data: float is required." 56 | ) 57 | if not self._is_valid_boxes(data): 58 | raise ValueError( 59 | "Invalid box data. 
data must be a numpy array of " 60 | "N*[y_min, x_min, y_max, x_max]" 61 | ) 62 | self.data = {"boxes": data} 63 | 64 | def num_boxes(self): 65 | """Return number of boxes held in collections.""" 66 | return self.data["boxes"].shape[0] 67 | 68 | def get_extra_fields(self): 69 | """Return all non-box fields.""" 70 | return [k for k in self.data.keys() if k != "boxes"] 71 | 72 | def has_field(self, field): 73 | return field in self.data 74 | 75 | def add_field(self, field, field_data): 76 | """Add data to a specified field. 77 | 78 | Args: 79 | field: a string parameter used to specify a related field to be accessed. 80 | field_data: a numpy array of [N, ...] representing the data associated 81 | with the field. 82 | Raises: 83 | ValueError: if the field already exists or the dimension of the field 84 | data does not match the number of boxes. 85 | """ 86 | if self.has_field(field): 87 | raise ValueError("Field " + field + " already exists") 88 | if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes(): 89 | raise ValueError("Invalid dimensions for field data") 90 | self.data[field] = field_data 91 | 92 | def get(self): 93 | """Convenience function for accessing box coordinates. 94 | 95 | Returns: 96 | a numpy array of shape [N, 4] representing box corners 97 | """ 98 | return self.get_field("boxes") 99 | 100 | def get_field(self, field): 101 | """Accesses data associated with the specified field in the box collection. 102 | 103 | Args: 104 | field: a string parameter used to specify a related field to be accessed. 105 | 106 | Returns: 107 | a numpy 1-d array representing data of an associated field 108 | 109 | Raises: 110 | ValueError: if invalid field 111 | """ 112 | if not self.has_field(field): 113 | raise ValueError("field {} does not exist".format(field)) 114 | return self.data[field] 115 | 116 | def get_coordinates(self): 117 | """Get corner coordinates of boxes. 118 | 119 | Returns: 120 | a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max] 121 | """ 122 | box_coordinates = self.get() 123 | y_min = box_coordinates[:, 0] 124 | x_min = box_coordinates[:, 1] 125 | y_max = box_coordinates[:, 2] 126 | x_max = box_coordinates[:, 3] 127 | return [y_min, x_min, y_max, x_max] 128 | 129 | def _is_valid_boxes(self, data): 130 | """Check whether data fulfills the format of N*[ymin, xmin, ymax, xmax]. 131 | 132 | Args: 133 | data: a numpy array of shape [N, 4] representing box coordinates 134 | 135 | Returns: 136 | a boolean indicating whether all ymax of boxes are equal or greater than 137 | ymin, and all xmax of boxes are equal or greater than xmin. 138 | """ 139 | if data.shape[0] > 0: 140 | for i in range(data.shape[0]): 141 | if data[i, 0] > data[i, 2] or data[i, 1] > data[i, 3]: 142 | return False 143 | return True 144 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/np_box_mask_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Numpy BoxMaskList classes and functions.""" 17 | 18 | from __future__ import ( 19 | absolute_import, 20 | division, 21 | print_function, 22 | unicode_literals, 23 | ) 24 | import numpy as np 25 | 26 | from . import np_box_list 27 | 28 | 29 | class BoxMaskList(np_box_list.BoxList): 30 | """Convenience wrapper for BoxList with masks. 31 | 32 | BoxMaskList extends the np_box_list.BoxList to contain masks as well. 33 | In particular, its constructor receives both boxes and masks. Note that the 34 | masks correspond to the full image. 35 | """ 36 | 37 | def __init__(self, box_data, mask_data): 38 | """Constructs box collection. 39 | 40 | Args: 41 | box_data: a numpy array of shape [N, 4] representing box coordinates 42 | mask_data: a numpy array of shape [N, height, width] representing masks 43 | with values are in {0,1}. The masks correspond to the full 44 | image. The height and the width will be equal to image height and width. 45 | 46 | Raises: 47 | ValueError: if bbox data is not a numpy array 48 | ValueError: if invalid dimensions for bbox data 49 | ValueError: if mask data is not a numpy array 50 | ValueError: if invalid dimension for mask data 51 | """ 52 | super(BoxMaskList, self).__init__(box_data) 53 | if not isinstance(mask_data, np.ndarray): 54 | raise ValueError("Mask data must be a numpy array.") 55 | if len(mask_data.shape) != 3: 56 | raise ValueError("Invalid dimensions for mask data.") 57 | if mask_data.dtype != np.uint8: 58 | raise ValueError( 59 | "Invalid data type for mask data: uint8 is required." 60 | ) 61 | if mask_data.shape[0] != box_data.shape[0]: 62 | raise ValueError( 63 | "There should be the same number of boxes and masks." 64 | ) 65 | self.data["masks"] = mask_data 66 | 67 | def get_masks(self): 68 | """Convenience function for accessing masks. 69 | 70 | Returns: 71 | a numpy array of shape [N, height, width] representing masks 72 | """ 73 | return self.get_field("masks") 74 | -------------------------------------------------------------------------------- /slowfast/utils/ava_evaluation/np_box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, 4] numpy arrays representing bounding boxes. 
17 | 
18 | Example box operations that are supported:
19 |   * Areas: compute bounding box areas
20 |   * IOU: pairwise intersection-over-union scores
21 | """
22 | from __future__ import (
23 |     absolute_import,
24 |     division,
25 |     print_function,
26 |     unicode_literals,
27 | )
28 | import numpy as np
29 | 
30 | 
31 | def area(boxes):
32 |     """Computes area of boxes.
33 | 
34 |     Args:
35 |         boxes: Numpy array with shape [N, 4] holding N boxes
36 | 
37 |     Returns:
38 |         a numpy array with shape [N] representing box areas
39 |     """
40 |     return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
41 | 
42 | 
43 | def intersection(boxes1, boxes2):
44 |     """Compute pairwise intersection areas between boxes.
45 | 
46 |     Args:
47 |         boxes1: a numpy array with shape [N, 4] holding N boxes
48 |         boxes2: a numpy array with shape [M, 4] holding M boxes
49 | 
50 |     Returns:
51 |         a numpy array with shape [N, M] representing pairwise intersection area
52 |     """
53 |     [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1)
54 |     [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1)
55 | 
56 |     all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2))
57 |     all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2))
58 |     intersect_heights = np.maximum(
59 |         np.zeros(all_pairs_max_ymin.shape),
60 |         all_pairs_min_ymax - all_pairs_max_ymin,
61 |     )
62 |     all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2))
63 |     all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2))
64 |     intersect_widths = np.maximum(
65 |         np.zeros(all_pairs_max_xmin.shape),
66 |         all_pairs_min_xmax - all_pairs_max_xmin,
67 |     )
68 |     return intersect_heights * intersect_widths
69 | 
70 | 
71 | def iou(boxes1, boxes2):
72 |     """Computes pairwise intersection-over-union between box collections.
73 | 
74 |     Args:
75 |         boxes1: a numpy array with shape [N, 4] holding N boxes.
76 |         boxes2: a numpy array with shape [M, 4] holding M boxes.
77 | 
78 |     Returns:
79 |         a numpy array with shape [N, M] representing pairwise iou scores.
80 |     """
81 |     intersect = intersection(boxes1, boxes2)
82 |     area1 = area(boxes1)
83 |     area2 = area(boxes2)
84 |     union = (
85 |         np.expand_dims(area1, axis=1)
86 |         + np.expand_dims(area2, axis=0)
87 |         - intersect
88 |     )
89 |     return intersect / union
90 | 
91 | 
92 | def ioa(boxes1, boxes2):
93 |     """Computes pairwise intersection-over-area between box collections.
94 | 
95 |     Intersection-over-area (ioa) between two boxes, box1 and box2, is defined as
96 |     their intersection area over box2's area. Note that ioa is not symmetric,
97 |     that is, IOA(box1, box2) != IOA(box2, box1).
98 | 
99 |     Args:
100 |         boxes1: a numpy array with shape [N, 4] holding N boxes.
101 |         boxes2: a numpy array with shape [M, 4] holding M boxes.
102 | 
103 |     Returns:
104 |         a numpy array with shape [N, M] representing pairwise ioa scores.
105 |     """
106 |     intersect = intersection(boxes1, boxes2)
107 |     areas = np.expand_dims(area(boxes2), axis=0)
108 |     return intersect / areas
109 | 
--------------------------------------------------------------------------------
/slowfast/utils/ava_evaluation/np_mask_ops.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
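As a quick sanity check on the box operations above, a hedged worked example with values traced by hand from the definitions (box format is [y_min, x_min, y_max, x_max]):

    import numpy as np
    from slowfast.utils.ava_evaluation import np_box_ops

    b1 = np.array([[0.0, 0.0, 2.0, 2.0]], dtype=np.float32)  # area 4
    b2 = np.array([[1.0, 1.0, 3.0, 3.0]], dtype=np.float32)  # area 4, 1x1 overlap with b1
    np_box_ops.iou(b1, b2)  # [[1/7]]: intersection 1 over union 4 + 4 - 1
    np_box_ops.ioa(b1, b2)  # [[1/4]]: intersection 1 over area(b2) = 4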
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Operations for [N, height, width] numpy arrays representing masks.
17 | 
18 | Example mask operations that are supported:
19 |   * Areas: compute mask areas
20 |   * IOU: pairwise intersection-over-union scores
21 | """
22 | from __future__ import (
23 |     absolute_import,
24 |     division,
25 |     print_function,
26 |     unicode_literals,
27 | )
28 | import numpy as np
29 | 
30 | EPSILON = 1e-7
31 | 
32 | 
33 | def area(masks):
34 |     """Computes area of masks.
35 | 
36 |     Args:
37 |         masks: Numpy array with shape [N, height, width] holding N masks. Masks
38 |             values are of type np.uint8 and values are in {0,1}.
39 | 
40 |     Returns:
41 |         a numpy array with shape [N] representing mask areas.
42 | 
43 |     Raises:
44 |         ValueError: If masks.dtype is not np.uint8
45 |     """
46 |     if masks.dtype != np.uint8:
47 |         raise ValueError("Masks type should be np.uint8")
48 |     return np.sum(masks, axis=(1, 2), dtype=np.float32)
49 | 
50 | 
51 | def intersection(masks1, masks2):
52 |     """Compute pairwise intersection areas between masks.
53 | 
54 |     Args:
55 |         masks1: a numpy array with shape [N, height, width] holding N masks. Masks
56 |             values are of type np.uint8 and values are in {0,1}.
57 |         masks2: a numpy array with shape [M, height, width] holding M masks. Masks
58 |             values are of type np.uint8 and values are in {0,1}.
59 | 
60 |     Returns:
61 |         a numpy array with shape [N, M] representing pairwise intersection area.
62 | 
63 |     Raises:
64 |         ValueError: If masks1 and masks2 are not of type np.uint8.
65 |     """
66 |     if masks1.dtype != np.uint8 or masks2.dtype != np.uint8:
67 |         raise ValueError("masks1 and masks2 should be of type np.uint8")
68 |     n = masks1.shape[0]
69 |     m = masks2.shape[0]
70 |     answer = np.zeros([n, m], dtype=np.float32)
71 |     for i in np.arange(n):
72 |         for j in np.arange(m):
73 |             answer[i, j] = np.sum(
74 |                 np.minimum(masks1[i], masks2[j]), dtype=np.float32
75 |             )
76 |     return answer
77 | 
78 | 
79 | def iou(masks1, masks2):
80 |     """Computes pairwise intersection-over-union between mask collections.
81 | 
82 |     Args:
83 |         masks1: a numpy array with shape [N, height, width] holding N masks. Masks
84 |             values are of type np.uint8 and values are in {0,1}.
85 |         masks2: a numpy array with shape [M, height, width] holding M masks. Masks
86 |             values are of type np.uint8 and values are in {0,1}.
87 | 
88 |     Returns:
89 |         a numpy array with shape [N, M] representing pairwise iou scores.
90 | 
91 |     Raises:
92 |         ValueError: If masks1 and masks2 are not of type np.uint8.
93 |     """
94 |     if masks1.dtype != np.uint8 or masks2.dtype != np.uint8:
95 |         raise ValueError("masks1 and masks2 should be of type np.uint8")
96 |     intersect = intersection(masks1, masks2)
97 |     area1 = area(masks1)
98 |     area2 = area(masks2)
99 |     union = (
100 |         np.expand_dims(area1, axis=1)
101 |         + np.expand_dims(area2, axis=0)
102 |         - intersect
103 |     )
104 |     return intersect / np.maximum(union, EPSILON)
105 | 
106 | 
107 | def ioa(masks1, masks2):
108 |     """Computes pairwise intersection-over-area between mask collections.
109 | 
110 |     Intersection-over-area (ioa) between two masks, mask1 and mask2, is defined as
111 |     their intersection area over mask2's area. Note that ioa is not symmetric,
112 |     that is, IOA(mask1, mask2) != IOA(mask2, mask1).
113 | 
114 |     Args:
115 |         masks1: a numpy array with shape [N, height, width] holding N masks. Masks
116 |             values are of type np.uint8 and values are in {0,1}.
117 |         masks2: a numpy array with shape [M, height, width] holding M masks. Masks
118 |             values are of type np.uint8 and values are in {0,1}.
119 | 
120 |     Returns:
121 |         a numpy array with shape [N, M] representing pairwise ioa scores.
122 | 
123 |     Raises:
124 |         ValueError: If masks1 and masks2 are not of type np.uint8.
125 |     """
126 |     if masks1.dtype != np.uint8 or masks2.dtype != np.uint8:
127 |         raise ValueError("masks1 and masks2 should be of type np.uint8")
128 |     intersect = intersection(masks1, masks2)
129 |     areas = np.expand_dims(area(masks2), axis=0)
130 |     return intersect / (areas + EPSILON)
131 | 
--------------------------------------------------------------------------------
/slowfast/utils/benchmark.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 | """
3 | Functions for benchmarks.
4 | """
5 | 
6 | import numpy as np
7 | import pprint
8 | import torch
9 | import tqdm
10 | from fvcore.common.timer import Timer
11 | 
12 | import slowfast.utils.logging as logging
13 | import slowfast.utils.misc as misc
14 | from slowfast.datasets import loader
15 | from slowfast.utils.env import setup_environment
16 | 
17 | logger = logging.get_logger(__name__)
18 | 
19 | 
20 | def benchmark_data_loading(cfg):
21 |     """
22 |     Benchmark the speed of data loading in PySlowFast.
23 |     Args:
24 | 
25 |         cfg (CfgNode): configs. Details can be found in
26 |             slowfast/config/defaults.py
27 |     """
28 |     # Set up environment.
29 |     setup_environment()
30 |     # Set random seed from configs.
31 |     np.random.seed(cfg.RNG_SEED)
32 |     torch.manual_seed(cfg.RNG_SEED)
33 | 
34 |     # Setup logging format.
35 |     logging.setup_logging(cfg.OUTPUT_DIR)
36 | 
37 |     # Print config.
38 |     logger.info("Benchmark data loading with config:")
39 |     logger.info(pprint.pformat(cfg))
40 | 
41 |     timer = Timer()
42 |     dataloader = loader.construct_loader(cfg, "train")
43 |     logger.info(
44 |         "Initialized loader in {:.2f} seconds.".format(timer.seconds())
45 |     )
46 |     # Total batch size across different machines.
47 |     batch_size = cfg.TRAIN.BATCH_SIZE * cfg.NUM_SHARDS
48 |     log_period = cfg.BENCHMARK.LOG_PERIOD
49 |     epoch_times = []
50 |     # Test for a few epochs.
51 |     for cur_epoch in range(cfg.BENCHMARK.NUM_EPOCHS):
52 |         timer = Timer()
53 |         timer_epoch = Timer()
54 |         iter_times = []
55 |         if cfg.BENCHMARK.SHUFFLE:
56 |             loader.shuffle_dataset(dataloader, cur_epoch)
57 |         for cur_iter, _ in enumerate(tqdm.tqdm(dataloader)):
58 |             if cur_iter > 0 and cur_iter % log_period == 0:
59 |                 iter_times.append(timer.seconds())
60 |                 ram_usage, ram_total = misc.cpu_mem_usage()
61 |                 logger.info(
62 |                     "Epoch {}: {} iters ({} videos) in {:.2f} seconds. "
63 |                     "RAM Usage: {:.2f}/{:.2f} GB.".format(
64 |                         cur_epoch,
65 |                         log_period,
66 |                         log_period * batch_size,
67 |                         iter_times[-1],
68 |                         ram_usage,
69 |                         ram_total,
70 |                     )
71 |                 )
72 |                 timer.reset()
73 |         epoch_times.append(timer_epoch.seconds())
74 |         ram_usage, ram_total = misc.cpu_mem_usage()
75 |         logger.info(
76 |             "Epoch {}: in total {} iters ({} videos) in {:.2f} seconds. "
" 77 | "RAM Usage: {:.2f}/{:.2f} GB.".format( 78 | cur_epoch, 79 | len(dataloader), 80 | len(dataloader) * batch_size, 81 | epoch_times[-1], 82 | ram_usage, 83 | ram_total, 84 | ) 85 | ) 86 | logger.info( 87 | "Epoch {}: on average every {} iters ({} videos) take {:.2f}/{:.2f} " 88 | "(avg/std) seconds.".format( 89 | cur_epoch, 90 | log_period, 91 | log_period * batch_size, 92 | np.mean(iter_times), 93 | np.std(iter_times), 94 | ) 95 | ) 96 | logger.info( 97 | "On average every epoch ({} videos) takes {:.2f}/{:.2f} " 98 | "(avg/std) seconds.".format( 99 | len(dataloader) * batch_size, 100 | np.mean(epoch_times), 101 | np.std(epoch_times), 102 | ) 103 | ) 104 | -------------------------------------------------------------------------------- /slowfast/utils/bn_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """bn helper.""" 5 | 6 | import itertools 7 | import torch 8 | 9 | 10 | @torch.no_grad() 11 | def compute_and_update_bn_stats(model, data_loader, num_batches=200): 12 | """ 13 | Compute and update the batch norm stats to make it more precise. During 14 | training both bn stats and the weight are changing after every iteration, 15 | so the bn can not precisely reflect the latest stats of the current model. 16 | Here the bn stats is recomputed without change of weights, to make the 17 | running mean and running var more precise. 18 | Args: 19 | model (model): the model using to compute and update the bn stats. 20 | data_loader (dataloader): dataloader using to provide inputs. 21 | num_batches (int): running iterations using to compute the stats. 22 | """ 23 | 24 | # Prepares all the bn layers. 25 | bn_layers = [ 26 | m 27 | for m in model.modules() 28 | if any( 29 | ( 30 | isinstance(m, bn_type) 31 | for bn_type in ( 32 | torch.nn.BatchNorm1d, 33 | torch.nn.BatchNorm2d, 34 | torch.nn.BatchNorm3d, 35 | ) 36 | ) 37 | ) 38 | ] 39 | 40 | # In order to make the running stats only reflect the current batch, the 41 | # momentum is disabled. 42 | # bn.running_mean = (1 - momentum) * bn.running_mean + momentum * batch_mean 43 | # Setting the momentum to 1.0 to compute the stats without momentum. 44 | momentum_actual = [bn.momentum for bn in bn_layers] 45 | for bn in bn_layers: 46 | bn.momentum = 1.0 47 | 48 | # Calculates the running iterations for precise stats computation. 49 | running_mean = [torch.zeros_like(bn.running_mean) for bn in bn_layers] 50 | running_square_mean = [torch.zeros_like(bn.running_var) for bn in bn_layers] 51 | 52 | for ind, (inputs, _, _) in enumerate( 53 | itertools.islice(data_loader, num_batches) 54 | ): 55 | # Forwards the model to update the bn stats. 56 | if isinstance(inputs, (list,)): 57 | for i in range(len(inputs)): 58 | inputs[i] = inputs[i].float().cuda(non_blocking=True) 59 | else: 60 | inputs = inputs.cuda(non_blocking=True) 61 | model(inputs) 62 | 63 | for i, bn in enumerate(bn_layers): 64 | # Accumulates the bn stats. 65 | running_mean[i] += (bn.running_mean - running_mean[i]) / (ind + 1) 66 | # $E(x^2) = Var(x) + E(x)^2$. 67 | cur_square_mean = bn.running_var + bn.running_mean ** 2 68 | running_square_mean[i] += ( 69 | cur_square_mean - running_square_mean[i] 70 | ) / (ind + 1) 71 | 72 | for i, bn in enumerate(bn_layers): 73 | bn.running_mean = running_mean[i] 74 | # Var(x) = $E(x^2) - E(x)^2$. 75 | bn.running_var = running_square_mean[i] - bn.running_mean ** 2 76 | # Sets the precise bn stats. 
77 |         bn.momentum = momentum_actual[i]
78 | 
--------------------------------------------------------------------------------
/slowfast/utils/c2_model_loading.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 | 
4 | """Caffe2 to PyTorch checkpoint name converting utility."""
5 | 
6 | import re
7 | 
8 | 
9 | def get_name_convert_func():
10 |     """
11 |     Get the function to convert Caffe2 layer names to PyTorch layer names.
12 |     Returns:
13 |         (func): function to convert parameter name from Caffe2 format to PyTorch
14 |             format.
15 |     """
16 |     pairs = [
17 |         # ------------------------------------------------------------
18 |         # 'nonlocal_conv3_1_theta_w' -> 's3.pathway0_nonlocal1.conv_theta.weight'
19 |         [
20 |             r"^nonlocal_conv([0-9]+)_([0-9]+)_(.*)",
21 |             r"s\1.pathway0_nonlocal\2_\3",
22 |         ],
23 |         # 'theta' -> 'conv_theta'
24 |         [r"^(.*)_nonlocal([0-9]+)_(theta)(.*)", r"\1_nonlocal\2.conv_\3\4"],
25 |         # 'g' -> 'conv_g'
26 |         [r"^(.*)_nonlocal([0-9]+)_(g)(.*)", r"\1_nonlocal\2.conv_\3\4"],
27 |         # 'phi' -> 'conv_phi'
28 |         [r"^(.*)_nonlocal([0-9]+)_(phi)(.*)", r"\1_nonlocal\2.conv_\3\4"],
29 |         # 'out' -> 'conv_out'
30 |         [r"^(.*)_nonlocal([0-9]+)_(out)(.*)", r"\1_nonlocal\2.conv_\3\4"],
31 |         # 'nonlocal_conv4_5_bn_s' -> 's4.pathway0_nonlocal5.bn.weight'
32 |         [r"^(.*)_nonlocal([0-9]+)_(bn)_(.*)", r"\1_nonlocal\2.\3.\4"],
33 |         # ------------------------------------------------------------
34 |         # 't_pool1_subsample_bn_rm' -> 's1_fuse.bn.running_mean'
35 |         [r"^t_pool1_subsample_bn_(.*)", r"s1_fuse.bn.\1"],
36 |         # 't_pool1_subsample' -> 's1_fuse.conv_f2s'
37 |         [r"^t_pool1_subsample_(.*)", r"s1_fuse.conv_f2s.\1"],
38 |         # 't_res4_5_branch2c_bn_subsample_bn_rm' -> 's4_fuse.bn.running_mean'
39 |         [
40 |             r"^t_res([0-9]+)_([0-9]+)_branch2c_bn_subsample_bn_(.*)",
41 |             r"s\1_fuse.bn.\3",
42 |         ],
43 |         # 't_res4_5_branch2c_bn_subsample_w' -> 's4_fuse.conv_f2s.weight'
44 |         [
45 |             r"^t_res([0-9]+)_([0-9]+)_branch2c_bn_subsample_(.*)",
46 |             r"s\1_fuse.conv_f2s.\3",
47 |         ],
48 |         # ------------------------------------------------------------
49 |         # 'res4_4_branch2c_bn_b' -> 's4.pathway0_res4.branch2.c_bn_b'
50 |         [
51 |             r"^res([0-9]+)_([0-9]+)_branch([0-9]+)([a-z])_(.*)",
52 |             r"s\1.pathway0_res\2.branch\3.\4_\5",
53 |         ],
54 |         # 'res_conv1_bn_' -> 's1.pathway0_stem.bn.'
55 |         [r"^res_conv1_bn_(.*)", r"s1.pathway0_stem.bn.\1"],
56 |         # 'conv1_w_momentum' -> 's1.pathway0_stem.conv.w_momentum'
57 |         [r"^conv1_(.*)", r"s1.pathway0_stem.conv.\1"],
58 |         # 'res4_0_branch1_w' -> 's4.pathway0_res0.branch1.weight'
59 |         [
60 |             r"^res([0-9]+)_([0-9]+)_branch([0-9]+)_(.*)",
61 |             r"s\1.pathway0_res\2.branch\3_\4",
62 |         ],
63 |         # 'res_conv1_' -> 's1.pathway0_stem.conv.'
64 |         [r"^res_conv1_(.*)", r"s1.pathway0_stem.conv.\1"],
65 |         # ------------------------------------------------------------
66 |         # 't_res4_4_branch2c_bn_b' -> 's4.pathway1_res4.branch2.c_bn_b'
67 |         [
68 |             r"^t_res([0-9]+)_([0-9]+)_branch([0-9]+)([a-z])_(.*)",
69 |             r"s\1.pathway1_res\2.branch\3.\4_\5",
70 |         ],
71 |         # 't_res_conv1_bn_' -> 's1.pathway1_stem.bn.'
72 |         [r"^t_res_conv1_bn_(.*)", r"s1.pathway1_stem.bn.\1"],
73 |         # 't_conv1_' -> 's1.pathway1_stem.conv.'
74 |         [r"^t_conv1_(.*)", r"s1.pathway1_stem.conv.\1"],
75 |         # 't_res4_0_branch1_w' -> 's4.pathway1_res0.branch1.weight'
76 |         [
77 |             r"^t_res([0-9]+)_([0-9]+)_branch([0-9]+)_(.*)",
78 |             r"s\1.pathway1_res\2.branch\3_\4",
79 |         ],
80 |         # 't_res_conv1_' -> 's1.pathway1_stem.conv.'
81 | [r"^t_res_conv1_(.*)", r"s1.pathway1_stem.conv.\1"], 82 | # ------------------------------------------------------------ 83 | # pred_ -> head.projection. 84 | [r"pred_(.*)", r"head.projection.\1"], 85 | # '.bn_b' -> '.weight' 86 | [r"(.*)bn.b\Z", r"\1bn.bias"], 87 | # '.bn_s' -> '.weight' 88 | [r"(.*)bn.s\Z", r"\1bn.weight"], 89 | # '_bn_rm' -> '.running_mean' 90 | [r"(.*)bn.rm\Z", r"\1bn.running_mean"], 91 | # '_bn_riv' -> '.running_var' 92 | [r"(.*)bn.riv\Z", r"\1bn.running_var"], 93 | # '_b' -> '.bias' 94 | [r"(.*)[\._]b\Z", r"\1.bias"], 95 | # '_w' -> '.weight' 96 | [r"(.*)[\._]w\Z", r"\1.weight"], 97 | ] 98 | 99 | def convert_caffe2_name_to_pytorch(caffe2_layer_name): 100 | """ 101 | Convert the caffe2_layer_name to pytorch format by apply the list of 102 | regular expressions. 103 | Args: 104 | caffe2_layer_name (str): caffe2 layer name. 105 | Returns: 106 | (str): pytorch layer name. 107 | """ 108 | for source, dest in pairs: 109 | caffe2_layer_name = re.sub(source, dest, caffe2_layer_name) 110 | return caffe2_layer_name 111 | 112 | return convert_caffe2_name_to_pytorch 113 | -------------------------------------------------------------------------------- /slowfast/utils/env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Set up Environment.""" 5 | 6 | import slowfast.utils.logging as logging 7 | 8 | _ENV_SETUP_DONE = False 9 | 10 | 11 | def setup_environment(): 12 | global _ENV_SETUP_DONE 13 | if _ENV_SETUP_DONE: 14 | return 15 | _ENV_SETUP_DONE = True 16 | -------------------------------------------------------------------------------- /slowfast/utils/logging.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Logging.""" 5 | 6 | import builtins 7 | import decimal 8 | import functools 9 | import logging 10 | import os 11 | import sys 12 | import simplejson 13 | from fvcore.common.file_io import PathManager 14 | 15 | import slowfast.utils.distributed as du 16 | 17 | 18 | def _suppress_print(): 19 | """ 20 | Suppresses printing from the current process. 21 | """ 22 | 23 | def print_pass(*objects, sep=" ", end="\n", file=sys.stdout, flush=False): 24 | pass 25 | 26 | builtins.print = print_pass 27 | 28 | 29 | @functools.lru_cache(maxsize=None) 30 | def _cached_log_stream(filename): 31 | return PathManager.open(filename, "a") 32 | 33 | 34 | def setup_logging(output_dir=None): 35 | """ 36 | Sets up the logging for multiple processes. Only enable the logging for the 37 | master process, and suppress logging for the non-master processes. 38 | """ 39 | # Set up logging format. 40 | _FORMAT = "[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s" 41 | 42 | if du.is_master_proc(): 43 | # Enable logging for the master process. 44 | logging.root.handlers = [] 45 | logging.basicConfig( 46 | level=logging.INFO, format=_FORMAT, stream=sys.stdout 47 | ) 48 | else: 49 | # Suppress logging for non-master processes. 
50 |         _suppress_print()
51 | 
52 |     logger = logging.getLogger()
53 |     logger.setLevel(logging.DEBUG)
54 |     logger.propagate = False
55 |     plain_formatter = logging.Formatter(
56 |         "[%(asctime)s][%(levelname)s] %(name)s: %(lineno)4d: %(message)s",
57 |         datefmt="%m/%d %H:%M:%S",
58 |     )
59 | 
60 |     if du.is_master_proc():
61 |         ch = logging.StreamHandler(stream=sys.stdout)
62 |         ch.setLevel(logging.DEBUG)
63 |         ch.setFormatter(plain_formatter)
64 |         logger.addHandler(ch)
65 | 
66 |     if output_dir is not None and du.is_master_proc(du.get_world_size()):
67 |         filename = os.path.join(output_dir, "stdout.log")
68 |         fh = logging.StreamHandler(_cached_log_stream(filename))
69 |         fh.setLevel(logging.DEBUG)
70 |         fh.setFormatter(plain_formatter)
71 |         logger.addHandler(fh)
72 | 
73 | 
74 | def get_logger(name):
75 |     """
76 |     Retrieve the logger with the specified name or, if name is None, return a
77 |     logger which is the root logger of the hierarchy.
78 |     Args:
79 |         name (string): name of the logger.
80 |     """
81 |     return logging.getLogger(name)
82 | 
83 | 
84 | def log_json_stats(stats):
85 |     """
86 |     Logs json stats.
87 |     Args:
88 |         stats (dict): a dictionary of statistical information to log.
89 |     """
90 |     stats = {
91 |         k: decimal.Decimal("{:.6f}".format(v)) if isinstance(v, float) else v
92 |         for k, v in stats.items()
93 |     }
94 |     json_stats = simplejson.dumps(stats, sort_keys=True, use_decimal=True)
95 |     logger = get_logger(__name__)
96 |     logger.info("json_stats: {:s}".format(json_stats))
97 | 
--------------------------------------------------------------------------------
/slowfast/utils/lr_policy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 | 
4 | """Learning rate policy."""
5 | 
6 | import math
7 | 
8 | 
9 | def get_lr_at_epoch(cfg, cur_epoch):
10 |     """
11 |     Retrieve the learning rate of the current epoch with the option to perform
12 |     warm up at the beginning of the training stage.
13 |     Args:
14 |         cfg (CfgNode): configs. Details can be found in
15 |             slowfast/config/defaults.py
16 |         cur_epoch (float): the current epoch of the training stage.
17 |     """
18 |     lr = get_lr_func(cfg.SOLVER.LR_POLICY)(cfg, cur_epoch)
19 |     # Perform warm up.
20 |     if cur_epoch < cfg.SOLVER.WARMUP_EPOCHS:
21 |         lr_start = cfg.SOLVER.WARMUP_START_LR
22 |         lr_end = get_lr_func(cfg.SOLVER.LR_POLICY)(
23 |             cfg, cfg.SOLVER.WARMUP_EPOCHS
24 |         )
25 |         alpha = (lr_end - lr_start) / cfg.SOLVER.WARMUP_EPOCHS
26 |         lr = cur_epoch * alpha + lr_start
27 |     return lr
28 | 
29 | 
30 | def lr_func_cosine(cfg, cur_epoch):
31 |     """
32 |     Retrieve the learning rate at the specified epoch following the cosine
33 |     learning rate schedule. Details can be found in:
34 |     Ilya Loshchilov and Frank Hutter
35 |     SGDR: Stochastic Gradient Descent With Warm Restarts.
36 |     Args:
37 |         cfg (CfgNode): configs. Details can be found in
38 |             slowfast/config/defaults.py
39 |         cur_epoch (float): the current epoch of the training stage.
40 |     """
41 |     return (
42 |         cfg.SOLVER.BASE_LR
43 |         * (math.cos(math.pi * cur_epoch / cfg.SOLVER.MAX_EPOCH) + 1.0)
44 |         * 0.5
45 |     )
46 | 
47 | 
48 | def lr_func_steps_with_relative_lrs(cfg, cur_epoch):
49 |     """
50 |     Retrieve the learning rate at the specified epoch following the stepwise
51 |     schedule with relative learning rates.
52 |     Args:
53 |         cfg (CfgNode): configs. Details can be found in
54 |             slowfast/config/defaults.py
55 |         cur_epoch (float): the current epoch of the training stage.
56 |     """
57 |     ind = get_step_index(cfg, cur_epoch)
58 |     return cfg.SOLVER.LRS[ind] * cfg.SOLVER.BASE_LR
59 | 
60 | 
61 | def get_step_index(cfg, cur_epoch):
62 |     """
63 |     Retrieves the lr step index for the given epoch.
64 |     Args:
65 |         cfg (CfgNode): configs. Details can be found in
66 |             slowfast/config/defaults.py
67 |         cur_epoch (float): the current epoch of the training stage.
68 |     """
69 |     steps = cfg.SOLVER.STEPS + [cfg.SOLVER.MAX_EPOCH]
70 |     for ind, step in enumerate(steps):  # NoQA
71 |         if cur_epoch < step:
72 |             break
73 |     return ind - 1
74 | 
75 | 
76 | def get_lr_func(lr_policy):
77 |     """
78 |     Given the configs, retrieve the specified lr policy function.
79 |     Args:
80 |         lr_policy (string): the learning rate policy to use for the job.
81 |     """
82 |     policy = "lr_func_" + lr_policy
83 |     if policy not in globals():
84 |         raise NotImplementedError("Unknown LR policy: {}".format(lr_policy))
85 |     else:
86 |         return globals()[policy]
87 | 
--------------------------------------------------------------------------------
/slowfast/utils/metrics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 | 
4 | """Functions for computing metrics."""
5 | 
6 | import torch
7 | 
8 | 
9 | def topks_correct(preds, labels, ks):
10 |     """
11 |     Given the predictions, labels, and a list of top-k values, compute the
12 |     number of correct predictions for each top-k value.
13 | 
14 |     Args:
15 |         preds (array): array of predictions. Dimension is batch size
16 |             N x ClassNum.
17 |         labels (array): array of labels. Dimension is batch size N.
18 |         ks (list): list of top-k values. For example, ks = [1, 5] corresponds
19 |             to top-1 and top-5.
20 | 
21 |     Returns:
22 |         topks_correct (list): list of numbers, where the `i`-th entry
23 |             corresponds to the number of top-`ks[i]` correct predictions.
24 |     """
25 |     assert preds.size(0) == labels.size(
26 |         0
27 |     ), "Batch dim of predictions and labels must match"
28 |     # Find the top max_k predictions for each sample
29 |     _top_max_k_vals, top_max_k_inds = torch.topk(
30 |         preds, max(ks), dim=1, largest=True, sorted=True
31 |     )
32 |     # (batch_size, max_k) -> (max_k, batch_size).
33 |     top_max_k_inds = top_max_k_inds.t()
34 |     # (batch_size, ) -> (max_k, batch_size).
35 |     rep_max_k_labels = labels.view(1, -1).expand_as(top_max_k_inds)
36 |     # (i, j) = 1 if top i-th prediction for the j-th sample is correct.
37 |     top_max_k_correct = top_max_k_inds.eq(rep_max_k_labels)
38 |     # Compute the number of topk correct predictions for each k.
39 |     topks_correct = [
40 |         top_max_k_correct[:k, :].view(-1).float().sum() for k in ks
41 |     ]
42 |     return topks_correct
43 | 
44 | 
45 | def topk_errors(preds, labels, ks):
46 |     """
47 |     Computes the top-k error for each k.
48 |     Args:
49 |         preds (array): array of predictions. Dimension is N.
50 |         labels (array): array of labels. Dimension is N.
51 |         ks (list): list of ks to calculate the top accuracies.
52 |     """
53 |     num_topks_correct = topks_correct(preds, labels, ks)
54 |     return [(1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct]
55 | 
56 | 
57 | def topk_accuracies(preds, labels, ks):
58 |     """
59 |     Computes the top-k accuracy for each k.
60 |     Args:
61 |         preds (array): array of predictions. Dimension is N.
62 |         labels (array): array of labels. Dimension is N.
63 |         ks (list): list of ks to calculate the top accuracies.
64 |     """
65 |     num_topks_correct = topks_correct(preds, labels, ks)
66 |     return [(x / preds.size(0)) * 100.0 for x in num_topks_correct]
67 | 
--------------------------------------------------------------------------------
/slowfast/utils/multiprocessing.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 | 
4 | """Multiprocessing helpers."""
5 | 
6 | import torch
7 | 
8 | 
9 | def run(
10 |     local_rank, num_proc, func, init_method, shard_id, num_shards, backend, cfg
11 | ):
12 |     """
13 |     Runs a function from a child process.
14 |     Args:
15 |         local_rank (int): rank of the current process on the current machine.
16 |         num_proc (int): number of processes per machine.
17 |         func (function): function to execute on each of the processes.
18 |         init_method (string): method to initialize the distributed training.
19 |             TCP initialization: requiring a network address reachable from all
20 |             processes followed by the port.
21 |             Shared file-system initialization: makes use of a file system that
22 |             is shared and visible from all machines. The URL should start with
23 |             file:// and contain a path to a non-existent file on a shared file
24 |             system.
25 |         shard_id (int): the rank of the current machine.
26 |         num_shards (int): number of overall machines for the distributed
27 |             training job.
28 |         backend (string): three distributed backends ('nccl', 'gloo', 'mpi') are
29 |             supported, each with different capabilities. Details can be found
30 |             here:
31 |             https://pytorch.org/docs/stable/distributed.html
32 |         cfg (CfgNode): configs. Details can be found in
33 |             slowfast/config/defaults.py
34 |     """
35 |     # Initialize the process group.
36 |     world_size = num_proc * num_shards
37 |     rank = shard_id * num_proc + local_rank
38 | 
39 |     try:
40 |         torch.distributed.init_process_group(
41 |             backend=backend,
42 |             init_method=init_method,
43 |             world_size=world_size,
44 |             rank=rank,
45 |         )
46 |     except Exception as e:
47 |         raise e
48 | 
49 |     torch.cuda.set_device(local_rank)
50 |     func(cfg)
51 | 
--------------------------------------------------------------------------------
/slowfast/utils/parser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 | 
4 | """Argument parser functions."""
5 | 
6 | import argparse
7 | import sys
8 | 
9 | import slowfast.utils.checkpoint as cu
10 | from slowfast.config.defaults import get_cfg
11 | 
12 | 
13 | def parse_args():
14 |     """
15 |     Parse the following arguments for a default parser for PySlowFast users.
16 |     Args:
17 |         shard_id (int): shard id for the current machine. Ranges from 0 to
18 |             num_shards - 1. If a single machine is used, then set shard id to 0.
19 |         num_shards (int): number of shards used by the job.
20 |         init_method (str): initialization method to launch the job with multiple
21 |             devices. Options include TCP or shared file-system for
22 |             initialization. Details can be found in
23 |             https://pytorch.org/docs/stable/distributed.html#tcp-initialization
24 |         cfg (str): path to the config file.
25 |         opts (argument): provide additional options from the command line; they
26 |             overwrite the config loaded from the file.
27 |     """
28 |     parser = argparse.ArgumentParser(
29 |         description="Provide SlowFast video training and testing pipeline."
30 |     )
31 |     parser.add_argument(
32 |         "--shard_id",
33 |         help="The shard id of the current node; ranges from 0 to num_shards - 1",
34 |         default=0,
35 |         type=int,
36 |     )
37 |     parser.add_argument(
38 |         "--num_shards",
39 |         help="Number of shards used by the job",
40 |         default=1,
41 |         type=int,
42 |     )
43 |     parser.add_argument(
44 |         "--init_method",
45 |         help="Initialization method, includes TCP or shared file-system",
46 |         default="tcp://localhost:9999",
47 |         type=str,
48 |     )
49 |     parser.add_argument(
50 |         "--cfg",
51 |         dest="cfg_file",
52 |         help="Path to the config file",
53 |         default="configs/Kinetics/SLOWFAST_4x16_R50.yaml",
54 |         type=str,
55 |     )
56 |     parser.add_argument(
57 |         "opts",
58 |         help="See slowfast/config/defaults.py for all options",
59 |         default=None,
60 |         nargs=argparse.REMAINDER,
61 |     )
62 |     if len(sys.argv) == 1:
63 |         parser.print_help()
64 |     return parser.parse_args()
65 | 
66 | 
67 | def load_config(args):
68 |     """
69 |     Given the arguments, load and initialize the configs.
70 |     Args:
71 |         args (argument): arguments include `shard_id`, `num_shards`,
72 |             `init_method`, `cfg_file`, and `opts`.
73 |     """
74 |     # Setup cfg.
75 |     cfg = get_cfg()
76 |     # Load config from cfg.
77 |     if args.cfg_file is not None:
78 |         cfg.merge_from_file(args.cfg_file)
79 |     # Load config from command line, overwrite config from opts.
80 |     if args.opts is not None:
81 |         cfg.merge_from_list(args.opts)
82 | 
83 |     # Inherit parameters from args.
84 |     if hasattr(args, "num_shards") and hasattr(args, "shard_id"):
85 |         cfg.NUM_SHARDS = args.num_shards
86 |         cfg.SHARD_ID = args.shard_id
87 |     if hasattr(args, "rng_seed"):
88 |         cfg.RNG_SEED = args.rng_seed
89 |     if hasattr(args, "output_dir"):
90 |         cfg.OUTPUT_DIR = args.output_dir
91 | 
92 |     # Create the checkpoint dir.
93 |     cu.make_checkpoint_dir(cfg.OUTPUT_DIR)
94 |     return cfg
95 | 
--------------------------------------------------------------------------------
/slowfast/utils/weight_init_helper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 | 
4 | """Utility function for weight initialization"""
5 | 
6 | import torch.nn as nn
7 | from fvcore.nn.weight_init import c2_msra_fill
8 | 
9 | 
10 | def init_weights(model, fc_init_std=0.01, zero_init_final_bn=True):
11 |     """
12 |     Performs ResNet style weight initialization.
13 |     Args:
14 |         fc_init_std (float): the expected standard deviation for fc layer.
15 |         zero_init_final_bn (bool): if True, zero initialize the final bn for
16 |             every bottleneck.
17 |     """
18 |     for m in model.modules():
19 |         if isinstance(m, nn.Conv3d):
20 |             """
21 |             Follow the initialization method proposed in:
22 |             {He, Kaiming, et al.
23 |             "Delving deep into rectifiers: Surpassing human-level
24 |             performance on imagenet classification."
25 |             arXiv preprint arXiv:1502.01852 (2015)}
26 |             """
27 |             c2_msra_fill(m)
28 |         elif isinstance(m, nn.BatchNorm3d):
29 |             if (
30 |                 hasattr(m, "transform_final_bn")
31 |                 and m.transform_final_bn
32 |                 and zero_init_final_bn
33 |             ):
34 |                 batchnorm_weight = 0.0
35 |             else:
36 |                 batchnorm_weight = 1.0
37 |             if m.weight is not None:
38 |                 m.weight.data.fill_(batchnorm_weight)
39 |             if m.bias is not None:
40 |                 m.bias.data.zero_()
41 |         if isinstance(m, nn.Linear):
42 |             m.weight.data.normal_(mean=0.0, std=fc_init_std)
43 |             m.bias.data.zero_()
44 | 
--------------------------------------------------------------------------------
/slowfast/visualization/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 | 
--------------------------------------------------------------------------------
/slowfast/visualization/demo_loader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 | 
4 | import cv2
5 | 
6 | from slowfast.visualization.utils import TaskInfo
7 | 
8 | 
9 | class VideoReader:
10 |     """
11 |     VideoReader object for getting frames from a video source for real-time inference.
12 |     """
13 | 
14 |     def __init__(self, cfg):
15 |         """
16 |         Args:
17 |             cfg (CfgNode): configs. Details can be found in
18 |                 slowfast/config/defaults.py
19 |         """
20 |         assert (
21 |             cfg.DEMO.WEBCAM > -1 or cfg.DEMO.INPUT_VIDEO != ""
22 |         ), "Must specify a data source as input."
23 | 
24 |         self.source = (
25 |             cfg.DEMO.WEBCAM if cfg.DEMO.WEBCAM > -1 else cfg.DEMO.INPUT_VIDEO
26 |         )
27 | 
28 |         self.display_width = cfg.DEMO.DISPLAY_WIDTH
29 |         self.display_height = cfg.DEMO.DISPLAY_HEIGHT
30 | 
31 |         self.cap = cv2.VideoCapture(self.source)
32 | 
33 |         if self.display_width > 0 and self.display_height > 0:
34 |             self.cap.set(cv2.CAP_PROP_FRAME_WIDTH, self.display_width)
35 |             self.cap.set(cv2.CAP_PROP_FRAME_HEIGHT, self.display_height)
36 |         else:
37 |             self.display_width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
38 |             self.display_height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
39 | 
40 |         if not self.cap.isOpened():
41 |             raise IOError("Video {} cannot be opened".format(self.source))
42 | 
43 |         self.output_file = None
44 |         if cfg.DEMO.OUTPUT_FILE != "":
45 |             if cfg.DEMO.OUTPUT_FPS == -1:
46 |                 output_fps = self.cap.get(cv2.CAP_PROP_FPS)
47 |             else:
48 |                 output_fps = cfg.DEMO.OUTPUT_FPS
49 |             self.output_file = self.get_output_file(
50 |                 cfg.DEMO.OUTPUT_FILE, fps=output_fps
51 |             )
52 |         self.id = -1
53 |         self.buffer = []
54 |         self.buffer_size = cfg.DEMO.BUFFER_SIZE
55 |         self.seq_length = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE
56 |         self.test_crop_size = cfg.DATA.TEST_CROP_SIZE
57 |         self.clip_vis_size = cfg.DEMO.CLIP_VIS_SIZE
58 | 
59 |     def __iter__(self):
60 |         return self
61 | 
62 |     def __next__(self):
63 |         """
64 |         Read and return the required number of frames for 1 clip.
65 |         Returns:
66 |             was_read (bool): False if not enough frames to return.
67 |             task (TaskInfo object): object containing metadata for the current clip.
68 | """ 69 | self.id += 1 70 | task = TaskInfo() 71 | 72 | task.img_height = self.display_height 73 | task.img_width = self.display_width 74 | task.crop_size = self.test_crop_size 75 | task.clip_vis_size = self.clip_vis_size 76 | 77 | frames = [] 78 | if len(self.buffer) != 0: 79 | frames = self.buffer 80 | was_read = True 81 | while was_read and len(frames) < self.seq_length: 82 | was_read, frame = self.cap.read() 83 | frames.append(frame) 84 | if was_read and self.buffer_size != 0: 85 | self.buffer = frames[-self.buffer_size :] 86 | 87 | task.add_frames(self.id, frames) 88 | task.num_buffer_frames = 0 if self.id == 0 else self.buffer_size 89 | 90 | return was_read, task 91 | 92 | def get_output_file(self, path, fps=30): 93 | """ 94 | Return a video writer object. 95 | Args: 96 | path (str): path to the output video file. 97 | fps (int or float): frames per second. 98 | """ 99 | return cv2.VideoWriter( 100 | filename=path, 101 | fourcc=cv2.VideoWriter_fourcc(*"mp4v"), 102 | fps=float(fps), 103 | frameSize=(self.display_width, self.display_height), 104 | isColor=True, 105 | ) 106 | 107 | def display(self, frame): 108 | """ 109 | Either display a single frame (BGR image) to a window or write to 110 | an output file if output path is provided. 111 | """ 112 | if self.output_file is None: 113 | cv2.imshow("SlowFast", frame) 114 | else: 115 | self.output_file.write(frame) 116 | 117 | def clean(self): 118 | """ 119 | Clean up open video files and windows. 120 | """ 121 | self.cap.release() 122 | if self.output_file is None: 123 | cv2.destroyAllWindows() 124 | else: 125 | self.output_file.release() 126 | -------------------------------------------------------------------------------- /slowfast/visualization/predictor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | import cv2 5 | import torch 6 | from detectron2 import model_zoo 7 | from detectron2.config import get_cfg 8 | from detectron2.engine import DefaultPredictor 9 | 10 | import slowfast.utils.checkpoint as cu 11 | from slowfast.datasets import cv2_transform 12 | from slowfast.models import build_model 13 | from slowfast.utils import logging 14 | from slowfast.visualization.utils import process_cv2_inputs 15 | 16 | logger = logging.get_logger(__name__) 17 | 18 | 19 | class Predictor: 20 | """ 21 | Action Predictor for action recognition. 22 | """ 23 | 24 | def __init__(self, cfg, gpu_id=None): 25 | """ 26 | Args: 27 | cfg (CfgNode): configs. Details can be found in 28 | slowfast/config/defaults.py 29 | gpu_id (Optional[int]): GPU id. 30 | """ 31 | if cfg.NUM_GPUS: 32 | self.gpu_id = torch.cuda.current_device() if gpu_id is None else gpu_id 33 | 34 | # Build the video model and print model statistics. 35 | self.model = build_model(cfg, gpu_id=gpu_id) 36 | self.model.eval() 37 | self.cfg = cfg 38 | 39 | if cfg.DETECTION.ENABLE: 40 | self.object_detector = Detectron2Predictor(cfg, gpu_id=self.gpu_id) 41 | 42 | logger.info("Start loading model weights.") 43 | cu.load_test_checkpoint(cfg, self.model) 44 | logger.info("Finish loading model weights") 45 | 46 | def __call__(self, task): 47 | """ 48 | Returns the prediction results for the current task. 49 | Args: 50 | task (TaskInfo object): task object that contain 51 | the necessary information for action prediction. (e.g. 
52 |         Returns:
53 |             task (TaskInfo object): the same task info object but filled with
54 |                 prediction values (a tensor) and the corresponding boxes for
55 |                 action detection task.
56 |         """
57 |         if self.cfg.DETECTION.ENABLE:
58 |             task = self.object_detector(task)
59 | 
60 |         frames, bboxes = task.frames, task.bboxes
61 |         if bboxes is not None:
62 |             bboxes = cv2_transform.scale_boxes(
63 |                 self.cfg.DATA.TEST_CROP_SIZE,
64 |                 bboxes,
65 |                 task.img_height,
66 |                 task.img_width,
67 |             )
68 |         if self.cfg.DEMO.INPUT_FORMAT == "BGR":
69 |             frames = [
70 |                 cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in frames
71 |             ]
72 | 
73 |         frames = [
74 |             cv2_transform.scale(self.cfg.DATA.TEST_CROP_SIZE, frame)
75 |             for frame in frames
76 |         ]
77 |         inputs = process_cv2_inputs(frames, self.cfg)
78 |         if bboxes is not None:
79 |             index_pad = torch.full(
80 |                 size=(bboxes.shape[0], 1),
81 |                 fill_value=float(0),
82 |                 device=bboxes.device,
83 |             )
84 | 
85 |             # Pad frame index for each box.
86 |             bboxes = torch.cat([index_pad, bboxes], axis=1)
87 |         if self.cfg.NUM_GPUS > 0:
88 |             # Transfer the data to the current GPU device.
89 |             if isinstance(inputs, (list,)):
90 |                 for i in range(len(inputs)):
91 |                     inputs[i] = inputs[i].cuda(
92 |                         device=torch.device(self.gpu_id), non_blocking=True
93 |                     )
94 |             else:
95 |                 inputs = inputs.cuda(
96 |                     device=torch.device(self.gpu_id), non_blocking=True
97 |                 )
98 |         if self.cfg.DETECTION.ENABLE and not bboxes.shape[0]:
99 |             preds = torch.tensor([])
100 |         else:
101 |             preds = self.model(inputs, bboxes)
102 | 
103 |         if self.cfg.NUM_GPUS:
104 |             preds = preds.cpu()
105 |             if bboxes is not None:
106 |                 bboxes = bboxes.detach().cpu()
107 | 
108 |         preds = preds.detach()
109 |         task.add_action_preds(preds)
110 |         if bboxes is not None:
111 |             task.add_bboxes(bboxes[:, 1:])
112 | 
113 |         return task
114 | 
115 | 
116 | class ActionPredictor:
117 |     """
118 |     Synchronous Action Prediction and Visualization pipeline with AsyncVis.
119 |     """
120 |     def __init__(self, cfg, async_vis=None, gpu_id=None):
121 |         """
122 |         Args:
123 |             cfg (CfgNode): configs. Details can be found in
124 |                 slowfast/config/defaults.py
125 |             async_vis (AsyncVis object): asynchronous visualizer.
126 |             gpu_id (Optional[int]): GPU id.
127 |         """
128 |         self.predictor = Predictor(cfg=cfg, gpu_id=gpu_id)
129 |         self.async_vis = async_vis
130 | 
131 |     def put(self, task):
132 |         """
133 |         Make prediction and put the results in `async_vis` task queue.
134 |         Args:
135 |             task (TaskInfo object): task object that contains
136 |                 the necessary information for action prediction. (e.g. frames, boxes)
137 |         """
138 |         task = self.predictor(task)
139 |         self.async_vis.put(task)
140 | 
141 | 
142 | class Detectron2Predictor:
143 |     """
144 |     Wrapper around Detectron2 to return the required predicted bounding boxes
145 |     as an ndarray.
146 |     """
147 | 
148 |     def __init__(self, cfg, gpu_id=None):
149 |         """
150 |         Args:
151 |             cfg (CfgNode): configs. Details can be found in
152 |                 slowfast/config/defaults.py
153 |             gpu_id (Optional[int]): GPU id.
154 | """ 155 | 156 | self.cfg = get_cfg() 157 | self.cfg.merge_from_file( 158 | model_zoo.get_config_file(cfg.DEMO.DETECTRON2_CFG) 159 | ) 160 | self.cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = cfg.DEMO.DETECTRON2_THRESH 161 | self.cfg.MODEL.WEIGHTS = cfg.DEMO.DETECTRON2_WEIGHTS 162 | self.cfg.INPUT.FORMAT = cfg.DEMO.INPUT_FORMAT 163 | if cfg.NUM_GPUS and gpu_id is None: 164 | gpu_id = torch.cuda.current_device() 165 | self.cfg.MODEL.DEVICE = ( 166 | "cuda:{}".format(gpu_id) if cfg.NUM_GPUS > 0 else "cpu" 167 | ) 168 | 169 | logger.info("Initialized Detectron2 Object Detection Model.") 170 | 171 | self.predictor = DefaultPredictor(self.cfg) 172 | 173 | def __call__(self, task): 174 | """ 175 | Return bounding boxes predictions as a tensor. 176 | Args: 177 | task (TaskInfo object): task object that contain 178 | the necessary information for action prediction. (e.g. frames, boxes) 179 | Returns: 180 | task (TaskInfo object): the same task info object but filled with 181 | prediction values (a tensor) and the corresponding boxes for 182 | action detection task. 183 | """ 184 | middle_frame = task.frames[len(task.frames) // 2] 185 | outputs = self.predictor(middle_frame) 186 | # Get only human instances 187 | mask = outputs["instances"].pred_classes == 0 188 | pred_boxes = outputs["instances"].pred_boxes.tensor[mask] 189 | task.add_bboxes(pred_boxes) 190 | 191 | return task 192 | -------------------------------------------------------------------------------- /tools/benchmark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | """ 4 | A script to benchmark data loading. 5 | """ 6 | 7 | import slowfast.utils.logging as logging 8 | from slowfast.utils.benchmark import benchmark_data_loading 9 | from slowfast.utils.misc import launch_job 10 | from slowfast.utils.parser import load_config, parse_args 11 | 12 | logger = logging.get_logger(__name__) 13 | 14 | 15 | def main(): 16 | args = parse_args() 17 | cfg = load_config(args) 18 | 19 | launch_job( 20 | cfg=cfg, init_method=args.init_method, func=benchmark_data_loading 21 | ) 22 | 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /tools/demo_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | import numpy as np 5 | import queue 6 | import cv2 7 | import torch 8 | import tqdm 9 | 10 | from slowfast.utils import logging 11 | from slowfast.visualization.async_predictor import ( 12 | AsycnActionPredictor, 13 | AsyncVis, 14 | ) 15 | from slowfast.visualization.ava_demo_precomputed_boxes import ( 16 | AVAVisualizerWithPrecomputedBox, 17 | ) 18 | from slowfast.visualization.demo_loader import VideoReader 19 | from slowfast.visualization.predictor import ActionPredictor 20 | from slowfast.visualization.video_visualizer import VideoVisualizer 21 | 22 | logger = logging.get_logger(__name__) 23 | 24 | 25 | def run_demo(cfg, frame_provider): 26 | """ 27 | Run demo visualization. 28 | Args: 29 | cfg (CfgNode): configs. Details can be found in 30 | slowfast/config/defaults.py 31 | frame_provider (iterator): Python iterator that return task objects that are filled 32 | with necessary information such as `frames`, `id` and `num_buffer_frames` for the 33 | prediction and visualization pipeline. 
34 | """ 35 | # Set random seed from configs. 36 | np.random.seed(cfg.RNG_SEED) 37 | torch.manual_seed(cfg.RNG_SEED) 38 | # Setup logging format. 39 | logging.setup_logging(cfg.OUTPUT_DIR) 40 | # Print config. 41 | logger.info("Run demo with config:") 42 | logger.info(cfg) 43 | 44 | common_classes = ( 45 | cfg.DEMO.COMMON_CLASS_NAMES 46 | if len(cfg.DEMO.LABEL_FILE_PATH) != 0 47 | else None 48 | ) 49 | 50 | video_vis = VideoVisualizer( 51 | num_classes=cfg.MODEL.NUM_CLASSES, 52 | class_names_path=cfg.DEMO.LABEL_FILE_PATH, 53 | top_k=cfg.TENSORBOARD.MODEL_VIS.TOPK_PREDS, 54 | thres=cfg.DEMO.COMMON_CLASS_THRES, 55 | lower_thres=cfg.DEMO.UNCOMMON_CLASS_THRES, 56 | common_class_names=common_classes, 57 | colormap=cfg.TENSORBOARD.MODEL_VIS.COLORMAP, 58 | mode=cfg.DEMO.VIS_MODE, 59 | ) 60 | 61 | async_vis = AsyncVis(video_vis, n_workers=cfg.DEMO.NUM_VIS_INSTANCES) 62 | 63 | if cfg.NUM_GPUS <= 1: 64 | model = ActionPredictor(cfg=cfg, async_vis=async_vis) 65 | else: 66 | model = AsycnActionPredictor(cfg, async_vis.task_queue) 67 | 68 | seq_len = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE 69 | 70 | assert ( 71 | cfg.DEMO.BUFFER_SIZE <= seq_len // 2 72 | ), "Buffer size cannot be greater than half of sequence length." 73 | num_task = 0 74 | for able_to_read, task in frame_provider: 75 | if not able_to_read: 76 | break 77 | num_task += 1 78 | 79 | model.put(task) 80 | 81 | try: 82 | frames = async_vis.get() 83 | num_task -= 1 84 | yield frames 85 | except queue.Empty: 86 | continue 87 | # hit Esc to quit the demo. 88 | key = cv2.waitKey(1) 89 | if key == 27: 90 | break 91 | 92 | while num_task != 0: 93 | try: 94 | frames = async_vis.get() 95 | num_task -= 1 96 | yield frames 97 | except queue.Empty: 98 | continue 99 | # hit Esc to quit the demo. 100 | key = cv2.waitKey(1) 101 | if key == 27: 102 | break 103 | 104 | 105 | def demo(cfg): 106 | """ 107 | Run inference on an input video or stream from webcam. 108 | Args: 109 | cfg (CfgNode): configs. Details can be found in 110 | slowfast/config/defaults.py 111 | """ 112 | # AVA format-specific visualization with precomputed boxes. 113 | if cfg.DETECTION.ENABLE and cfg.DEMO.PREDS_BOXES != "": 114 | precomputed_box_vis = AVAVisualizerWithPrecomputedBox(cfg) 115 | precomputed_box_vis() 116 | else: 117 | frame_provider = VideoReader(cfg) 118 | 119 | for frames in tqdm.tqdm(run_demo(cfg, frame_provider)): 120 | for frame in frames: 121 | frame_provider.display(frame) 122 | frame_provider.clean() 123 | -------------------------------------------------------------------------------- /tools/run_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Wrapper to train and test a video classification model.""" 5 | import torch 6 | 7 | from slowfast.utils.misc import launch_job 8 | from slowfast.utils.parser import load_config, parse_args 9 | 10 | from demo_net import demo 11 | from test_net import test 12 | from train_net import train 13 | from visualization import visualize 14 | 15 | 16 | def main(): 17 | """ 18 | Main function to spawn the train and test process. 19 | """ 20 | args = parse_args() 21 | cfg = load_config(args) 22 | 23 | # Perform training. 24 | if cfg.TRAIN.ENABLE: 25 | launch_job(cfg=cfg, init_method=args.init_method, func=train) 26 | 27 | # Perform multi-clip testing. 28 | if cfg.TEST.ENABLE: 29 | launch_job(cfg=cfg, init_method=args.init_method, func=test) 30 | 31 | # Perform model visualization. 
32 |     if cfg.TENSORBOARD.ENABLE and cfg.TENSORBOARD.MODEL_VIS.ENABLE:
33 |         launch_job(cfg=cfg, init_method=args.init_method, func=visualize)
34 | 
35 |     # Run demo.
36 |     if cfg.DEMO.ENABLE:
37 |         demo(cfg)
38 | 
39 | 
40 | if __name__ == "__main__":
41 |     # torch.multiprocessing.set_start_method("forkserver")
42 |     main()
43 | 
--------------------------------------------------------------------------------
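For reference, a typical invocation of this wrapper might look like the following; the config path is from this repository's own configs directory, `NUM_GPUS` and `TRAIN.BATCH_SIZE` are standard keys in slowfast/config/defaults.py, and the trailing opts are merged by load_config above:

    python tools/run_net.py \
      --cfg configs/Kinetics/SLOWFAST_4x16_R50.yaml \
      NUM_GPUS 1 TRAIN.BATCH_SIZE 8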