├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── configs ├── Kinetics │ ├── SLOWFAST_4x16_R50.yaml │ ├── SLOWFAST_8x8_R101.yaml │ ├── SLOWFAST_8x8_R50.yaml │ ├── TimeSformer_divST_16x16_448.yaml │ ├── TimeSformer_divST_8x32_224.yaml │ ├── TimeSformer_divST_8x32_224_4gpus.yaml │ ├── TimeSformer_divST_8x32_224_TEST.yaml │ ├── TimeSformer_divST_96x4_224.yaml │ ├── TimeSformer_jointST_8x32_224.yaml │ └── TimeSformer_spaceOnly_8x32_224.yaml └── SSv2 │ ├── SLOWFAST_16x8_R50.yaml │ ├── TimeSformer_divST_16_448.yaml │ ├── TimeSformer_divST_64_224.yaml │ └── TimeSformer_divST_8_224.yaml ├── environment.yml ├── example.ipynb ├── setup.cfg ├── setup.py ├── slurm_scripts ├── run_multi_node_job.sh └── run_single_node_job.sh ├── timesformer ├── __init__.py ├── config │ ├── __init__.py │ └── defaults.py ├── datasets │ ├── DATASET.md │ ├── __init__.py │ ├── build.py │ ├── cv2_transform.py │ ├── decoder.py │ ├── kinetics.py │ ├── loader.py │ ├── multigrid_helper.py │ ├── ssv2.py │ ├── transform.py │ ├── utils.py │ └── video_container.py ├── models │ ├── __init__.py │ ├── batchnorm_helper.py │ ├── build.py │ ├── conv2d_same.py │ ├── custom_video_model_builder.py │ ├── features.py │ ├── head_helper.py │ ├── helpers.py │ ├── linear.py │ ├── losses.py │ ├── nonlocal_helper.py │ ├── operators.py │ ├── optimizer.py │ ├── resnet_helper.py │ ├── stem_helper.py │ ├── video_model_builder.py │ ├── vit.py │ └── vit_utils.py ├── utils │ ├── __init__.py │ ├── ava_eval_helper.py │ ├── ava_evaluation │ │ ├── README.md │ │ ├── __init__.py │ │ ├── ava_action_list_v2.1_for_activitynet_2018.pbtxt.txt │ │ ├── label_map_util.py │ │ ├── metrics.py │ │ ├── np_box_list.py │ │ ├── np_box_list_ops.py │ │ ├── np_box_mask_list.py │ │ ├── np_box_mask_list_ops.py │ │ ├── np_box_ops.py │ │ ├── np_mask_ops.py │ │ ├── object_detection_evaluation.py │ │ ├── per_image_evaluation.py │ │ └── standard_fields.py │ ├── benchmark.py │ ├── bn_helper.py │ ├── c2_model_loading.py │ ├── checkpoint.py │ ├── distributed.py │ ├── env.py │ ├── logging.py │ ├── lr_policy.py │ ├── meters.py │ ├── metrics.py │ ├── misc.py │ ├── multigrid.py │ ├── multiprocessing.py │ ├── parser.py │ └── weight_init_helper.py └── visualization │ ├── __init__.py │ ├── tensorboard_vis.py │ └── utils.py └── tools ├── benchmark.py ├── run_net.py ├── submit.py ├── test_net.py ├── train_net.py └── visualization.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # Docker file from Python is inspired from here : 6 | # https://github.com/github/gitignore/blob/master/Python.gitignore 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | tests/report/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # pytype static type analyzer 139 | .pytype/ 140 | 141 | 142 | # Cython debug symbols 143 | cython_debug/ 144 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to TimeSformer 2 | 3 | ## Pull Requests 4 | We actively welcome your pull requests. 5 | 6 | 1. Fork the repo and create your branch from `master`. 7 | 2. If you've added code that should be tested, add tests. 8 | 3. If you've changed APIs, update the documentation. 9 | 4. Ensure the test suite passes. 10 | 5. Make sure your code lints. 11 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 12 | 13 | ## Contributor License Agreement ("CLA") 14 | In order to accept your pull request, we need you to submit a CLA. You only need 15 | to do this once to work on any of Facebook's open source projects. 16 | 17 | Complete your CLA here: 18 | 19 | ## Issues 20 | We use GitHub issues to track public bugs. 
Please ensure your description is 21 | clear and has sufficient instructions to be able to reproduce the issue. 22 | 23 | ## License 24 | By contributing to TimeSformer, you agree that your contributions will be licensed 25 | under the [LICENSE.md](LICENSE.md) file in the root directory of this source tree. 26 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOWFAST_4x16_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 32 11 | SAMPLING_RATE: 2 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 256 15 | INPUT_CHANNEL_NUM: [3, 3] 16 | SLOWFAST: 17 | ALPHA: 8 18 | BETA_INV: 8 19 | FUSION_CONV_CHANNEL_RATIO: 2 20 | FUSION_KERNEL_SZ: 5 21 | RESNET: 22 | ZERO_INIT_FINAL_BN: True 23 | WIDTH_PER_GROUP: 64 24 | NUM_GROUPS: 1 25 | DEPTH: 50 26 | TRANS_FUNC: bottleneck_transform 27 | STRIDE_1X1: False 28 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 29 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 30 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 31 | NONLOCAL: 32 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 33 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 34 | INSTANTIATION: dot_product 35 | BN: 36 | USE_PRECISE_STATS: True 37 | NUM_BATCHES_PRECISE: 200 38 | SOLVER: 39 | BASE_LR: 0.8 40 | LR_POLICY: cosine 41 | MAX_EPOCH: 196 42 | MOMENTUM: 0.9 43 | WEIGHT_DECAY: 1e-4 44 | WARMUP_EPOCHS: 34.0 45 | WARMUP_START_LR: 0.01 46 | OPTIMIZING_METHOD: sgd 47 | MODEL: 48 | NUM_CLASSES: 400 49 | ARCH: slowfast 50 | MODEL_NAME: SlowFast 51 | LOSS_FUNC: cross_entropy 52 | DROPOUT_RATE: 0.5 53 | TEST: 54 | ENABLE: True 55 | DATASET: kinetics 56 | BATCH_SIZE: 64 57 | DATA_LOADER: 58 | NUM_WORKERS: 8 59 | PIN_MEMORY: True 60 | NUM_GPUS: 8 61 | NUM_SHARDS: 1 62 | RNG_SEED: 0 63 | OUTPUT_DIR: . 
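Note: every YAML under `configs/` is consumed by `tools/run_net.py`, and any key in the file can be overridden by appending `KEY VALUE` pairs after `--cfg`, the same mechanism used in the SLURM scripts later in this listing. A minimal single-node launch sketch for the config above, with the dataset path left as a placeholder to adapt:

```bash
# Single-node launch sketch for the SlowFast 4x16 R50 config above.
# KEY VALUE pairs after --cfg override the corresponding YAML entries
# (same mechanism as slurm_scripts/run_single_node_job.sh).
python tools/run_net.py \
  --cfg configs/Kinetics/SLOWFAST_4x16_R50.yaml \
  DATA.PATH_TO_DATA_DIR /path/to/kinetics \
  NUM_GPUS 8 \
  TRAIN.BATCH_SIZE 64
```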
64 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOWFAST_8x8_R101.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 32 11 | SAMPLING_RATE: 2 12 | TRAIN_JITTER_SCALES: [256, 340] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 256 15 | INPUT_CHANNEL_NUM: [3, 3] 16 | SLOWFAST: 17 | ALPHA: 4 18 | BETA_INV: 8 19 | FUSION_CONV_CHANNEL_RATIO: 2 20 | FUSION_KERNEL_SZ: 5 21 | RESNET: 22 | ZERO_INIT_FINAL_BN: True 23 | WIDTH_PER_GROUP: 64 24 | NUM_GROUPS: 1 25 | DEPTH: 101 26 | TRANS_FUNC: bottleneck_transform 27 | STRIDE_1X1: False 28 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 29 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 30 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 31 | NONLOCAL: 32 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 33 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 34 | INSTANTIATION: dot_product 35 | BN: 36 | USE_PRECISE_STATS: True 37 | NUM_BATCHES_PRECISE: 200 38 | SOLVER: 39 | BASE_LR: 0.8 ## 8 nodes 40 | LR_POLICY: cosine 41 | MAX_EPOCH: 196 42 | MOMENTUM: 0.9 43 | WEIGHT_DECAY: 1e-4 44 | WARMUP_EPOCHS: 34.0 45 | WARMUP_START_LR: 0.01 46 | OPTIMIZING_METHOD: sgd 47 | MODEL: 48 | NUM_CLASSES: 400 49 | ARCH: slowfast 50 | MODEL_NAME: SlowFast 51 | LOSS_FUNC: cross_entropy 52 | DROPOUT_RATE: 0.5 53 | TEST: 54 | ENABLE: True 55 | DATASET: kinetics 56 | BATCH_SIZE: 64 57 | DATA_LOADER: 58 | NUM_WORKERS: 8 59 | PIN_MEMORY: True 60 | NUM_GPUS: 8 61 | NUM_SHARDS: 1 62 | RNG_SEED: 0 63 | OUTPUT_DIR: . 64 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOWFAST_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 32 11 | SAMPLING_RATE: 2 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 256 15 | INPUT_CHANNEL_NUM: [3, 3] 16 | SLOWFAST: 17 | ALPHA: 4 18 | BETA_INV: 8 19 | FUSION_CONV_CHANNEL_RATIO: 2 20 | FUSION_KERNEL_SZ: 7 21 | RESNET: 22 | ZERO_INIT_FINAL_BN: True 23 | WIDTH_PER_GROUP: 64 24 | NUM_GROUPS: 1 25 | DEPTH: 50 26 | TRANS_FUNC: bottleneck_transform 27 | STRIDE_1X1: False 28 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 29 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 30 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 31 | NONLOCAL: 32 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 33 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 34 | INSTANTIATION: dot_product 35 | BN: 36 | USE_PRECISE_STATS: True 37 | NUM_BATCHES_PRECISE: 200 38 | SOLVER: 39 | BASE_LR: 0.8 40 | LR_POLICY: cosine 41 | MAX_EPOCH: 196 42 | MOMENTUM: 0.9 43 | WEIGHT_DECAY: 1e-4 44 | WARMUP_EPOCHS: 34.0 45 | WARMUP_START_LR: 0.01 46 | OPTIMIZING_METHOD: sgd 47 | MODEL: 48 | NUM_CLASSES: 400 49 | ARCH: slowfast 50 | MODEL_NAME: SlowFast 51 | LOSS_FUNC: cross_entropy 52 | DROPOUT_RATE: 0.5 53 | TEST: 54 | ENABLE: True 55 | DATASET: kinetics 56 | BATCH_SIZE: 64 57 | DATA_LOADER: 58 | NUM_WORKERS: 8 59 | PIN_MEMORY: True 60 | NUM_GPUS: 8 61 | NUM_SHARDS: 1 62 | RNG_SEED: 0 63 | OUTPUT_DIR: . 
64 | -------------------------------------------------------------------------------- /configs/Kinetics/TimeSformer_divST_16x16_448.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 16 11 | SAMPLING_RATE: 16 12 | TRAIN_JITTER_SCALES: [448, 512] 13 | TRAIN_CROP_SIZE: 448 14 | TEST_CROP_SIZE: 448 15 | INPUT_CHANNEL_NUM: [3] 16 | TIMESFORMER: 17 | ATTENTION_TYPE: 'divided_space_time' 18 | SOLVER: 19 | BASE_LR: 0.005 20 | LR_POLICY: steps_with_relative_lrs 21 | STEPS: [0, 11, 14] 22 | LRS: [1, 0.1, 0.01] 23 | MAX_EPOCH: 15 24 | MOMENTUM: 0.9 25 | WEIGHT_DECAY: 1e-4 26 | OPTIMIZING_METHOD: sgd 27 | MODEL: 28 | MODEL_NAME: vit_base_patch16_224 29 | NUM_CLASSES: 400 30 | ARCH: vit 31 | LOSS_FUNC: cross_entropy 32 | DROPOUT_RATE: 0.5 33 | TEST: 34 | ENABLE: True 35 | DATASET: kinetics 36 | BATCH_SIZE: 8 37 | NUM_ENSEMBLE_VIEWS: 1 38 | NUM_SPATIAL_CROPS: 3 39 | DATA_LOADER: 40 | NUM_WORKERS: 8 41 | PIN_MEMORY: True 42 | NUM_GPUS: 8 43 | NUM_SHARDS: 1 44 | RNG_SEED: 0 45 | OUTPUT_DIR: . 46 | -------------------------------------------------------------------------------- /configs/Kinetics/TimeSformer_divST_8x32_224.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 8 11 | SAMPLING_RATE: 32 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 224 15 | INPUT_CHANNEL_NUM: [3] 16 | TIMESFORMER: 17 | ATTENTION_TYPE: 'divided_space_time' 18 | SOLVER: 19 | BASE_LR: 0.005 20 | LR_POLICY: steps_with_relative_lrs 21 | STEPS: [0, 11, 14] 22 | LRS: [1, 0.1, 0.01] 23 | MAX_EPOCH: 15 24 | MOMENTUM: 0.9 25 | WEIGHT_DECAY: 1e-4 26 | OPTIMIZING_METHOD: sgd 27 | MODEL: 28 | MODEL_NAME: vit_base_patch16_224 29 | NUM_CLASSES: 400 30 | ARCH: vit 31 | LOSS_FUNC: cross_entropy 32 | DROPOUT_RATE: 0.5 33 | TEST: 34 | ENABLE: True 35 | DATASET: kinetics 36 | BATCH_SIZE: 8 37 | NUM_ENSEMBLE_VIEWS: 1 38 | NUM_SPATIAL_CROPS: 3 39 | DATA_LOADER: 40 | NUM_WORKERS: 8 41 | PIN_MEMORY: True 42 | NUM_GPUS: 8 43 | NUM_SHARDS: 1 44 | RNG_SEED: 0 45 | OUTPUT_DIR: . 
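For multi-node training of the `TimeSformer_divST_8x32_224` config above, `slurm_scripts/run_multi_node_job.sh` (included later in this listing) submits the job through `tools/submit.py`; the sketch below mirrors that script, with the job directory and partition as placeholders for your cluster and `--num_shards` giving the number of nodes:

```bash
# Multi-node SLURM submission sketch, mirroring slurm_scripts/run_multi_node_job.sh;
# /your/job/dir and the partition name are placeholders.
JOB_NAME=TimeSformer_divST_8x32_224
python tools/submit.py \
  --cfg configs/Kinetics/TimeSformer_divST_8x32_224.yaml \
  --job_dir /your/job/dir/${JOB_NAME}/ \
  --num_shards 4 \
  --partition dev \
  --comment "" \
  --name ${JOB_NAME} \
  --use_volta32
```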
46 | -------------------------------------------------------------------------------- /configs/Kinetics/TimeSformer_divST_8x32_224_4gpus.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 4 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 8 11 | SAMPLING_RATE: 32 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 224 15 | INPUT_CHANNEL_NUM: [3] 16 | TIMESFORMER: 17 | ATTENTION_TYPE: 'divided_space_time' 18 | SOLVER: 19 | BASE_LR: 0.005 20 | LR_POLICY: steps_with_relative_lrs 21 | STEPS: [0, 11, 14] 22 | LRS: [1, 0.1, 0.01] 23 | MAX_EPOCH: 15 24 | MOMENTUM: 0.9 25 | WEIGHT_DECAY: 1e-4 26 | OPTIMIZING_METHOD: sgd 27 | MODEL: 28 | MODEL_NAME: vit_base_patch16_224 29 | NUM_CLASSES: 400 30 | ARCH: vit 31 | LOSS_FUNC: cross_entropy 32 | DROPOUT_RATE: 0.5 33 | TEST: 34 | ENABLE: True 35 | DATASET: kinetics 36 | BATCH_SIZE: 4 37 | NUM_ENSEMBLE_VIEWS: 1 38 | NUM_SPATIAL_CROPS: 3 39 | DATA_LOADER: 40 | NUM_WORKERS: 4 41 | PIN_MEMORY: True 42 | NUM_GPUS: 4 43 | NUM_SHARDS: 1 44 | RNG_SEED: 0 45 | OUTPUT_DIR: . 46 | -------------------------------------------------------------------------------- /configs/Kinetics/TimeSformer_divST_8x32_224_TEST.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: kinetics 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 8 11 | SAMPLING_RATE: 32 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 224 15 | INPUT_CHANNEL_NUM: [3] 16 | TIMESFORMER: 17 | ATTENTION_TYPE: 'divided_space_time' 18 | SOLVER: 19 | BASE_LR: 0.005 20 | LR_POLICY: steps_with_relative_lrs 21 | STEPS: [0, 11, 14] 22 | LRS: [1, 0.1, 0.01] 23 | MAX_EPOCH: 15 24 | MOMENTUM: 0.9 25 | WEIGHT_DECAY: 1e-4 26 | OPTIMIZING_METHOD: sgd 27 | MODEL: 28 | MODEL_NAME: vit_base_patch16_224 29 | NUM_CLASSES: 400 30 | ARCH: vit 31 | LOSS_FUNC: cross_entropy 32 | DROPOUT_RATE: 0.5 33 | TEST: 34 | ENABLE: True 35 | DATASET: kinetics 36 | BATCH_SIZE: 8 37 | NUM_ENSEMBLE_VIEWS: 1 38 | NUM_SPATIAL_CROPS: 3 39 | CHECKPOINT_FILE_PATH: '/checkpoint/gedas/jobs/timesformer/kinetics_400/TimeSformer_divST_8x32_224/checkpoints/checkpoint_epoch_00025.pyth' 40 | DATA_LOADER: 41 | NUM_WORKERS: 8 42 | PIN_MEMORY: True 43 | NUM_GPUS: 8 44 | NUM_SHARDS: 1 45 | RNG_SEED: 0 46 | OUTPUT_DIR: . 
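The `_TEST` variant above disables training (`TRAIN.ENABLE: False`) and hard-codes a cluster-specific checkpoint path, so evaluating your own model typically means overriding `TEST.CHECKPOINT_FILE_PATH` on the command line. A hedged sketch, assuming the same `KEY VALUE` override mechanism and a placeholder checkpoint path:

```bash
# Evaluation-only sketch: training is already disabled in this config, so only
# the test phase runs; the checkpoint path below is a placeholder.
python tools/run_net.py \
  --cfg configs/Kinetics/TimeSformer_divST_8x32_224_TEST.yaml \
  TEST.CHECKPOINT_FILE_PATH /path/to/your/checkpoint.pyth \
  NUM_GPUS 8
```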
47 | -------------------------------------------------------------------------------- /configs/Kinetics/TimeSformer_divST_96x4_224.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 96 11 | SAMPLING_RATE: 4 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 224 15 | INPUT_CHANNEL_NUM: [3] 16 | TIMESFORMER: 17 | ATTENTION_TYPE: 'divided_space_time' 18 | SOLVER: 19 | BASE_LR: 0.005 20 | LR_POLICY: steps_with_relative_lrs 21 | STEPS: [0, 11, 14] 22 | LRS: [1, 0.1, 0.01] 23 | MAX_EPOCH: 15 24 | MOMENTUM: 0.9 25 | WEIGHT_DECAY: 1e-4 26 | OPTIMIZING_METHOD: sgd 27 | MODEL: 28 | MODEL_NAME: vit_base_patch16_224 29 | NUM_CLASSES: 400 30 | ARCH: vit 31 | LOSS_FUNC: cross_entropy 32 | DROPOUT_RATE: 0.5 33 | TEST: 34 | ENABLE: True 35 | DATASET: kinetics 36 | BATCH_SIZE: 8 37 | NUM_ENSEMBLE_VIEWS: 1 38 | NUM_SPATIAL_CROPS: 3 39 | DATA_LOADER: 40 | NUM_WORKERS: 8 41 | PIN_MEMORY: True 42 | NUM_GPUS: 8 43 | NUM_SHARDS: 1 44 | RNG_SEED: 0 45 | OUTPUT_DIR: . 46 | -------------------------------------------------------------------------------- /configs/Kinetics/TimeSformer_jointST_8x32_224.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 8 11 | SAMPLING_RATE: 32 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 224 15 | INPUT_CHANNEL_NUM: [3] 16 | TIMESFORMER: 17 | ATTENTION_TYPE: 'joint_space_time' 18 | SOLVER: 19 | BASE_LR: 0.005 20 | LR_POLICY: steps_with_relative_lrs 21 | STEPS: [0, 11, 14] 22 | LRS: [1, 0.1, 0.01] 23 | MAX_EPOCH: 15 24 | MOMENTUM: 0.9 25 | WEIGHT_DECAY: 1e-4 26 | OPTIMIZING_METHOD: sgd 27 | MODEL: 28 | MODEL_NAME: vit_base_patch16_224 29 | NUM_CLASSES: 400 30 | ARCH: vit 31 | LOSS_FUNC: cross_entropy 32 | DROPOUT_RATE: 0.5 33 | TEST: 34 | ENABLE: True 35 | DATASET: kinetics 36 | BATCH_SIZE: 8 37 | NUM_ENSEMBLE_VIEWS: 1 38 | NUM_SPATIAL_CROPS: 3 39 | DATA_LOADER: 40 | NUM_WORKERS: 8 41 | PIN_MEMORY: True 42 | NUM_GPUS: 8 43 | NUM_SHARDS: 1 44 | RNG_SEED: 0 45 | OUTPUT_DIR: . 
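The divided, joint, and space-only Kinetics TimeSformer configs (the last one follows below) are otherwise identical and differ only in `TIMESFORMER.ATTENTION_TYPE`, so the attention variant can also be switched from the command line rather than kept in separate files; a sketch, assuming the usual override mechanism:

```bash
# Switch the attention variant by overriding a single key (sketch).
# Values used across these configs: divided_space_time, joint_space_time, space_only.
python tools/run_net.py \
  --cfg configs/Kinetics/TimeSformer_divST_8x32_224.yaml \
  TIMESFORMER.ATTENTION_TYPE joint_space_time
```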
46 | -------------------------------------------------------------------------------- /configs/Kinetics/TimeSformer_spaceOnly_8x32_224.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 8 11 | SAMPLING_RATE: 32 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 224 15 | INPUT_CHANNEL_NUM: [3] 16 | TIMESFORMER: 17 | ATTENTION_TYPE: 'space_only' 18 | SOLVER: 19 | BASE_LR: 0.005 20 | LR_POLICY: steps_with_relative_lrs 21 | STEPS: [0, 11, 14] 22 | LRS: [1, 0.1, 0.01] 23 | MAX_EPOCH: 15 24 | MOMENTUM: 0.9 25 | WEIGHT_DECAY: 1e-4 26 | OPTIMIZING_METHOD: sgd 27 | MODEL: 28 | MODEL_NAME: vit_base_patch16_224 29 | NUM_CLASSES: 400 30 | ARCH: vit 31 | LOSS_FUNC: cross_entropy 32 | DROPOUT_RATE: 0.5 33 | TEST: 34 | ENABLE: True 35 | DATASET: kinetics 36 | BATCH_SIZE: 8 37 | NUM_ENSEMBLE_VIEWS: 1 38 | NUM_SPATIAL_CROPS: 3 39 | DATA_LOADER: 40 | NUM_WORKERS: 8 41 | PIN_MEMORY: True 42 | NUM_GPUS: 8 43 | NUM_SHARDS: 1 44 | RNG_SEED: 0 45 | OUTPUT_DIR: . 46 | -------------------------------------------------------------------------------- /configs/SSv2/SLOWFAST_16x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: ssv2 4 | BATCH_SIZE: 16 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: " /path/to/ssv2/annotations/" 10 | PATH_PREFIX: "/path/to/ssv2/frames/" 11 | NUM_FRAMES: 64 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | INV_UNIFORM_SAMPLE: True 18 | RANDOM_FLIP: False 19 | REVERSE_INPUT_CHANNEL: True 20 | SLOWFAST: 21 | ALPHA: 4 22 | BETA_INV: 8 23 | FUSION_CONV_CHANNEL_RATIO: 2 24 | FUSION_KERNEL_SZ: 7 25 | RESNET: 26 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 27 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 28 | ZERO_INIT_FINAL_BN: True 29 | WIDTH_PER_GROUP: 64 30 | NUM_GROUPS: 1 31 | DEPTH: 50 32 | TRANS_FUNC: bottleneck_transform 33 | STRIDE_1X1: False 34 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 35 | NONLOCAL: 36 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 37 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 38 | INSTANTIATION: dot_product 39 | BN: 40 | USE_PRECISE_STATS: True 41 | NUM_BATCHES_PRECISE: 200 42 | NORM_TYPE: sync_batchnorm 43 | NUM_SYNC_DEVICES: 4 44 | SOLVER: 45 | BASE_LR: 0.2 #8 nodes 46 | LR_POLICY: cosine 47 | MAX_EPOCH: 200 48 | MOMENTUM: 0.9 49 | WEIGHT_DECAY: 1e-4 50 | WARMUP_EPOCHS: 34.0 51 | WARMUP_START_LR: 0.01 52 | OPTIMIZING_METHOD: sgd 53 | #SOLVER: 54 | # BASE_LR: 0.03 55 | # LR_POLICY: steps_with_relative_lrs 56 | # LRS: [1, 0.1, 0.01, 0.001, 0.0001, 0.00001] 57 | # STEPS: [0, 14, 18] 58 | # MAX_EPOCH: 22 59 | # MOMENTUM: 0.9 60 | # WEIGHT_DECAY: 1e-6 61 | # WARMUP_EPOCHS: 0.19 62 | # WARMUP_START_LR: 0.0001 63 | # OPTIMIZING_METHOD: sgd 64 | MODEL: 65 | NUM_CLASSES: 174 66 | ARCH: slowfast 67 | LOSS_FUNC: cross_entropy 68 | DROPOUT_RATE: 0.5 69 | TEST: 70 | ENABLE: True 71 | DATASET: ssv2 72 | BATCH_SIZE: 16 73 | NUM_ENSEMBLE_VIEWS: 1 74 | NUM_SPATIAL_CROPS: 1 75 | DATA_LOADER: 76 | NUM_WORKERS: 4 77 | PIN_MEMORY: True 78 | NUM_GPUS: 8 79 | NUM_SHARDS: 1 80 | RNG_SEED: 0 81 | OUTPUT_DIR: . 
82 | #LOG_MODEL_INFO: False 83 | LOG_MODEL_INFO: True 84 | -------------------------------------------------------------------------------- /configs/SSv2/TimeSformer_divST_16_448.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: ssv2 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: " /path/to/ssv2/annotations/" 10 | PATH_PREFIX: "/path/to/ssv2/frames/" 11 | NUM_FRAMES: 16 12 | TRAIN_JITTER_SCALES: [448, 512] 13 | TRAIN_CROP_SIZE: 448 14 | TEST_CROP_SIZE: 448 15 | INPUT_CHANNEL_NUM: [3] 16 | INV_UNIFORM_SAMPLE: True 17 | RANDOM_FLIP: False 18 | REVERSE_INPUT_CHANNEL: True 19 | TIMESFORMER: 20 | ATTENTION_TYPE: 'divided_space_time' 21 | SOLVER: 22 | BASE_LR: 0.005 23 | LR_POLICY: steps_with_relative_lrs 24 | STEPS: [0, 11, 14] 25 | LRS: [1, 0.1, 0.01] 26 | MAX_EPOCH: 15 27 | MOMENTUM: 0.9 28 | WEIGHT_DECAY: 1e-4 29 | OPTIMIZING_METHOD: sgd 30 | MODEL: 31 | MODEL_NAME: vit_base_patch16_224 32 | NUM_CLASSES: 174 33 | ARCH: vit 34 | LOSS_FUNC: cross_entropy 35 | DROPOUT_RATE: 0.5 36 | TEST: 37 | ENABLE: True 38 | DATASET: ssv2 39 | BATCH_SIZE: 8 40 | NUM_ENSEMBLE_VIEWS: 1 41 | NUM_SPATIAL_CROPS: 3 42 | DATA_LOADER: 43 | NUM_WORKERS: 4 44 | PIN_MEMORY: True 45 | NUM_GPUS: 8 46 | NUM_SHARDS: 1 47 | RNG_SEED: 0 48 | OUTPUT_DIR: . 49 | -------------------------------------------------------------------------------- /configs/SSv2/TimeSformer_divST_64_224.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: ssv2 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: " /path/to/ssv2/annotations/" 10 | PATH_PREFIX: "/path/to/ssv2/frames/" 11 | NUM_FRAMES: 64 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 224 15 | INPUT_CHANNEL_NUM: [3] 16 | INV_UNIFORM_SAMPLE: True 17 | RANDOM_FLIP: False 18 | REVERSE_INPUT_CHANNEL: True 19 | TIMESFORMER: 20 | ATTENTION_TYPE: 'divided_space_time' 21 | SOLVER: 22 | BASE_LR: 0.005 23 | LR_POLICY: steps_with_relative_lrs 24 | STEPS: [0, 11, 14] 25 | LRS: [1, 0.1, 0.01] 26 | MAX_EPOCH: 15 27 | MOMENTUM: 0.9 28 | WEIGHT_DECAY: 1e-4 29 | OPTIMIZING_METHOD: sgd 30 | MODEL: 31 | MODEL_NAME: vit_base_patch16_224 32 | NUM_CLASSES: 174 33 | ARCH: vit 34 | LOSS_FUNC: cross_entropy 35 | DROPOUT_RATE: 0.5 36 | TEST: 37 | ENABLE: True 38 | DATASET: ssv2 39 | BATCH_SIZE: 8 40 | NUM_ENSEMBLE_VIEWS: 1 41 | NUM_SPATIAL_CROPS: 3 42 | DATA_LOADER: 43 | NUM_WORKERS: 4 44 | PIN_MEMORY: True 45 | NUM_GPUS: 8 46 | NUM_SHARDS: 1 47 | RNG_SEED: 0 48 | OUTPUT_DIR: . 
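Unlike the Kinetics configs, the SSv2 configs read pre-extracted frames, so two paths must be set: `DATA.PATH_TO_DATA_DIR` (the folder holding the annotation JSONs and the downloaded frame lists) and `DATA.PATH_PREFIX` (the folder holding the frames extracted at 30 FPS), as described in `timesformer/datasets/DATASET.md` later in this listing. A launch sketch with placeholder paths:

```bash
# SSv2 launch sketch; both paths are placeholders.
# PATH_TO_DATA_DIR: annotation JSONs + frame lists, PATH_PREFIX: extracted frames.
python tools/run_net.py \
  --cfg configs/SSv2/TimeSformer_divST_64_224.yaml \
  DATA.PATH_TO_DATA_DIR /path/to/ssv2/annotations \
  DATA.PATH_PREFIX /path/to/ssv2/frames
```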
49 | -------------------------------------------------------------------------------- /configs/SSv2/TimeSformer_divST_8_224.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: ssv2 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: " /path/to/ssv2/annotations/" 10 | PATH_PREFIX: "/path/to/ssv2/frames/" 11 | NUM_FRAMES: 8 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 224 15 | INPUT_CHANNEL_NUM: [3] 16 | INV_UNIFORM_SAMPLE: True 17 | RANDOM_FLIP: False 18 | REVERSE_INPUT_CHANNEL: True 19 | TIMESFORMER: 20 | ATTENTION_TYPE: 'divided_space_time' 21 | SOLVER: 22 | BASE_LR: 0.005 23 | LR_POLICY: steps_with_relative_lrs 24 | STEPS: [0, 11, 14] 25 | LRS: [1, 0.1, 0.01] 26 | MAX_EPOCH: 15 27 | MOMENTUM: 0.9 28 | WEIGHT_DECAY: 1e-4 29 | OPTIMIZING_METHOD: sgd 30 | MODEL: 31 | MODEL_NAME: vit_base_patch16_224 32 | NUM_CLASSES: 174 33 | ARCH: vit 34 | LOSS_FUNC: cross_entropy 35 | DROPOUT_RATE: 0.5 36 | TEST: 37 | ENABLE: True 38 | DATASET: ssv2 39 | BATCH_SIZE: 8 40 | NUM_ENSEMBLE_VIEWS: 1 41 | NUM_SPATIAL_CROPS: 3 42 | DATA_LOADER: 43 | NUM_WORKERS: 4 44 | PIN_MEMORY: True 45 | NUM_GPUS: 8 46 | NUM_SHARDS: 1 47 | RNG_SEED: 0 48 | OUTPUT_DIR: . 49 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: timesformer 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - python>3.7 8 | - jupyterlab 9 | - pandas>=1.2 10 | - numpy>1.19 11 | - pytorch>=1.6 12 | - torchvision>=0.7 13 | - scikit-learn>=0.22 14 | - opencv>=4.2 15 | - pyyaml>=5.1 16 | - yacs>=0.1.6 17 | - einops>=0.3 18 | - tensorboard 19 | - psutil 20 | - tqdm 21 | - matplotlib 22 | - simplejson 23 | - pip 24 | - pip: 25 | - fvcore 26 | - av -------------------------------------------------------------------------------- /example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "08fe0c59", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pathlib import Path\n", 11 | "\n", 12 | "import torch\n", 13 | "from timesformer.models.vit import TimeSformer" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "10239d32", 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/plain": [ 25 | "True" 26 | ] 27 | }, 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "output_type": "execute_result" 31 | } 32 | ], 33 | "source": [ 34 | "model_file = Path.home()/'TimeSformer/models/TimeSformer_divST_8x32_224_K600.pyth'\n", 35 | "model_file.exists()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "id": "652fb03e", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "model = TimeSformer(img_size=224, num_classes=600, num_frames=8, attention_type='divided_space_time', pretrained_model=str(model_file))\n", 46 | "\n", 47 | "dummy_video = torch.randn(2, 3, 8, 224, 224) # (batch x channels x frames x height x width)\n", 48 | "\n", 49 | "pred = model(dummy_video,) # (2, 600)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 6, 55 | "id": "83de13c5-791c-4db7-aba4-6d29ce88584e", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "assert pred.shape == (2,600)" 60 
| ] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "Python 3", 66 | "language": "python", 67 | "name": "python3" 68 | }, 69 | "language_info": { 70 | "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 3 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython3", 79 | "version": "3.9.4" 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 5 84 | } 85 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=4 4 | known_standard_library=numpy,setuptools 5 | known_myself=timesformer 6 | known_third_party=fvcore,av,torch,pycocotools,yacs,termcolor,scipy,simplejson,matplotlib,torchvision,yaml,tqdm,psutil,opencv-python,pandas,tensorboard,moviepy,sklearn,cv2 7 | no_lines_before=STDLIB,THIRDPARTY 8 | sections=FUTURE,STDLIB,THIRDPARTY,myself,FIRSTPARTY,LOCALFOLDER 9 | default_section=FIRSTPARTY 10 | 11 | [mypy] 12 | python_version=3.6 13 | ignore_missing_imports = True 14 | warn_unused_configs = True 15 | disallow_untyped_defs = True 16 | check_untyped_defs = True 17 | warn_unused_ignores = True 18 | warn_redundant_casts = True 19 | show_column_numbers = True 20 | follow_imports = silent 21 | allow_redefinition = True 22 | ; Require all functions to be annotated 23 | disallow_incomplete_defs = True 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from setuptools import find_packages, setup 4 | 5 | setup( 6 | name="timesformer", 7 | version="1.0", 8 | author="FBAI", 9 | url="unknown", 10 | description="TimeSformer", 11 | keywords = [ 12 | 'artificial intelligence', 13 | 'attention mechanism', 14 | 'transformers', 15 | 'video classification', 16 | ], 17 | install_requires=[ 18 | 'einops>=0.3', 19 | 'torch>=1.6' 20 | ], 21 | extras_require={"tensorboard_video_visualization": ["moviepy"]}, 22 | packages=find_packages(exclude=("configs", "tests")), 23 | ) 24 | -------------------------------------------------------------------------------- /slurm_scripts/run_multi_node_job.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | # A script with a list of commands for submitting SLURM jobs 3 | 4 | #### Kinetics training 5 | JOB_NAME=TimeSformer_divST_8x32_224 6 | python tools/submit.py --cfg configs/Kinetics/TimeSformer_divST_8x32_224.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition dev --comment "" --name ${JOB_NAME} --use_volta32 7 | 8 | #JOB_NAME=TimeSformer_jointST_8x32_224 9 | #python tools/submit.py --cfg configs/Kinetics/TimeSformer_jointST_8x32_224.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition learnfair --comment "" --name ${JOB_NAME} --use_volta32 10 | 11 | #JOB_NAME=TimeSformer_spaceOnly_8x32_224 12 | #python tools/submit.py --cfg configs/Kinetics/TimeSformer_spaceOnly_8x32_224.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition learnfair --comment "" --name ${JOB_NAME} --use_volta32 13 | 14 | #### Kinetics inference 15 | #JOB_NAME=TimeSformer_divST_8x32_224_TEST_3clips 16 | #python tools/submit.py --cfg configs/Kinetics/TimeSformer_divST_8x32_224_TEST.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition dev --comment "" --name ${JOB_NAME} --use_volta32 17 | 18 | 19 | ##### SSv2 training 20 | #JOB_NAME=TimeSformer_divST_8_224 21 | #python tools/submit.py --cfg configs/SSv2/TimeSformer_divST_8_224.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition learnfair --comment "" --name ${JOB_NAME} --use_volta32 22 | 23 | ##### Sth-Sth_v2 inference 24 | #JOB_NAME=TimeSformer_divST_8_224_TEST_3clips 25 | #python tools/submit.py --cfg configs/SSv2/TimeSformer_divST_8_224_TEST.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition learnfair --comment "" --name ${JOB_NAME} --use_volta32 26 | -------------------------------------------------------------------------------- /slurm_scripts/run_single_node_job.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # A script with a list of commands for submitting SLURM jobs 3 | 4 | #SBATCH --job-name=timesformer 5 | #SBATCH --mail-type=END,FAIL,REQUEUE 6 | #SBATCH --mail-user=name@domain.com 7 | 8 | ## %j is the job id, %u is the user id 9 | #SBATCH --output=/path/to/output/logs/slog-%A-%a.out 10 | 11 | ## filename for job standard error output (stderr) 12 | #SBATCH --error=/path/to/error/logs/slog-%A-%a.err 13 | 14 | #SBATCH --array=1 15 | #SBATCH --partition=partition_of_your_choice 16 | #SBATCH --nodes=1 -C volta32gb 17 | #SBATCH --ntasks-per-node=1 18 | #SBATCH --gpus-per-node=8 19 | #SBATCH --cpus-per-task=80 20 | #SBATCH --mem=480GB 21 | #SBATCH --signal=USR1@600 22 | #SBATCH --time=72:00:00 23 | #SBATCH --open-mode=append 24 | 25 | module purge 26 | module load cuda/10.0 27 | module load NCCL/2.4.7-1-cuda.10.0 28 | module load cudnn/v7.4-cuda.10.0 29 | source activate timesformer 30 | 31 | WORKINGDIR=/path/to/TimeSformer 32 | CURPYTHON=/path/to/python 33 | 34 | srun --label ${CURPYTHON} ${WORKINGDIR}/tools/run_net.py --cfg ${WORKINGDIR}/configs/Kinetics/TimeSformer_divST_8x32_224.yaml NUM_GPUS 8 TRAIN.BATCH_SIZE 8 35 | 36 | -------------------------------------------------------------------------------- /timesformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | 3 | from timesformer.utils.env import setup_environment 4 | 5 | setup_environment() 6 | -------------------------------------------------------------------------------- /timesformer/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /timesformer/datasets/DATASET.md: -------------------------------------------------------------------------------- 1 | # Dataset Preparation 2 | 3 | ## Kinetics 4 | 5 | The Kinetics Dataset could be downloaded from the following [link](https://github.com/cvdfoundation/kinetics-dataset): 6 | 7 | After all the videos were downloaded, resize the video to the short edge size of 256, then prepare the csv files for training, validation, and testing set as `train.csv`, `val.csv`, `test.csv`. The format of the csv file is: 8 | 9 | ``` 10 | path_to_video_1 label_1 11 | path_to_video_2 label_2 12 | path_to_video_3 label_3 13 | ... 14 | path_to_video_N label_N 15 | ``` 16 | 17 | ## Something-Something V2 18 | 1. Please download the dataset and annotations from [dataset provider](https://20bn.com/datasets/something-something). 19 | 20 | 2. Download the *frame list* from the following links: ([train](https://dl.fbaipublicfiles.com/pyslowfast/dataset/ssv2/frame_lists/train.csv), [val](https://dl.fbaipublicfiles.com/pyslowfast/dataset/ssv2/frame_lists/val.csv)). 21 | 22 | 3. Extract the frames at 30 FPS. (We used ffmpeg-4.1.3 with command 23 | `ffmpeg -i "${video}" -r 30 -q:v 1 "${out_name}"` 24 | in experiments.) Please put the frames in a structure consistent with the frame lists. 25 | 26 | Please put all annotation json files and the frame lists in the same folder, and set `DATA.PATH_TO_DATA_DIR` to the path. Set `DATA.PATH_PREFIX` to be the path to the folder containing extracted frames. 27 | -------------------------------------------------------------------------------- /timesformer/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from .build import DATASET_REGISTRY, build_dataset # noqa 4 | from .kinetics import Kinetics # noqa 5 | from .ssv2 import Ssv2 # noqa 6 | -------------------------------------------------------------------------------- /timesformer/datasets/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from fvcore.common.registry import Registry 4 | 5 | DATASET_REGISTRY = Registry("DATASET") 6 | DATASET_REGISTRY.__doc__ = """ 7 | Registry for dataset. 8 | 9 | The registered object will be called with `obj(cfg, split)`. 10 | The call should return a `torch.utils.data.Dataset` object. 11 | """ 12 | 13 | 14 | def build_dataset(dataset_name, cfg, split): 15 | """ 16 | Build a dataset, defined by `dataset_name`. 17 | Args: 18 | dataset_name (str): the name of the dataset to be constructed. 19 | cfg (CfgNode): configs. Details can be found in 20 | slowfast/config/defaults.py 21 | split (str): the split of the data loader. Options include `train`, 22 | `val`, and `test`. 23 | Returns: 24 | Dataset: a constructed dataset specified by dataset_name. 
25 | """ 26 | # Capitalize the the first letter of the dataset_name since the dataset_name 27 | # in configs may be in lowercase but the name of dataset class should always 28 | # start with an uppercase letter. 29 | name = dataset_name.capitalize() 30 | return DATASET_REGISTRY.get(name)(cfg, split) 31 | -------------------------------------------------------------------------------- /timesformer/datasets/loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Data loader.""" 4 | 5 | import itertools 6 | import numpy as np 7 | import torch 8 | from torch.utils.data._utils.collate import default_collate 9 | from torch.utils.data.distributed import DistributedSampler 10 | from torch.utils.data.sampler import RandomSampler 11 | 12 | from timesformer.datasets.multigrid_helper import ShortCycleBatchSampler 13 | 14 | from . import utils as utils 15 | from .build import build_dataset 16 | 17 | 18 | def detection_collate(batch): 19 | """ 20 | Collate function for detection task. Concatanate bboxes, labels and 21 | metadata from different samples in the first dimension instead of 22 | stacking them to have a batch-size dimension. 23 | Args: 24 | batch (tuple or list): data batch to collate. 25 | Returns: 26 | (tuple): collated detection data batch. 27 | """ 28 | inputs, labels, video_idx, extra_data = zip(*batch) 29 | inputs, video_idx = default_collate(inputs), default_collate(video_idx) 30 | labels = torch.tensor(np.concatenate(labels, axis=0)).float() 31 | 32 | collated_extra_data = {} 33 | for key in extra_data[0].keys(): 34 | data = [d[key] for d in extra_data] 35 | if key == "boxes" or key == "ori_boxes": 36 | # Append idx info to the bboxes before concatenating them. 37 | bboxes = [ 38 | np.concatenate( 39 | [np.full((data[i].shape[0], 1), float(i)), data[i]], axis=1 40 | ) 41 | for i in range(len(data)) 42 | ] 43 | bboxes = np.concatenate(bboxes, axis=0) 44 | collated_extra_data[key] = torch.tensor(bboxes).float() 45 | elif key == "metadata": 46 | collated_extra_data[key] = torch.tensor( 47 | list(itertools.chain(*data)) 48 | ).view(-1, 2) 49 | else: 50 | collated_extra_data[key] = default_collate(data) 51 | 52 | return inputs, labels, video_idx, collated_extra_data 53 | 54 | 55 | def construct_loader(cfg, split, is_precise_bn=False): 56 | """ 57 | Constructs the data loader for the given dataset. 58 | Args: 59 | cfg (CfgNode): configs. Details can be found in 60 | slowfast/config/defaults.py 61 | split (str): the split of the data loader. Options include `train`, 62 | `val`, and `test`. 
63 | """ 64 | assert split in ["train", "val", "test"] 65 | if split in ["train"]: 66 | dataset_name = cfg.TRAIN.DATASET 67 | batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 68 | shuffle = True 69 | drop_last = True 70 | elif split in ["val"]: 71 | dataset_name = cfg.TRAIN.DATASET 72 | batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 73 | shuffle = False 74 | drop_last = False 75 | elif split in ["test"]: 76 | dataset_name = cfg.TEST.DATASET 77 | batch_size = int(cfg.TEST.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 78 | shuffle = False 79 | drop_last = False 80 | 81 | # Construct the dataset 82 | dataset = build_dataset(dataset_name, cfg, split) 83 | 84 | if cfg.MULTIGRID.SHORT_CYCLE and split in ["train"] and not is_precise_bn: 85 | # Create a sampler for multi-process training 86 | sampler = utils.create_sampler(dataset, shuffle, cfg) 87 | batch_sampler = ShortCycleBatchSampler( 88 | sampler, batch_size=batch_size, drop_last=drop_last, cfg=cfg 89 | ) 90 | # Create a loader 91 | loader = torch.utils.data.DataLoader( 92 | dataset, 93 | batch_sampler=batch_sampler, 94 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 95 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 96 | worker_init_fn=utils.loader_worker_init_fn(dataset), 97 | ) 98 | else: 99 | # Create a sampler for multi-process training 100 | sampler = utils.create_sampler(dataset, shuffle, cfg) 101 | # Create a loader 102 | loader = torch.utils.data.DataLoader( 103 | dataset, 104 | batch_size=batch_size, 105 | shuffle=(False if sampler else shuffle), 106 | sampler=sampler, 107 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 108 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 109 | drop_last=drop_last, 110 | collate_fn=detection_collate if cfg.DETECTION.ENABLE else None, 111 | worker_init_fn=utils.loader_worker_init_fn(dataset), 112 | ) 113 | return loader 114 | 115 | 116 | def shuffle_dataset(loader, cur_epoch): 117 | """ " 118 | Shuffles the data. 119 | Args: 120 | loader (loader): data loader to perform shuffle. 121 | cur_epoch (int): number of the current epoch. 122 | """ 123 | sampler = ( 124 | loader.batch_sampler.sampler 125 | if isinstance(loader.batch_sampler, ShortCycleBatchSampler) 126 | else loader.sampler 127 | ) 128 | assert isinstance( 129 | sampler, (RandomSampler, DistributedSampler) 130 | ), "Sampler type '{}' not supported".format(type(sampler)) 131 | # RandomSampler handles shuffling automatically 132 | if isinstance(sampler, DistributedSampler): 133 | # DistributedSampler shuffles data based on epoch 134 | sampler.set_epoch(cur_epoch) 135 | -------------------------------------------------------------------------------- /timesformer/datasets/multigrid_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Helper functions for multigrid training.""" 4 | 5 | import numpy as np 6 | from torch._six import int_classes as _int_classes 7 | from torch.utils.data.sampler import Sampler 8 | 9 | 10 | class ShortCycleBatchSampler(Sampler): 11 | """ 12 | Extend Sampler to support "short cycle" sampling. 13 | See paper "A Multigrid Method for Efficiently Training Video Models", 14 | Wu et al., 2019 (https://arxiv.org/abs/1912.00998) for details. 
15 | """ 16 | 17 | def __init__(self, sampler, batch_size, drop_last, cfg): 18 | if not isinstance(sampler, Sampler): 19 | raise ValueError( 20 | "sampler should be an instance of " 21 | "torch.utils.data.Sampler, but got sampler={}".format(sampler) 22 | ) 23 | if ( 24 | not isinstance(batch_size, _int_classes) 25 | or isinstance(batch_size, bool) 26 | or batch_size <= 0 27 | ): 28 | raise ValueError( 29 | "batch_size should be a positive integer value, " 30 | "but got batch_size={}".format(batch_size) 31 | ) 32 | if not isinstance(drop_last, bool): 33 | raise ValueError( 34 | "drop_last should be a boolean value, but got " 35 | "drop_last={}".format(drop_last) 36 | ) 37 | self.sampler = sampler 38 | self.drop_last = drop_last 39 | 40 | bs_factor = [ 41 | int( 42 | round( 43 | ( 44 | float(cfg.DATA.TRAIN_CROP_SIZE) 45 | / (s * cfg.MULTIGRID.DEFAULT_S) 46 | ) 47 | ** 2 48 | ) 49 | ) 50 | for s in cfg.MULTIGRID.SHORT_CYCLE_FACTORS 51 | ] 52 | 53 | self.batch_sizes = [ 54 | batch_size * bs_factor[0], 55 | batch_size * bs_factor[1], 56 | batch_size, 57 | ] 58 | 59 | def __iter__(self): 60 | counter = 0 61 | batch_size = self.batch_sizes[0] 62 | batch = [] 63 | for idx in self.sampler: 64 | batch.append((idx, counter % 3)) 65 | if len(batch) == batch_size: 66 | yield batch 67 | counter += 1 68 | batch_size = self.batch_sizes[counter % 3] 69 | batch = [] 70 | if len(batch) > 0 and not self.drop_last: 71 | yield batch 72 | 73 | def __len__(self): 74 | avg_batch_size = sum(self.batch_sizes) / 3.0 75 | if self.drop_last: 76 | return int(np.floor(len(self.sampler) / avg_batch_size)) 77 | else: 78 | return int(np.ceil(len(self.sampler) / avg_batch_size)) 79 | -------------------------------------------------------------------------------- /timesformer/datasets/video_container.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import av 4 | 5 | 6 | def get_video_container(path_to_vid, multi_thread_decode=False, backend="pyav"): 7 | """ 8 | Given the path to the video, return the pyav video container. 9 | Args: 10 | path_to_vid (str): path to the video. 11 | multi_thread_decode (bool): if True, perform multi-thread decoding. 12 | backend (str): decoder backend, options include `pyav` and 13 | `torchvision`, default is `pyav`. 14 | Returns: 15 | container (container): video container. 16 | """ 17 | if backend == "torchvision": 18 | with open(path_to_vid, "rb") as fp: 19 | container = fp.read() 20 | return container 21 | elif backend == "pyav": 22 | #try: 23 | container = av.open(path_to_vid) 24 | if multi_thread_decode: 25 | # Enable multiple threads for decoding. 26 | container.streams.video[0].thread_type = "AUTO" 27 | #except: 28 | # container = None 29 | return container 30 | else: 31 | raise NotImplementedError("Unknown backend {}".format(backend)) 32 | -------------------------------------------------------------------------------- /timesformer/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | 3 | from .build import MODEL_REGISTRY, build_model # noqa 4 | from .custom_video_model_builder import * # noqa 5 | from .video_model_builder import ResNet, SlowFast # noqa 6 | -------------------------------------------------------------------------------- /timesformer/models/batchnorm_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """BatchNorm (BN) utility functions and custom batch-size BN implementations""" 4 | 5 | from functools import partial 6 | import torch 7 | import torch.distributed as dist 8 | import torch.nn as nn 9 | from torch.autograd.function import Function 10 | 11 | import timesformer.utils.distributed as du 12 | 13 | 14 | def get_norm(cfg): 15 | """ 16 | Args: 17 | cfg (CfgNode): model building configs, details are in the comments of 18 | the config file. 19 | Returns: 20 | nn.Module: the normalization layer. 21 | """ 22 | if cfg.BN.NORM_TYPE == "batchnorm": 23 | return nn.BatchNorm3d 24 | elif cfg.BN.NORM_TYPE == "sub_batchnorm": 25 | return partial(SubBatchNorm3d, num_splits=cfg.BN.NUM_SPLITS) 26 | elif cfg.BN.NORM_TYPE == "sync_batchnorm": 27 | return partial( 28 | NaiveSyncBatchNorm3d, num_sync_devices=cfg.BN.NUM_SYNC_DEVICES 29 | ) 30 | else: 31 | raise NotImplementedError( 32 | "Norm type {} is not supported".format(cfg.BN.NORM_TYPE) 33 | ) 34 | 35 | 36 | class SubBatchNorm3d(nn.Module): 37 | """ 38 | The standard BN layer computes stats across all examples in a GPU. In some 39 | cases it is desirable to compute stats across only a subset of examples 40 | (e.g., in multigrid training https://arxiv.org/abs/1912.00998). 41 | SubBatchNorm3d splits the batch dimension into N splits, and run BN on 42 | each of them separately (so that the stats are computed on each subset of 43 | examples (1/N of batch) independently. During evaluation, it aggregates 44 | the stats from all splits into one BN. 45 | """ 46 | 47 | def __init__(self, num_splits, **args): 48 | """ 49 | Args: 50 | num_splits (int): number of splits. 51 | args (list): other arguments. 52 | """ 53 | super(SubBatchNorm3d, self).__init__() 54 | self.num_splits = num_splits 55 | num_features = args["num_features"] 56 | # Keep only one set of weight and bias. 57 | if args.get("affine", True): 58 | self.affine = True 59 | args["affine"] = False 60 | self.weight = torch.nn.Parameter(torch.ones(num_features)) 61 | self.bias = torch.nn.Parameter(torch.zeros(num_features)) 62 | else: 63 | self.affine = False 64 | self.bn = nn.BatchNorm3d(**args) 65 | args["num_features"] = num_features * num_splits 66 | self.split_bn = nn.BatchNorm3d(**args) 67 | 68 | def _get_aggregated_mean_std(self, means, stds, n): 69 | """ 70 | Calculate the aggregated mean and stds. 71 | Args: 72 | means (tensor): mean values. 73 | stds (tensor): standard deviations. 74 | n (int): number of sets of means and stds. 75 | """ 76 | mean = means.view(n, -1).sum(0) / n 77 | std = ( 78 | stds.view(n, -1).sum(0) / n 79 | + ((means.view(n, -1) - mean) ** 2).view(n, -1).sum(0) / n 80 | ) 81 | return mean.detach(), std.detach() 82 | 83 | def aggregate_stats(self): 84 | """ 85 | Synchronize running_mean, and running_var. Call this before eval. 
86 | """ 87 | if self.split_bn.track_running_stats: 88 | ( 89 | self.bn.running_mean.data, 90 | self.bn.running_var.data, 91 | ) = self._get_aggregated_mean_std( 92 | self.split_bn.running_mean, 93 | self.split_bn.running_var, 94 | self.num_splits, 95 | ) 96 | 97 | def forward(self, x): 98 | if self.training: 99 | n, c, t, h, w = x.shape 100 | x = x.view(n // self.num_splits, c * self.num_splits, t, h, w) 101 | x = self.split_bn(x) 102 | x = x.view(n, c, t, h, w) 103 | else: 104 | x = self.bn(x) 105 | if self.affine: 106 | x = x * self.weight.view((-1, 1, 1, 1)) 107 | x = x + self.bias.view((-1, 1, 1, 1)) 108 | return x 109 | 110 | 111 | class GroupGather(Function): 112 | """ 113 | GroupGather performs all gather on each of the local process/ GPU groups. 114 | """ 115 | 116 | @staticmethod 117 | def forward(ctx, input, num_sync_devices, num_groups): 118 | """ 119 | Perform forwarding, gathering the stats across different process/ GPU 120 | group. 121 | """ 122 | ctx.num_sync_devices = num_sync_devices 123 | ctx.num_groups = num_groups 124 | 125 | input_list = [ 126 | torch.zeros_like(input) for k in range(du.get_local_size()) 127 | ] 128 | dist.all_gather( 129 | input_list, input, async_op=False, group=du._LOCAL_PROCESS_GROUP 130 | ) 131 | 132 | inputs = torch.stack(input_list, dim=0) 133 | if num_groups > 1: 134 | rank = du.get_local_rank() 135 | group_idx = rank // num_sync_devices 136 | inputs = inputs[ 137 | group_idx 138 | * num_sync_devices : (group_idx + 1) 139 | * num_sync_devices 140 | ] 141 | inputs = torch.sum(inputs, dim=0) 142 | return inputs 143 | 144 | @staticmethod 145 | def backward(ctx, grad_output): 146 | """ 147 | Perform backwarding, gathering the gradients across different process/ GPU 148 | group. 149 | """ 150 | grad_output_list = [ 151 | torch.zeros_like(grad_output) for k in range(du.get_local_size()) 152 | ] 153 | dist.all_gather( 154 | grad_output_list, 155 | grad_output, 156 | async_op=False, 157 | group=du._LOCAL_PROCESS_GROUP, 158 | ) 159 | 160 | grads = torch.stack(grad_output_list, dim=0) 161 | if ctx.num_groups > 1: 162 | rank = du.get_local_rank() 163 | group_idx = rank // ctx.num_sync_devices 164 | grads = grads[ 165 | group_idx 166 | * ctx.num_sync_devices : (group_idx + 1) 167 | * ctx.num_sync_devices 168 | ] 169 | grads = torch.sum(grads, dim=0) 170 | return grads, None, None 171 | 172 | 173 | class NaiveSyncBatchNorm3d(nn.BatchNorm3d): 174 | def __init__(self, num_sync_devices, **args): 175 | """ 176 | Naive version of Synchronized 3D BatchNorm. 177 | Args: 178 | num_sync_devices (int): number of device to sync. 179 | args (list): other arguments. 
180 | """ 181 | self.num_sync_devices = num_sync_devices 182 | if self.num_sync_devices > 0: 183 | assert du.get_local_size() % self.num_sync_devices == 0, ( 184 | du.get_local_size(), 185 | self.num_sync_devices, 186 | ) 187 | self.num_groups = du.get_local_size() // self.num_sync_devices 188 | else: 189 | self.num_sync_devices = du.get_local_size() 190 | self.num_groups = 1 191 | super(NaiveSyncBatchNorm3d, self).__init__(**args) 192 | 193 | def forward(self, input): 194 | if du.get_local_size() == 1 or not self.training: 195 | return super().forward(input) 196 | 197 | assert input.shape[0] > 0, "SyncBatchNorm does not support empty inputs" 198 | C = input.shape[1] 199 | mean = torch.mean(input, dim=[0, 2, 3, 4]) 200 | meansqr = torch.mean(input * input, dim=[0, 2, 3, 4]) 201 | 202 | vec = torch.cat([mean, meansqr], dim=0) 203 | vec = GroupGather.apply(vec, self.num_sync_devices, self.num_groups) * ( 204 | 1.0 / self.num_sync_devices 205 | ) 206 | 207 | mean, meansqr = torch.split(vec, C) 208 | var = meansqr - mean * mean 209 | self.running_mean += self.momentum * (mean.detach() - self.running_mean) 210 | self.running_var += self.momentum * (var.detach() - self.running_var) 211 | 212 | invstd = torch.rsqrt(var + self.eps) 213 | scale = self.weight * invstd 214 | bias = self.bias - mean * scale 215 | scale = scale.reshape(1, -1, 1, 1, 1) 216 | bias = bias.reshape(1, -1, 1, 1, 1) 217 | return input * scale + bias 218 | -------------------------------------------------------------------------------- /timesformer/models/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Model construction functions.""" 4 | 5 | import torch 6 | from fvcore.common.registry import Registry 7 | 8 | MODEL_REGISTRY = Registry("MODEL") 9 | MODEL_REGISTRY.__doc__ = """ 10 | Registry for video model. 11 | 12 | The registered object will be called with `obj(cfg)`. 13 | The call should return a `torch.nn.Module` object. 14 | """ 15 | 16 | 17 | def build_model(cfg, gpu_id=None): 18 | """ 19 | Builds the video model. 20 | Args: 21 | cfg (configs): configs that contains the hyper-parameters to build the 22 | backbone. Details can be seen in slowfast/config/defaults.py. 23 | gpu_id (Optional[int]): specify the gpu index to build model. 24 | """ 25 | if torch.cuda.is_available(): 26 | assert ( 27 | cfg.NUM_GPUS <= torch.cuda.device_count() 28 | ), "Cannot use more GPU devices than available" 29 | else: 30 | assert ( 31 | cfg.NUM_GPUS == 0 32 | ), "Cuda is not available. Please set `NUM_GPUS: 0 for running on CPUs." 
33 | 34 | # Construct the model 35 | name = cfg.MODEL.MODEL_NAME 36 | model = MODEL_REGISTRY.get(name)(cfg) 37 | 38 | if cfg.NUM_GPUS: 39 | if gpu_id is None: 40 | # Determine the GPU used by the current process 41 | cur_device = torch.cuda.current_device() 42 | else: 43 | cur_device = gpu_id 44 | # Transfer the model to the current GPU device 45 | model = model.cuda(device=cur_device) 46 | 47 | 48 | # Use multi-process data parallel model in the multi-gpu setting 49 | if cfg.NUM_GPUS > 1: 50 | # Make model replica operate on the current device 51 | model = torch.nn.parallel.DistributedDataParallel( 52 | module=model, device_ids=[cur_device], output_device=cur_device 53 | ) 54 | return model 55 | -------------------------------------------------------------------------------- /timesformer/models/conv2d_same.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Ross Wightman 2 | # Conv2d w/ Same Padding 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from typing import Tuple, Optional 8 | 9 | import math 10 | from typing import List, Tuple 11 | #from .padding import pad_same, get_padding_value 12 | 13 | # Dynamically pad input x with 'SAME' padding for conv with specified args 14 | def pad_same(x, k: List[int], s: List[int], d: List[int] = (1, 1), value: float = 0): 15 | ih, iw = x.size()[-2:] 16 | pad_h, pad_w = get_same_padding(ih, k[0], s[0], d[0]), get_same_padding(iw, k[1], s[1], d[1]) 17 | if pad_h > 0 or pad_w > 0: 18 | x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2], value=value) 19 | return x 20 | 21 | # Calculate asymmetric TensorFlow-like 'SAME' padding for a convolution 22 | def get_same_padding(x: int, k: int, s: int, d: int): 23 | return max((math.ceil(x / s) - 1) * s + (k - 1) * d + 1 - x, 0) 24 | 25 | def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]: 26 | dynamic = False 27 | if isinstance(padding, str): 28 | # for any string padding, the padding will be calculated for you, one of three ways 29 | padding = padding.lower() 30 | if padding == 'same': 31 | # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact 32 | if is_static_pad(kernel_size, **kwargs): 33 | # static case, no extra overhead 34 | padding = get_padding(kernel_size, **kwargs) 35 | else: 36 | # dynamic 'SAME' padding, has runtime/GPU memory overhead 37 | padding = 0 38 | dynamic = True 39 | elif padding == 'valid': 40 | # 'VALID' padding, same as padding=0 41 | padding = 0 42 | else: 43 | # Default to PyTorch style 'same'-ish symmetric padding 44 | padding = get_padding(kernel_size, **kwargs) 45 | return padding, dynamic 46 | 47 | def conv2d_same( 48 | x, weight: torch.Tensor, bias: Optional[torch.Tensor] = None, stride: Tuple[int, int] = (1, 1), 49 | padding: Tuple[int, int] = (0, 0), dilation: Tuple[int, int] = (1, 1), groups: int = 1): 50 | x = pad_same(x, weight.shape[-2:], stride, dilation) 51 | return F.conv2d(x, weight, bias, stride, (0, 0), dilation, groups) 52 | 53 | 54 | class Conv2dSame(nn.Conv2d): 55 | """ Tensorflow like 'SAME' convolution wrapper for 2D convolutions 56 | """ 57 | 58 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, 59 | padding=0, dilation=1, groups=1, bias=True): 60 | super(Conv2dSame, self).__init__( 61 | in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) 62 | 63 | def forward(self, x): 64 | return conv2d_same(x, self.weight, self.bias, self.stride, self.padding, 
self.dilation, self.groups) 65 | 66 | 67 | def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs): 68 | padding = kwargs.pop('padding', '') 69 | kwargs.setdefault('bias', False) 70 | padding, is_dynamic = get_padding_value(padding, kernel_size, **kwargs) 71 | if is_dynamic: 72 | return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs) 73 | else: 74 | return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs) 75 | -------------------------------------------------------------------------------- /timesformer/models/custom_video_model_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | 4 | """A More Flexible Video models.""" 5 | -------------------------------------------------------------------------------- /timesformer/models/head_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """ResNe(X)t Head helper.""" 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | class ResNetBasicHead(nn.Module): 9 | """ 10 | ResNe(X)t 3D head. 11 | This layer performs a fully-connected projection during training, when the 12 | input size is 1x1x1. It performs a convolutional projection during testing 13 | when the input size is larger than 1x1x1. If the inputs are from multiple 14 | different pathways, the inputs will be concatenated after pooling. 15 | """ 16 | 17 | def __init__( 18 | self, 19 | dim_in, 20 | num_classes, 21 | pool_size, 22 | dropout_rate=0.0, 23 | act_func="softmax", 24 | ): 25 | """ 26 | The `__init__` method of any subclass should also contain these 27 | arguments. 28 | ResNetBasicHead takes p pathways as input where p in [1, infty]. 29 | 30 | Args: 31 | dim_in (list): the list of channel dimensions of the p inputs to the 32 | ResNetHead. 33 | num_classes (int): the channel dimensions of the p outputs to the 34 | ResNetHead. 35 | pool_size (list): the list of kernel sizes of p spatial temporal 36 | poolings, temporal pool kernel size, spatial pool kernel size, 37 | spatial pool kernel size in order. 38 | dropout_rate (float): dropout rate. If equal to 0.0, perform no 39 | dropout. 40 | act_func (string): activation function to use. 'softmax': applies 41 | softmax on the output. 'sigmoid': applies sigmoid on the output. 42 | """ 43 | super(ResNetBasicHead, self).__init__() 44 | assert ( 45 | len({len(pool_size), len(dim_in)}) == 1 46 | ), "pathway dimensions are not consistent." 47 | self.num_pathways = len(pool_size) 48 | 49 | for pathway in range(self.num_pathways): 50 | if pool_size[pathway] is None: 51 | avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) 52 | else: 53 | avg_pool = nn.AvgPool3d(pool_size[pathway], stride=1) 54 | self.add_module("pathway{}_avgpool".format(pathway), avg_pool) 55 | 56 | if dropout_rate > 0.0: 57 | self.dropout = nn.Dropout(dropout_rate) 58 | # Perform FC in a fully convolutional manner. The FC layer will be 59 | # initialized with a different std comparing to convolutional layers. 60 | self.projection = nn.Linear(sum(dim_in), num_classes, bias=True) 61 | 62 | # Softmax for evaluation and testing. 
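        # Note: self.act is only applied at inference time (see forward(), which
        # calls it under `if not self.training`). Softmax over dim=4 targets the
        # class dimension because forward() permutes the features from
        # (N, C, T, H, W) to (N, T, H, W, C) before the projection.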
63 | if act_func == "softmax": 64 | self.act = nn.Softmax(dim=4) 65 | elif act_func == "sigmoid": 66 | self.act = nn.Sigmoid() 67 | else: 68 | raise NotImplementedError( 69 | "{} is not supported as an activation" 70 | "function.".format(act_func) 71 | ) 72 | 73 | def forward(self, inputs): 74 | assert ( 75 | len(inputs) == self.num_pathways 76 | ), "Input tensor does not contain {} pathway".format(self.num_pathways) 77 | pool_out = [] 78 | for pathway in range(self.num_pathways): 79 | m = getattr(self, "pathway{}_avgpool".format(pathway)) 80 | pool_out.append(m(inputs[pathway])) 81 | x = torch.cat(pool_out, 1) 82 | # (N, C, T, H, W) -> (N, T, H, W, C). 83 | x = x.permute((0, 2, 3, 4, 1)) 84 | # Perform dropout. 85 | if hasattr(self, "dropout"): 86 | x = self.dropout(x) 87 | x = self.projection(x) 88 | 89 | # Performs fully convlutional inference. 90 | if not self.training: 91 | x = self.act(x) 92 | x = x.mean([1, 2, 3]) 93 | 94 | x = x.view(x.shape[0], -1) 95 | return x 96 | 97 | 98 | class X3DHead(nn.Module): 99 | """ 100 | X3D head. 101 | This layer performs a fully-connected projection during training, when the 102 | input size is 1x1x1. It performs a convolutional projection during testing 103 | when the input size is larger than 1x1x1. If the inputs are from multiple 104 | different pathways, the inputs will be concatenated after pooling. 105 | """ 106 | 107 | def __init__( 108 | self, 109 | dim_in, 110 | dim_inner, 111 | dim_out, 112 | num_classes, 113 | pool_size, 114 | dropout_rate=0.0, 115 | act_func="softmax", 116 | inplace_relu=True, 117 | eps=1e-5, 118 | bn_mmt=0.1, 119 | norm_module=nn.BatchNorm3d, 120 | bn_lin5_on=False, 121 | ): 122 | """ 123 | The `__init__` method of any subclass should also contain these 124 | arguments. 125 | X3DHead takes a 5-dim feature tensor (BxCxTxHxW) as input. 126 | 127 | Args: 128 | dim_in (float): the channel dimension C of the input. 129 | num_classes (int): the channel dimensions of the output. 130 | pool_size (float): a single entry list of kernel size for 131 | spatiotemporal pooling for the TxHxW dimensions. 132 | dropout_rate (float): dropout rate. If equal to 0.0, perform no 133 | dropout. 134 | act_func (string): activation function to use. 'softmax': applies 135 | softmax on the output. 'sigmoid': applies sigmoid on the output. 136 | inplace_relu (bool): if True, calculate the relu on the original 137 | input without allocating new memory. 138 | eps (float): epsilon for batch norm. 139 | bn_mmt (float): momentum for batch norm. Noted that BN momentum in 140 | PyTorch = 1 - BN momentum in Caffe2. 141 | norm_module (nn.Module): nn.Module for the normalization layer. The 142 | default is nn.BatchNorm3d. 143 | bn_lin5_on (bool): if True, perform normalization on the features 144 | before the classifier. 
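            dim_inner (int): the channel dimension of the intermediate 1x1x1
                convolution (conv_5) in the head.
            dim_out (int): the channel dimension produced by lin_5 and fed to
                the final linear projection.

        Example (illustrative channel sizes, not taken from a provided config):
            >>> import torch
            >>> head = X3DHead(dim_in=192, dim_inner=432, dim_out=2048,
            ...                num_classes=400, pool_size=None)
            >>> head([torch.rand(2, 192, 4, 7, 7)]).shape
            torch.Size([2, 400])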
145 | """ 146 | super(X3DHead, self).__init__() 147 | self.pool_size = pool_size 148 | self.dropout_rate = dropout_rate 149 | self.num_classes = num_classes 150 | self.act_func = act_func 151 | self.eps = eps 152 | self.bn_mmt = bn_mmt 153 | self.inplace_relu = inplace_relu 154 | self.bn_lin5_on = bn_lin5_on 155 | self._construct_head(dim_in, dim_inner, dim_out, norm_module) 156 | 157 | def _construct_head(self, dim_in, dim_inner, dim_out, norm_module): 158 | 159 | self.conv_5 = nn.Conv3d( 160 | dim_in, 161 | dim_inner, 162 | kernel_size=(1, 1, 1), 163 | stride=(1, 1, 1), 164 | padding=(0, 0, 0), 165 | bias=False, 166 | ) 167 | self.conv_5_bn = norm_module( 168 | num_features=dim_inner, eps=self.eps, momentum=self.bn_mmt 169 | ) 170 | self.conv_5_relu = nn.ReLU(self.inplace_relu) 171 | 172 | if self.pool_size is None: 173 | self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) 174 | else: 175 | self.avg_pool = nn.AvgPool3d(self.pool_size, stride=1) 176 | 177 | self.lin_5 = nn.Conv3d( 178 | dim_inner, 179 | dim_out, 180 | kernel_size=(1, 1, 1), 181 | stride=(1, 1, 1), 182 | padding=(0, 0, 0), 183 | bias=False, 184 | ) 185 | if self.bn_lin5_on: 186 | self.lin_5_bn = norm_module( 187 | num_features=dim_out, eps=self.eps, momentum=self.bn_mmt 188 | ) 189 | self.lin_5_relu = nn.ReLU(self.inplace_relu) 190 | 191 | if self.dropout_rate > 0.0: 192 | self.dropout = nn.Dropout(self.dropout_rate) 193 | # Perform FC in a fully convolutional manner. The FC layer will be 194 | # initialized with a different std comparing to convolutional layers. 195 | self.projection = nn.Linear(dim_out, self.num_classes, bias=True) 196 | 197 | # Softmax for evaluation and testing. 198 | if self.act_func == "softmax": 199 | self.act = nn.Softmax(dim=4) 200 | elif self.act_func == "sigmoid": 201 | self.act = nn.Sigmoid() 202 | else: 203 | raise NotImplementedError( 204 | "{} is not supported as an activation" 205 | "function.".format(self.act_func) 206 | ) 207 | 208 | def forward(self, inputs): 209 | # In its current design the X3D head is only useable for a single 210 | # pathway input. 211 | assert len(inputs) == 1, "Input tensor does not contain 1 pathway" 212 | x = self.conv_5(inputs[0]) 213 | x = self.conv_5_bn(x) 214 | x = self.conv_5_relu(x) 215 | x = self.avg_pool(x) 216 | 217 | x = self.lin_5(x) 218 | if self.bn_lin5_on: 219 | x = self.lin_5_bn(x) 220 | x = self.lin_5_relu(x) 221 | 222 | # (N, C, T, H, W) -> (N, T, H, W, C). 223 | x = x.permute((0, 2, 3, 4, 1)) 224 | # Perform dropout. 225 | if hasattr(self, "dropout"): 226 | x = self.dropout(x) 227 | x = self.projection(x) 228 | 229 | # Performs fully convlutional inference. 
230 | if not self.training: 231 | x = self.act(x) 232 | x = x.mean([1, 2, 3]) 233 | 234 | x = x.view(x.shape[0], -1) 235 | return x 236 | -------------------------------------------------------------------------------- /timesformer/models/linear.py: -------------------------------------------------------------------------------- 1 | """ Linear layer (alternate definition) 2 | """ 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import nn as nn 6 | 7 | class Linear(nn.Linear): 8 | def forward(self, input: torch.Tensor) -> torch.Tensor: 9 | if torch.jit.is_scripting(): 10 | bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None 11 | return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias) 12 | else: 13 | return F.linear(input, self.weight, self.bias) 14 | -------------------------------------------------------------------------------- /timesformer/models/losses.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Loss functions.""" 4 | 5 | import torch.nn as nn 6 | 7 | _LOSSES = { 8 | "cross_entropy": nn.CrossEntropyLoss, 9 | "bce": nn.BCELoss, 10 | "bce_logit": nn.BCEWithLogitsLoss, 11 | } 12 | 13 | 14 | def get_loss_func(loss_name): 15 | """ 16 | Retrieve the loss given the loss name. 17 | Args (int): 18 | loss_name: the name of the loss to use. 19 | """ 20 | if loss_name not in _LOSSES.keys(): 21 | raise NotImplementedError("Loss {} is not supported".format(loss_name)) 22 | return _LOSSES[loss_name] 23 | -------------------------------------------------------------------------------- /timesformer/models/nonlocal_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Non-local helper""" 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | 9 | class Nonlocal(nn.Module): 10 | """ 11 | Builds Non-local Neural Networks as a generic family of building 12 | blocks for capturing long-range dependencies. Non-local Network 13 | computes the response at a position as a weighted sum of the 14 | features at all positions. This building block can be plugged into 15 | many computer vision architectures. 16 | More details in the paper: https://arxiv.org/pdf/1711.07971.pdf 17 | """ 18 | 19 | def __init__( 20 | self, 21 | dim, 22 | dim_inner, 23 | pool_size=None, 24 | instantiation="softmax", 25 | zero_init_final_conv=False, 26 | zero_init_final_norm=True, 27 | norm_eps=1e-5, 28 | norm_momentum=0.1, 29 | norm_module=nn.BatchNorm3d, 30 | ): 31 | """ 32 | Args: 33 | dim (int): number of dimension for the input. 34 | dim_inner (int): number of dimension inside of the Non-local block. 35 | pool_size (list): the kernel size of spatial temporal pooling, 36 | temporal pool kernel size, spatial pool kernel size, spatial 37 | pool kernel size in order. By default pool_size is None, 38 | then there would be no pooling used. 39 | instantiation (string): supports two different instantiation method: 40 | "dot_product": normalizing correlation matrix with L2. 41 | "softmax": normalizing correlation matrix with Softmax. 42 | zero_init_final_conv (bool): If true, zero initializing the final 43 | convolution of the Non-local block. 44 | zero_init_final_norm (bool): 45 | If true, zero initializing the final batch norm of the Non-local 46 | block. 47 | norm_module (nn.Module): nn.Module for the normalization layer. 
The 48 | default is nn.BatchNorm3d. 49 | """ 50 | super(Nonlocal, self).__init__() 51 | self.dim = dim 52 | self.dim_inner = dim_inner 53 | self.pool_size = pool_size 54 | self.instantiation = instantiation 55 | self.use_pool = ( 56 | False 57 | if pool_size is None 58 | else any((size > 1 for size in pool_size)) 59 | ) 60 | self.norm_eps = norm_eps 61 | self.norm_momentum = norm_momentum 62 | self._construct_nonlocal( 63 | zero_init_final_conv, zero_init_final_norm, norm_module 64 | ) 65 | 66 | def _construct_nonlocal( 67 | self, zero_init_final_conv, zero_init_final_norm, norm_module 68 | ): 69 | # Three convolution heads: theta, phi, and g. 70 | self.conv_theta = nn.Conv3d( 71 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 72 | ) 73 | self.conv_phi = nn.Conv3d( 74 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 75 | ) 76 | self.conv_g = nn.Conv3d( 77 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 78 | ) 79 | 80 | # Final convolution output. 81 | self.conv_out = nn.Conv3d( 82 | self.dim_inner, self.dim, kernel_size=1, stride=1, padding=0 83 | ) 84 | # Zero initializing the final convolution output. 85 | self.conv_out.zero_init = zero_init_final_conv 86 | 87 | # TODO: change the name to `norm` 88 | self.bn = norm_module( 89 | num_features=self.dim, 90 | eps=self.norm_eps, 91 | momentum=self.norm_momentum, 92 | ) 93 | # Zero initializing the final bn. 94 | self.bn.transform_final_bn = zero_init_final_norm 95 | 96 | # Optional to add the spatial-temporal pooling. 97 | if self.use_pool: 98 | self.pool = nn.MaxPool3d( 99 | kernel_size=self.pool_size, 100 | stride=self.pool_size, 101 | padding=[0, 0, 0], 102 | ) 103 | 104 | def forward(self, x): 105 | x_identity = x 106 | N, C, T, H, W = x.size() 107 | 108 | theta = self.conv_theta(x) 109 | 110 | # Perform temporal-spatial pooling to reduce the computation. 111 | if self.use_pool: 112 | x = self.pool(x) 113 | 114 | phi = self.conv_phi(x) 115 | g = self.conv_g(x) 116 | 117 | theta = theta.view(N, self.dim_inner, -1) 118 | phi = phi.view(N, self.dim_inner, -1) 119 | g = g.view(N, self.dim_inner, -1) 120 | 121 | # (N, C, TxHxW) * (N, C, TxHxW) => (N, TxHxW, TxHxW). 122 | theta_phi = torch.einsum("nct,ncp->ntp", (theta, phi)) 123 | # For original Non-local paper, there are two main ways to normalize 124 | # the affinity tensor: 125 | # 1) Softmax normalization (norm on exp). 126 | # 2) dot_product normalization. 127 | if self.instantiation == "softmax": 128 | # Normalizing the affinity tensor theta_phi before softmax. 129 | theta_phi = theta_phi * (self.dim_inner ** -0.5) 130 | theta_phi = nn.functional.softmax(theta_phi, dim=2) 131 | elif self.instantiation == "dot_product": 132 | spatial_temporal_dim = theta_phi.shape[2] 133 | theta_phi = theta_phi / spatial_temporal_dim 134 | else: 135 | raise NotImplementedError( 136 | "Unknown norm type {}".format(self.instantiation) 137 | ) 138 | 139 | # (N, TxHxW, TxHxW) * (N, C, TxHxW) => (N, C, TxHxW). 140 | theta_phi_g = torch.einsum("ntg,ncg->nct", (theta_phi, g)) 141 | 142 | # (N, C, TxHxW) => (N, C, T, H, W). 143 | theta_phi_g = theta_phi_g.view(N, self.dim_inner, T, H, W) 144 | 145 | p = self.conv_out(theta_phi_g) 146 | p = self.bn(p) 147 | return x_identity + p 148 | -------------------------------------------------------------------------------- /timesformer/models/operators.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
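# This module provides a Swish activation (x * sigmoid(x)), implemented as a
# custom autograd Function that stores only the input and recomputes
# sigmoid(x) in the backward pass, plus a Squeeze-and-Excitation (SE) block
# built on top of it.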
2 | 3 | """Custom operators.""" 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | 9 | class Swish(nn.Module): 10 | """Swish activation function: x * sigmoid(x).""" 11 | 12 | def __init__(self): 13 | super(Swish, self).__init__() 14 | 15 | def forward(self, x): 16 | return SwishEfficient.apply(x) 17 | 18 | 19 | class SwishEfficient(torch.autograd.Function): 20 | """Swish activation function: x * sigmoid(x).""" 21 | 22 | @staticmethod 23 | def forward(ctx, x): 24 | result = x * torch.sigmoid(x) 25 | ctx.save_for_backward(x) 26 | return result 27 | 28 | @staticmethod 29 | def backward(ctx, grad_output): 30 | x = ctx.saved_variables[0] 31 | sigmoid_x = torch.sigmoid(x) 32 | return grad_output * (sigmoid_x * (1 + x * (1 - sigmoid_x))) 33 | 34 | 35 | class SE(nn.Module): 36 | """Squeeze-and-Excitation (SE) block w/ Swish: AvgPool, FC, Swish, FC, Sigmoid.""" 37 | 38 | def _round_width(self, width, multiplier, min_width=8, divisor=8): 39 | """ 40 | Round width of filters based on width multiplier 41 | Args: 42 | width (int): the channel dimensions of the input. 43 | multiplier (float): the multiplication factor. 44 | min_width (int): the minimum width after multiplication. 45 | divisor (int): the new width should be dividable by divisor. 46 | """ 47 | if not multiplier: 48 | return width 49 | 50 | width *= multiplier 51 | min_width = min_width or divisor 52 | width_out = max( 53 | min_width, int(width + divisor / 2) // divisor * divisor 54 | ) 55 | if width_out < 0.9 * width: 56 | width_out += divisor 57 | return int(width_out) 58 | 59 | def __init__(self, dim_in, ratio, relu_act=True): 60 | """ 61 | Args: 62 | dim_in (int): the channel dimensions of the input. 63 | ratio (float): the channel reduction ratio for squeeze. 64 | relu_act (bool): whether to use ReLU activation instead 65 | of Swish (default). 66 | divisor (int): the new width should be dividable by divisor. 67 | """ 68 | super(SE, self).__init__() 69 | self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) 70 | dim_fc = self._round_width(dim_in, ratio) 71 | self.fc1 = nn.Conv3d(dim_in, dim_fc, 1, bias=True) 72 | self.fc1_act = nn.ReLU() if relu_act else Swish() 73 | self.fc2 = nn.Conv3d(dim_fc, dim_in, 1, bias=True) 74 | 75 | self.fc2_sig = nn.Sigmoid() 76 | 77 | def forward(self, x): 78 | x_in = x 79 | for module in self.children(): 80 | x = module(x) 81 | return x_in * x 82 | -------------------------------------------------------------------------------- /timesformer/models/optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Optimizer.""" 4 | 5 | import torch 6 | 7 | import timesformer.utils.lr_policy as lr_policy 8 | 9 | 10 | def construct_optimizer(model, cfg): 11 | """ 12 | Construct a stochastic gradient descent or ADAM optimizer with momentum. 13 | Details can be found in: 14 | Herbert Robbins, and Sutton Monro. "A stochastic approximation method." 15 | and 16 | Diederik P.Kingma, and Jimmy Ba. 17 | "Adam: A Method for Stochastic Optimization." 18 | 19 | Args: 20 | model (model): model to perform stochastic gradient descent 21 | optimization or ADAM optimization. 22 | cfg (config): configs of hyper-parameters of SGD or ADAM, includes base 23 | learning rate, momentum, weight_decay, dampening, and etc. 24 | """ 25 | # Batchnorm parameters. 26 | bn_params = [] 27 | # Non-batchnorm parameters. 
28 | non_bn_parameters = [] 29 | for name, p in model.named_parameters(): 30 | if "bn" in name: 31 | bn_params.append(p) 32 | else: 33 | non_bn_parameters.append(p) 34 | # Apply different weight decay to Batchnorm and non-batchnorm parameters. 35 | # In Caffe2 classification codebase the weight decay for batchnorm is 0.0. 36 | # Having a different weight decay on batchnorm might cause a performance 37 | # drop. 38 | optim_params = [ 39 | {"params": bn_params, "weight_decay": cfg.BN.WEIGHT_DECAY}, 40 | {"params": non_bn_parameters, "weight_decay": cfg.SOLVER.WEIGHT_DECAY}, 41 | ] 42 | # Check all parameters will be passed into optimizer. 43 | assert len(list(model.parameters())) == len(non_bn_parameters) + len( 44 | bn_params 45 | ), "parameter size does not match: {} + {} != {}".format( 46 | len(non_bn_parameters), len(bn_params), len(list(model.parameters())) 47 | ) 48 | 49 | if cfg.SOLVER.OPTIMIZING_METHOD == "sgd": 50 | return torch.optim.SGD( 51 | optim_params, 52 | lr=cfg.SOLVER.BASE_LR, 53 | momentum=cfg.SOLVER.MOMENTUM, 54 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 55 | dampening=cfg.SOLVER.DAMPENING, 56 | nesterov=cfg.SOLVER.NESTEROV, 57 | ) 58 | elif cfg.SOLVER.OPTIMIZING_METHOD == "adam": 59 | return torch.optim.Adam( 60 | optim_params, 61 | lr=cfg.SOLVER.BASE_LR, 62 | betas=(0.9, 0.999), 63 | eps=1e-08, 64 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 65 | ) 66 | elif cfg.SOLVER.OPTIMIZING_METHOD == "adamw": 67 | return torch.optim.AdamW( 68 | optim_params, 69 | lr=cfg.SOLVER.BASE_LR, 70 | betas=(0.9, 0.999), 71 | eps=1e-08, 72 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 73 | ) 74 | else: 75 | raise NotImplementedError( 76 | "Does not support {} optimizer".format(cfg.SOLVER.OPTIMIZING_METHOD) 77 | ) 78 | 79 | 80 | def get_epoch_lr(cur_epoch, cfg): 81 | """ 82 | Retrieves the lr for the given epoch (as specified by the lr policy). 83 | Args: 84 | cfg (config): configs of hyper-parameters of ADAM, includes base 85 | learning rate, betas, and weight decays. 86 | cur_epoch (float): the number of epoch of the current training stage. 87 | """ 88 | return lr_policy.get_lr_at_epoch(cfg, cur_epoch) 89 | 90 | 91 | def set_lr(optimizer, new_lr): 92 | """ 93 | Sets the optimizer lr to the specified value. 94 | Args: 95 | optimizer (optim): the optimizer using to optimize the current network. 96 | new_lr (float): the new learning rate to set. 97 | """ 98 | for param_group in optimizer.param_groups: 99 | param_group["lr"] = new_lr 100 | -------------------------------------------------------------------------------- /timesformer/models/stem_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """ResNe(X)t 3D stem helper.""" 4 | 5 | import torch.nn as nn 6 | 7 | 8 | def get_stem_func(name): 9 | """ 10 | Retrieves the stem module by name. 11 | """ 12 | trans_funcs = {"x3d_stem": X3DStem, "basic_stem": ResNetBasicStem} 13 | assert ( 14 | name in trans_funcs.keys() 15 | ), "Transformation function '{}' not supported".format(name) 16 | return trans_funcs[name] 17 | 18 | 19 | class VideoModelStem(nn.Module): 20 | """ 21 | Video 3D stem module. Provides stem operations of Conv, BN, ReLU, MaxPool 22 | on input data tensor for one or multiple pathways. 
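    Example (illustrative single-pathway values, not from a provided config):
        >>> import torch
        >>> stem = VideoModelStem(
        ...     dim_in=[3], dim_out=[64],
        ...     kernel=[[5, 7, 7]], stride=[[1, 2, 2]], padding=[[2, 3, 3]],
        ... )
        >>> out = stem([torch.rand(2, 3, 8, 224, 224)])
        >>> out[0].shape
        torch.Size([2, 64, 8, 56, 56])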
23 | """ 24 | 25 | def __init__( 26 | self, 27 | dim_in, 28 | dim_out, 29 | kernel, 30 | stride, 31 | padding, 32 | inplace_relu=True, 33 | eps=1e-5, 34 | bn_mmt=0.1, 35 | norm_module=nn.BatchNorm3d, 36 | stem_func_name="basic_stem", 37 | ): 38 | """ 39 | The `__init__` method of any subclass should also contain these 40 | arguments. List size of 1 for single pathway models (C2D, I3D, Slow 41 | and etc), list size of 2 for two pathway models (SlowFast). 42 | 43 | Args: 44 | dim_in (list): the list of channel dimensions of the inputs. 45 | dim_out (list): the output dimension of the convolution in the stem 46 | layer. 47 | kernel (list): the kernels' size of the convolutions in the stem 48 | layers. Temporal kernel size, height kernel size, width kernel 49 | size in order. 50 | stride (list): the stride sizes of the convolutions in the stem 51 | layer. Temporal kernel stride, height kernel size, width kernel 52 | size in order. 53 | padding (list): the paddings' sizes of the convolutions in the stem 54 | layer. Temporal padding size, height padding size, width padding 55 | size in order. 56 | inplace_relu (bool): calculate the relu on the original input 57 | without allocating new memory. 58 | eps (float): epsilon for batch norm. 59 | bn_mmt (float): momentum for batch norm. Noted that BN momentum in 60 | PyTorch = 1 - BN momentum in Caffe2. 61 | norm_module (nn.Module): nn.Module for the normalization layer. The 62 | default is nn.BatchNorm3d. 63 | stem_func_name (string): name of the the stem function applied on 64 | input to the network. 65 | """ 66 | super(VideoModelStem, self).__init__() 67 | 68 | assert ( 69 | len( 70 | { 71 | len(dim_in), 72 | len(dim_out), 73 | len(kernel), 74 | len(stride), 75 | len(padding), 76 | } 77 | ) 78 | == 1 79 | ), "Input pathway dimensions are not consistent." 80 | self.num_pathways = len(dim_in) 81 | self.kernel = kernel 82 | self.stride = stride 83 | self.padding = padding 84 | self.inplace_relu = inplace_relu 85 | self.eps = eps 86 | self.bn_mmt = bn_mmt 87 | # Construct the stem layer. 88 | self._construct_stem(dim_in, dim_out, norm_module, stem_func_name) 89 | 90 | def _construct_stem(self, dim_in, dim_out, norm_module, stem_func_name): 91 | trans_func = get_stem_func(stem_func_name) 92 | 93 | for pathway in range(len(dim_in)): 94 | stem = trans_func( 95 | dim_in[pathway], 96 | dim_out[pathway], 97 | self.kernel[pathway], 98 | self.stride[pathway], 99 | self.padding[pathway], 100 | self.inplace_relu, 101 | self.eps, 102 | self.bn_mmt, 103 | norm_module, 104 | ) 105 | self.add_module("pathway{}_stem".format(pathway), stem) 106 | 107 | def forward(self, x): 108 | assert ( 109 | len(x) == self.num_pathways 110 | ), "Input tensor does not contain {} pathway".format(self.num_pathways) 111 | for pathway in range(len(x)): 112 | m = getattr(self, "pathway{}_stem".format(pathway)) 113 | x[pathway] = m(x[pathway]) 114 | return x 115 | 116 | 117 | class ResNetBasicStem(nn.Module): 118 | """ 119 | ResNe(X)t 3D stem module. 120 | Performs spatiotemporal Convolution, BN, and Relu following by a 121 | spatiotemporal pooling. 122 | """ 123 | 124 | def __init__( 125 | self, 126 | dim_in, 127 | dim_out, 128 | kernel, 129 | stride, 130 | padding, 131 | inplace_relu=True, 132 | eps=1e-5, 133 | bn_mmt=0.1, 134 | norm_module=nn.BatchNorm3d, 135 | ): 136 | """ 137 | The `__init__` method of any subclass should also contain these arguments. 138 | 139 | Args: 140 | dim_in (int): the channel dimension of the input. 
Normally 3 is used 141 | for rgb input, and 2 or 3 is used for optical flow input. 142 | dim_out (int): the output dimension of the convolution in the stem 143 | layer. 144 | kernel (list): the kernel size of the convolution in the stem layer. 145 | temporal kernel size, height kernel size, width kernel size in 146 | order. 147 | stride (list): the stride size of the convolution in the stem layer. 148 | temporal kernel stride, height kernel size, width kernel size in 149 | order. 150 | padding (int): the padding size of the convolution in the stem 151 | layer, temporal padding size, height padding size, width 152 | padding size in order. 153 | inplace_relu (bool): calculate the relu on the original input 154 | without allocating new memory. 155 | eps (float): epsilon for batch norm. 156 | bn_mmt (float): momentum for batch norm. Noted that BN momentum in 157 | PyTorch = 1 - BN momentum in Caffe2. 158 | norm_module (nn.Module): nn.Module for the normalization layer. The 159 | default is nn.BatchNorm3d. 160 | """ 161 | super(ResNetBasicStem, self).__init__() 162 | self.kernel = kernel 163 | self.stride = stride 164 | self.padding = padding 165 | self.inplace_relu = inplace_relu 166 | self.eps = eps 167 | self.bn_mmt = bn_mmt 168 | # Construct the stem layer. 169 | self._construct_stem(dim_in, dim_out, norm_module) 170 | 171 | def _construct_stem(self, dim_in, dim_out, norm_module): 172 | self.conv = nn.Conv3d( 173 | dim_in, 174 | dim_out, 175 | self.kernel, 176 | stride=self.stride, 177 | padding=self.padding, 178 | bias=False, 179 | ) 180 | self.bn = norm_module( 181 | num_features=dim_out, eps=self.eps, momentum=self.bn_mmt 182 | ) 183 | self.relu = nn.ReLU(self.inplace_relu) 184 | self.pool_layer = nn.MaxPool3d( 185 | kernel_size=[1, 3, 3], stride=[1, 2, 2], padding=[0, 1, 1] 186 | ) 187 | 188 | def forward(self, x): 189 | x = self.conv(x) 190 | x = self.bn(x) 191 | x = self.relu(x) 192 | x = self.pool_layer(x) 193 | return x 194 | 195 | 196 | class X3DStem(nn.Module): 197 | """ 198 | X3D's 3D stem module. 199 | Performs a spatial followed by a depthwise temporal Convolution, BN, and Relu following by a 200 | spatiotemporal pooling. 201 | """ 202 | 203 | def __init__( 204 | self, 205 | dim_in, 206 | dim_out, 207 | kernel, 208 | stride, 209 | padding, 210 | inplace_relu=True, 211 | eps=1e-5, 212 | bn_mmt=0.1, 213 | norm_module=nn.BatchNorm3d, 214 | ): 215 | """ 216 | The `__init__` method of any subclass should also contain these arguments. 217 | 218 | Args: 219 | dim_in (int): the channel dimension of the input. Normally 3 is used 220 | for rgb input, and 2 or 3 is used for optical flow input. 221 | dim_out (int): the output dimension of the convolution in the stem 222 | layer. 223 | kernel (list): the kernel size of the convolution in the stem layer. 224 | temporal kernel size, height kernel size, width kernel size in 225 | order. 226 | stride (list): the stride size of the convolution in the stem layer. 227 | temporal kernel stride, height kernel size, width kernel size in 228 | order. 229 | padding (int): the padding size of the convolution in the stem 230 | layer, temporal padding size, height padding size, width 231 | padding size in order. 232 | inplace_relu (bool): calculate the relu on the original input 233 | without allocating new memory. 234 | eps (float): epsilon for batch norm. 235 | bn_mmt (float): momentum for batch norm. Noted that BN momentum in 236 | PyTorch = 1 - BN momentum in Caffe2. 237 | norm_module (nn.Module): nn.Module for the normalization layer. 
The 238 | default is nn.BatchNorm3d. 239 | """ 240 | super(X3DStem, self).__init__() 241 | self.kernel = kernel 242 | self.stride = stride 243 | self.padding = padding 244 | self.inplace_relu = inplace_relu 245 | self.eps = eps 246 | self.bn_mmt = bn_mmt 247 | # Construct the stem layer. 248 | self._construct_stem(dim_in, dim_out, norm_module) 249 | 250 | def _construct_stem(self, dim_in, dim_out, norm_module): 251 | self.conv_xy = nn.Conv3d( 252 | dim_in, 253 | dim_out, 254 | kernel_size=(1, self.kernel[1], self.kernel[2]), 255 | stride=(1, self.stride[1], self.stride[2]), 256 | padding=(0, self.padding[1], self.padding[2]), 257 | bias=False, 258 | ) 259 | self.conv = nn.Conv3d( 260 | dim_out, 261 | dim_out, 262 | kernel_size=(self.kernel[0], 1, 1), 263 | stride=(self.stride[0], 1, 1), 264 | padding=(self.padding[0], 0, 0), 265 | bias=False, 266 | groups=dim_out, 267 | ) 268 | 269 | self.bn = norm_module( 270 | num_features=dim_out, eps=self.eps, momentum=self.bn_mmt 271 | ) 272 | self.relu = nn.ReLU(self.inplace_relu) 273 | 274 | def forward(self, x): 275 | x = self.conv_xy(x) 276 | x = self.conv(x) 277 | x = self.bn(x) 278 | x = self.relu(x) 279 | return x 280 | -------------------------------------------------------------------------------- /timesformer/models/vit_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Ross Wightman 2 | # Various utility functions 3 | 4 | import torch 5 | import torch.nn as nn 6 | from functools import partial 7 | import math 8 | import warnings 9 | import torch.nn.functional as F 10 | 11 | from timesformer.models.helpers import load_pretrained 12 | from .build import MODEL_REGISTRY 13 | from itertools import repeat 14 | from torch._six import container_abcs 15 | 16 | DEFAULT_CROP_PCT = 0.875 17 | IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) 18 | IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) 19 | IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) 20 | IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) 21 | IMAGENET_DPN_MEAN = (124 / 255, 117 / 255, 104 / 255) 22 | IMAGENET_DPN_STD = tuple([1 / (.0167 * 255)] * 3) 23 | 24 | def _no_grad_trunc_normal_(tensor, mean, std, a, b): 25 | def norm_cdf(x): 26 | # Computes standard normal cumulative distribution function 27 | return (1. + math.erf(x / math.sqrt(2.))) / 2. 28 | 29 | if (mean < a - 2 * std) or (mean > b + 2 * std): 30 | warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " 31 | "The distribution of values may be incorrect.", 32 | stacklevel=2) 33 | 34 | with torch.no_grad(): 35 | # Values are generated by using a truncated uniform distribution and 36 | # then using the inverse CDF for the normal distribution. 37 | # Get upper and lower cdf values 38 | l = norm_cdf((a - mean) / std) 39 | u = norm_cdf((b - mean) / std) 40 | 41 | # Uniformly fill tensor with values from [l, u], then translate to 42 | # [2l-1, 2u-1]. 43 | tensor.uniform_(2 * l - 1, 2 * u - 1) 44 | 45 | # Use inverse cdf transform for normal distribution to get truncated 46 | # standard normal 47 | tensor.erfinv_() 48 | 49 | # Transform to proper mean, std 50 | tensor.mul_(std * math.sqrt(2.)) 51 | tensor.add_(mean) 52 | 53 | # Clamp to ensure it's in the proper range 54 | tensor.clamp_(min=a, max=b) 55 | return tensor 56 | 57 | def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): 58 | # type: (Tensor, float, float, float, float) -> Tensor 59 | r"""Fills the input Tensor with values drawn from a truncated 60 | normal distribution. 
The values are effectively drawn from the 61 | normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` 62 | with values outside :math:`[a, b]` redrawn until they are within 63 | the bounds. The method used for generating the random values works 64 | best when :math:`a \leq \text{mean} \leq b`. 65 | Args: 66 | tensor: an n-dimensional `torch.Tensor` 67 | mean: the mean of the normal distribution 68 | std: the standard deviation of the normal distribution 69 | a: the minimum cutoff value 70 | b: the maximum cutoff value 71 | Examples: 72 | >>> w = torch.empty(3, 5) 73 | >>> nn.init.trunc_normal_(w) 74 | """ 75 | return _no_grad_trunc_normal_(tensor, mean, std, a, b) 76 | 77 | # From PyTorch internals 78 | def _ntuple(n): 79 | def parse(x): 80 | if isinstance(x, container_abcs.Iterable): 81 | return x 82 | return tuple(repeat(x, n)) 83 | return parse 84 | to_2tuple = _ntuple(2) 85 | 86 | # Calculate symmetric padding for a convolution 87 | def get_padding(kernel_size: int, stride: int = 1, dilation: int = 1, **_) -> int: 88 | padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 89 | return padding 90 | 91 | def get_padding_value(padding, kernel_size, **kwargs): 92 | dynamic = False 93 | if isinstance(padding, str): 94 | # for any string padding, the padding will be calculated for you, one of three ways 95 | padding = padding.lower() 96 | if padding == 'same': 97 | # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact 98 | if is_static_pad(kernel_size, **kwargs): 99 | # static case, no extra overhead 100 | padding = get_padding(kernel_size, **kwargs) 101 | else: 102 | # dynamic 'SAME' padding, has runtime/GPU memory overhead 103 | padding = 0 104 | dynamic = True 105 | elif padding == 'valid': 106 | # 'VALID' padding, same as padding=0 107 | padding = 0 108 | else: 109 | # Default to PyTorch style 'same'-ish symmetric padding 110 | padding = get_padding(kernel_size, **kwargs) 111 | return padding, dynamic 112 | 113 | # Calculate asymmetric TensorFlow-like 'SAME' padding for a convolution 114 | def get_same_padding(x: int, k: int, s: int, d: int): 115 | return max((int(math.ceil(x // s)) - 1) * s + (k - 1) * d + 1 - x, 0) 116 | 117 | 118 | # Can SAME padding for given args be done statically? 119 | def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, **_): 120 | return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0 121 | 122 | 123 | # Dynamically pad input x with 'SAME' padding for conv with specified args 124 | #def pad_same(x, k: List[int], s: List[int], d: List[int] = (1, 1), value: float = 0): 125 | def pad_same(x, k, s, d=(1, 1), value= 0): 126 | ih, iw = x.size()[-2:] 127 | pad_h, pad_w = get_same_padding(ih, k[0], s[0], d[0]), get_same_padding(iw, k[1], s[1], d[1]) 128 | if pad_h > 0 or pad_w > 0: 129 | x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2], value=value) 130 | return x 131 | 132 | def adaptive_pool_feat_mult(pool_type='avg'): 133 | if pool_type == 'catavgmax': 134 | return 2 135 | else: 136 | return 1 137 | 138 | def drop_path(x, drop_prob: float = 0., training: bool = False): 139 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 140 | This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, 141 | the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... 142 | See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... 
I've opted for 143 | changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 144 | 'survival rate' as the argument. 145 | """ 146 | if drop_prob == 0. or not training: 147 | return x 148 | keep_prob = 1 - drop_prob 149 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 150 | random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) 151 | random_tensor.floor_() # binarize 152 | output = x.div(keep_prob) * random_tensor 153 | return output 154 | 155 | class DropPath(nn.Module): 156 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 157 | """ 158 | def __init__(self, drop_prob=None): 159 | super(DropPath, self).__init__() 160 | self.drop_prob = drop_prob 161 | 162 | def forward(self, x): 163 | return drop_path(x, self.drop_prob, self.training) 164 | -------------------------------------------------------------------------------- /timesformer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /timesformer/utils/ava_eval_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | ############################################################################## 15 | # 16 | # Based on: 17 | # -------------------------------------------------------- 18 | # ActivityNet 19 | # Copyright (c) 2015 ActivityNet 20 | # Licensed under The MIT License 21 | # [see https://github.com/activitynet/ActivityNet/blob/master/LICENSE for details] 22 | # -------------------------------------------------------- 23 | 24 | """Helper functions for AVA evaluation.""" 25 | 26 | from __future__ import ( 27 | absolute_import, 28 | division, 29 | print_function, 30 | unicode_literals, 31 | ) 32 | import csv 33 | import logging 34 | import numpy as np 35 | import pprint 36 | import time 37 | from collections import defaultdict 38 | from fvcore.common.file_io import PathManager 39 | import timesformer.utils.distributed as du 40 | 41 | from timesformer.utils.ava_evaluation import ( 42 | object_detection_evaluation, 43 | standard_fields, 44 | ) 45 | 46 | logger = logging.getLogger(__name__) 47 | 48 | 49 | def make_image_key(video_id, timestamp): 50 | """Returns a unique identifier for a video id & timestamp.""" 51 | return "%s,%04d" % (video_id, int(timestamp)) 52 | 53 | 54 | def read_csv(csv_file, class_whitelist=None, load_score=False): 55 | """Loads boxes and class labels from a CSV file in the AVA format. 56 | CSV file format described at https://research.google.com/ava/download.html. 57 | Args: 58 | csv_file: A file object. 
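            Each row must contain 7 or 8 comma-separated values:
            video_id, timestamp, x1, y1, x2, y2, action_id and, optionally,
            a score (only read when load_score is True).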
59 | class_whitelist: If provided, boxes corresponding to (integer) class labels 60 | not in this set are skipped. 61 | Returns: 62 | boxes: A dictionary mapping each unique image key (string) to a list of 63 | boxes, given as coordinates [y1, x1, y2, x2]. 64 | labels: A dictionary mapping each unique image key (string) to a list of 65 | integer class lables, matching the corresponding box in `boxes`. 66 | scores: A dictionary mapping each unique image key (string) to a list of 67 | score values lables, matching the corresponding label in `labels`. If 68 | scores are not provided in the csv, then they will default to 1.0. 69 | """ 70 | boxes = defaultdict(list) 71 | labels = defaultdict(list) 72 | scores = defaultdict(list) 73 | with PathManager.open(csv_file, "r") as f: 74 | reader = csv.reader(f) 75 | for row in reader: 76 | assert len(row) in [7, 8], "Wrong number of columns: " + row 77 | image_key = make_image_key(row[0], row[1]) 78 | x1, y1, x2, y2 = [float(n) for n in row[2:6]] 79 | action_id = int(row[6]) 80 | if class_whitelist and action_id not in class_whitelist: 81 | continue 82 | score = 1.0 83 | if load_score: 84 | score = float(row[7]) 85 | boxes[image_key].append([y1, x1, y2, x2]) 86 | labels[image_key].append(action_id) 87 | scores[image_key].append(score) 88 | return boxes, labels, scores 89 | 90 | 91 | def read_exclusions(exclusions_file): 92 | """Reads a CSV file of excluded timestamps. 93 | Args: 94 | exclusions_file: A file object containing a csv of video-id,timestamp. 95 | Returns: 96 | A set of strings containing excluded image keys, e.g. "aaaaaaaaaaa,0904", 97 | or an empty set if exclusions file is None. 98 | """ 99 | excluded = set() 100 | if exclusions_file: 101 | with PathManager.open(exclusions_file, "r") as f: 102 | reader = csv.reader(f) 103 | for row in reader: 104 | assert len(row) == 2, "Expected only 2 columns, got: " + row 105 | excluded.add(make_image_key(row[0], row[1])) 106 | return excluded 107 | 108 | 109 | def read_labelmap(labelmap_file): 110 | """Read label map and class ids.""" 111 | 112 | labelmap = [] 113 | class_ids = set() 114 | name = "" 115 | class_id = "" 116 | with PathManager.open(labelmap_file, "r") as f: 117 | for line in f: 118 | if line.startswith(" name:"): 119 | name = line.split('"')[1] 120 | elif line.startswith(" id:") or line.startswith(" label_id:"): 121 | class_id = int(line.strip().split(" ")[-1]) 122 | labelmap.append({"id": class_id, "name": name}) 123 | class_ids.add(class_id) 124 | return labelmap, class_ids 125 | 126 | 127 | def evaluate_ava_from_files(labelmap, groundtruth, detections, exclusions): 128 | """Run AVA evaluation given annotation/prediction files.""" 129 | 130 | categories, class_whitelist = read_labelmap(labelmap) 131 | excluded_keys = read_exclusions(exclusions) 132 | groundtruth = read_csv(groundtruth, class_whitelist, load_score=False) 133 | detections = read_csv(detections, class_whitelist, load_score=True) 134 | run_evaluation(categories, groundtruth, detections, excluded_keys) 135 | 136 | 137 | def evaluate_ava( 138 | preds, 139 | original_boxes, 140 | metadata, 141 | excluded_keys, 142 | class_whitelist, 143 | categories, 144 | groundtruth=None, 145 | video_idx_to_name=None, 146 | name="latest", 147 | ): 148 | """Run AVA evaluation given numpy arrays.""" 149 | 150 | eval_start = time.time() 151 | 152 | detections = get_ava_eval_data( 153 | preds, 154 | original_boxes, 155 | metadata, 156 | class_whitelist, 157 | video_idx_to_name=video_idx_to_name, 158 | ) 159 | 160 | logger.info("Evaluating with 
%d unique GT frames." % len(groundtruth[0])) 161 | logger.info( 162 | "Evaluating with %d unique detection frames" % len(detections[0]) 163 | ) 164 | 165 | write_results(detections, "detections_%s.csv" % name) 166 | write_results(groundtruth, "groundtruth_%s.csv" % name) 167 | 168 | results = run_evaluation(categories, groundtruth, detections, excluded_keys) 169 | 170 | logger.info("AVA eval done in %f seconds." % (time.time() - eval_start)) 171 | return results["PascalBoxes_Precision/mAP@0.5IOU"] 172 | 173 | 174 | def run_evaluation( 175 | categories, groundtruth, detections, excluded_keys, verbose=True 176 | ): 177 | """AVA evaluation main logic.""" 178 | 179 | pascal_evaluator = object_detection_evaluation.PascalDetectionEvaluator( 180 | categories 181 | ) 182 | 183 | boxes, labels, _ = groundtruth 184 | 185 | gt_keys = [] 186 | pred_keys = [] 187 | 188 | for image_key in boxes: 189 | if image_key in excluded_keys: 190 | logging.info( 191 | ( 192 | "Found excluded timestamp in ground truth: %s. " 193 | "It will be ignored." 194 | ), 195 | image_key, 196 | ) 197 | continue 198 | pascal_evaluator.add_single_ground_truth_image_info( 199 | image_key, 200 | { 201 | standard_fields.InputDataFields.groundtruth_boxes: np.array( 202 | boxes[image_key], dtype=float 203 | ), 204 | standard_fields.InputDataFields.groundtruth_classes: np.array( 205 | labels[image_key], dtype=int 206 | ), 207 | standard_fields.InputDataFields.groundtruth_difficult: np.zeros( 208 | len(boxes[image_key]), dtype=bool 209 | ), 210 | }, 211 | ) 212 | 213 | gt_keys.append(image_key) 214 | 215 | boxes, labels, scores = detections 216 | 217 | for image_key in boxes: 218 | if image_key in excluded_keys: 219 | logging.info( 220 | ( 221 | "Found excluded timestamp in detections: %s. " 222 | "It will be ignored." 223 | ), 224 | image_key, 225 | ) 226 | continue 227 | pascal_evaluator.add_single_detected_image_info( 228 | image_key, 229 | { 230 | standard_fields.DetectionResultFields.detection_boxes: np.array( 231 | boxes[image_key], dtype=float 232 | ), 233 | standard_fields.DetectionResultFields.detection_classes: np.array( 234 | labels[image_key], dtype=int 235 | ), 236 | standard_fields.DetectionResultFields.detection_scores: np.array( 237 | scores[image_key], dtype=float 238 | ), 239 | }, 240 | ) 241 | 242 | pred_keys.append(image_key) 243 | 244 | metrics = pascal_evaluator.evaluate() 245 | 246 | if du.is_master_proc(): 247 | pprint.pprint(metrics, indent=2) 248 | return metrics 249 | 250 | 251 | def get_ava_eval_data( 252 | scores, 253 | boxes, 254 | metadata, 255 | class_whitelist, 256 | verbose=False, 257 | video_idx_to_name=None, 258 | ): 259 | """ 260 | Convert our data format into the data format used in official AVA 261 | evaluation. 262 | """ 263 | 264 | out_scores = defaultdict(list) 265 | out_labels = defaultdict(list) 266 | out_boxes = defaultdict(list) 267 | count = 0 268 | for i in range(scores.shape[0]): 269 | video_idx = int(np.round(metadata[i][0])) 270 | sec = int(np.round(metadata[i][1])) 271 | 272 | video = video_idx_to_name[video_idx] 273 | 274 | key = video + "," + "%04d" % (sec) 275 | batch_box = boxes[i].tolist() 276 | # The first is batch idx. 
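        # The reindexing below keeps the batch index first and swaps each
        # (x, y) coordinate pair, converting the remaining four values to the
        # [y1, x1, y2, x2] order used by the AVA evaluation code; the batch
        # index itself is dropped later via batch_box[1:].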
277 | batch_box = [batch_box[j] for j in [0, 2, 1, 4, 3]] 278 | 279 | one_scores = scores[i].tolist() 280 | for cls_idx, score in enumerate(one_scores): 281 | if cls_idx + 1 in class_whitelist: 282 | out_scores[key].append(score) 283 | out_labels[key].append(cls_idx + 1) 284 | out_boxes[key].append(batch_box[1:]) 285 | count += 1 286 | 287 | return out_boxes, out_labels, out_scores 288 | 289 | 290 | def write_results(detections, filename): 291 | """Write prediction results into official formats.""" 292 | start = time.time() 293 | 294 | boxes, labels, scores = detections 295 | with PathManager.open(filename, "w") as f: 296 | for key in boxes.keys(): 297 | for box, label, score in zip(boxes[key], labels[key], scores[key]): 298 | f.write( 299 | "%s,%.03f,%.03f,%.03f,%.03f,%d,%.04f\n" 300 | % (key, box[1], box[0], box[3], box[2], label, score) 301 | ) 302 | 303 | logger.info("AVA results wrote to %s" % filename) 304 | logger.info("\ttook %d seconds." % (time.time() - start)) 305 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/README.md: -------------------------------------------------------------------------------- 1 | The code under this folder is from the official [ActivityNet repo](https://github.com/activitynet/ActivityNet). 2 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/TimeSformer/a5ef29a7b7264baff199a30b3306ac27de901133/timesformer/utils/ava_evaluation/__init__.py -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/ava_action_list_v2.1_for_activitynet_2018.pbtxt.txt: -------------------------------------------------------------------------------- 1 | item { 2 | name: "bend/bow (at the waist)" 3 | id: 1 4 | } 5 | item { 6 | name: "crouch/kneel" 7 | id: 3 8 | } 9 | item { 10 | name: "dance" 11 | id: 4 12 | } 13 | item { 14 | name: "fall down" 15 | id: 5 16 | } 17 | item { 18 | name: "get up" 19 | id: 6 20 | } 21 | item { 22 | name: "jump/leap" 23 | id: 7 24 | } 25 | item { 26 | name: "lie/sleep" 27 | id: 8 28 | } 29 | item { 30 | name: "martial art" 31 | id: 9 32 | } 33 | item { 34 | name: "run/jog" 35 | id: 10 36 | } 37 | item { 38 | name: "sit" 39 | id: 11 40 | } 41 | item { 42 | name: "stand" 43 | id: 12 44 | } 45 | item { 46 | name: "swim" 47 | id: 13 48 | } 49 | item { 50 | name: "walk" 51 | id: 14 52 | } 53 | item { 54 | name: "answer phone" 55 | id: 15 56 | } 57 | item { 58 | name: "carry/hold (an object)" 59 | id: 17 60 | } 61 | item { 62 | name: "climb (e.g., a mountain)" 63 | id: 20 64 | } 65 | item { 66 | name: "close (e.g., a door, a box)" 67 | id: 22 68 | } 69 | item { 70 | name: "cut" 71 | id: 24 72 | } 73 | item { 74 | name: "dress/put on clothing" 75 | id: 26 76 | } 77 | item { 78 | name: "drink" 79 | id: 27 80 | } 81 | item { 82 | name: "drive (e.g., a car, a truck)" 83 | id: 28 84 | } 85 | item { 86 | name: "eat" 87 | id: 29 88 | } 89 | item { 90 | name: "enter" 91 | id: 30 92 | } 93 | item { 94 | name: "hit (an object)" 95 | id: 34 96 | } 97 | item { 98 | name: "lift/pick up" 99 | id: 36 100 | } 101 | item { 102 | name: "listen (e.g., to music)" 103 | id: 37 104 | } 105 | item { 106 | name: "open (e.g., a window, a car door)" 107 | id: 38 108 | } 109 | item { 110 | name: "play musical instrument" 111 | id: 41 112 | } 113 | 
item { 114 | name: "point to (an object)" 115 | id: 43 116 | } 117 | item { 118 | name: "pull (an object)" 119 | id: 45 120 | } 121 | item { 122 | name: "push (an object)" 123 | id: 46 124 | } 125 | item { 126 | name: "put down" 127 | id: 47 128 | } 129 | item { 130 | name: "read" 131 | id: 48 132 | } 133 | item { 134 | name: "ride (e.g., a bike, a car, a horse)" 135 | id: 49 136 | } 137 | item { 138 | name: "sail boat" 139 | id: 51 140 | } 141 | item { 142 | name: "shoot" 143 | id: 52 144 | } 145 | item { 146 | name: "smoke" 147 | id: 54 148 | } 149 | item { 150 | name: "take a photo" 151 | id: 56 152 | } 153 | item { 154 | name: "text on/look at a cellphone" 155 | id: 57 156 | } 157 | item { 158 | name: "throw" 159 | id: 58 160 | } 161 | item { 162 | name: "touch (an object)" 163 | id: 59 164 | } 165 | item { 166 | name: "turn (e.g., a screwdriver)" 167 | id: 60 168 | } 169 | item { 170 | name: "watch (e.g., TV)" 171 | id: 61 172 | } 173 | item { 174 | name: "work on a computer" 175 | id: 62 176 | } 177 | item { 178 | name: "write" 179 | id: 63 180 | } 181 | item { 182 | name: "fight/hit (a person)" 183 | id: 64 184 | } 185 | item { 186 | name: "give/serve (an object) to (a person)" 187 | id: 65 188 | } 189 | item { 190 | name: "grab (a person)" 191 | id: 66 192 | } 193 | item { 194 | name: "hand clap" 195 | id: 67 196 | } 197 | item { 198 | name: "hand shake" 199 | id: 68 200 | } 201 | item { 202 | name: "hand wave" 203 | id: 69 204 | } 205 | item { 206 | name: "hug (a person)" 207 | id: 70 208 | } 209 | item { 210 | name: "kiss (a person)" 211 | id: 72 212 | } 213 | item { 214 | name: "lift (a person)" 215 | id: 73 216 | } 217 | item { 218 | name: "listen to (a person)" 219 | id: 74 220 | } 221 | item { 222 | name: "push (another person)" 223 | id: 76 224 | } 225 | item { 226 | name: "sing to (e.g., self, a person, a group)" 227 | id: 77 228 | } 229 | item { 230 | name: "take (an object) from (a person)" 231 | id: 78 232 | } 233 | item { 234 | name: "talk to (e.g., self, a person, a group)" 235 | id: 79 236 | } 237 | item { 238 | name: "watch (a person)" 239 | id: 80 240 | } 241 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/label_map_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Label map utility functions.""" 16 | 17 | from __future__ import ( 18 | absolute_import, 19 | division, 20 | print_function, 21 | unicode_literals, 22 | ) 23 | import logging 24 | 25 | # from google.protobuf import text_format 26 | # from google3.third_party.tensorflow_models.object_detection.protos import string_int_label_map_pb2 27 | 28 | 29 | def _validate_label_map(label_map): 30 | """Checks if a label map is valid. 
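    Currently the only check performed is that every item id is >= 1.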
31 | 32 | Args: 33 | label_map: StringIntLabelMap to validate. 34 | 35 | Raises: 36 | ValueError: if label map is invalid. 37 | """ 38 | for item in label_map.item: 39 | if item.id < 1: 40 | raise ValueError("Label map ids should be >= 1.") 41 | 42 | 43 | def create_category_index(categories): 44 | """Creates dictionary of COCO compatible categories keyed by category id. 45 | 46 | Args: 47 | categories: a list of dicts, each of which has the following keys: 48 | 'id': (required) an integer id uniquely identifying this category. 49 | 'name': (required) string representing category name 50 | e.g., 'cat', 'dog', 'pizza'. 51 | 52 | Returns: 53 | category_index: a dict containing the same entries as categories, but keyed 54 | by the 'id' field of each category. 55 | """ 56 | category_index = {} 57 | for cat in categories: 58 | category_index[cat["id"]] = cat 59 | return category_index 60 | 61 | 62 | def get_max_label_map_index(label_map): 63 | """Get maximum index in label map. 64 | 65 | Args: 66 | label_map: a StringIntLabelMapProto 67 | 68 | Returns: 69 | an integer 70 | """ 71 | return max([item.id for item in label_map.item]) 72 | 73 | 74 | def convert_label_map_to_categories( 75 | label_map, max_num_classes, use_display_name=True 76 | ): 77 | """Loads label map proto and returns categories list compatible with eval. 78 | 79 | This function loads a label map and returns a list of dicts, each of which 80 | has the following keys: 81 | 'id': (required) an integer id uniquely identifying this category. 82 | 'name': (required) string representing category name 83 | e.g., 'cat', 'dog', 'pizza'. 84 | We only allow class into the list if its id-label_id_offset is 85 | between 0 (inclusive) and max_num_classes (exclusive). 86 | If there are several items mapping to the same id in the label map, 87 | we will only keep the first one in the categories list. 88 | 89 | Args: 90 | label_map: a StringIntLabelMapProto or None. If None, a default categories 91 | list is created with max_num_classes categories. 92 | max_num_classes: maximum number of (consecutive) label indices to include. 93 | use_display_name: (boolean) choose whether to load 'display_name' field 94 | as category name. If False or if the display_name field does not exist, 95 | uses 'name' field as category names instead. 96 | Returns: 97 | categories: a list of dictionaries representing all possible categories. 98 | """ 99 | categories = [] 100 | list_of_ids_already_added = [] 101 | if not label_map: 102 | label_id_offset = 1 103 | for class_id in range(max_num_classes): 104 | categories.append( 105 | { 106 | "id": class_id + label_id_offset, 107 | "name": "category_{}".format(class_id + label_id_offset), 108 | } 109 | ) 110 | return categories 111 | for item in label_map.item: 112 | if not 0 < item.id <= max_num_classes: 113 | logging.info( 114 | "Ignore item %d since it falls outside of requested " 115 | "label range.", 116 | item.id, 117 | ) 118 | continue 119 | if use_display_name and item.HasField("display_name"): 120 | name = item.display_name 121 | else: 122 | name = item.name 123 | if item.id not in list_of_ids_already_added: 124 | list_of_ids_already_added.append(item.id) 125 | categories.append({"id": item.id, "name": name}) 126 | return categories 127 | 128 | 129 | def load_labelmap(path): 130 | """Loads label map proto. 131 | 132 | Args: 133 | path: path to StringIntLabelMap proto text file. 
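    Note:
        This relies on `text_format` and `string_int_label_map_pb2`, whose
        imports are commented out at the top of this file, so those protobuf
        dependencies must be restored before this function can run.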
134 | Returns: 135 | a StringIntLabelMapProto 136 | """ 137 | with open(path, "r") as fid: 138 | label_map_string = fid.read() 139 | label_map = string_int_label_map_pb2.StringIntLabelMap() 140 | try: 141 | text_format.Merge(label_map_string, label_map) 142 | except text_format.ParseError: 143 | label_map.ParseFromString(label_map_string) 144 | _validate_label_map(label_map) 145 | return label_map 146 | 147 | 148 | def get_label_map_dict(label_map_path, use_display_name=False): 149 | """Reads a label map and returns a dictionary of label names to id. 150 | 151 | Args: 152 | label_map_path: path to label_map. 153 | use_display_name: whether to use the label map items' display names as keys. 154 | 155 | Returns: 156 | A dictionary mapping label names to id. 157 | """ 158 | label_map = load_labelmap(label_map_path) 159 | label_map_dict = {} 160 | for item in label_map.item: 161 | if use_display_name: 162 | label_map_dict[item.display_name] = item.id 163 | else: 164 | label_map_dict[item.name] = item.id 165 | return label_map_dict 166 | 167 | 168 | def create_category_index_from_labelmap(label_map_path): 169 | """Reads a label map and returns a category index. 170 | 171 | Args: 172 | label_map_path: Path to `StringIntLabelMap` proto text file. 173 | 174 | Returns: 175 | A category index, which is a dictionary that maps integer ids to dicts 176 | containing categories, e.g. 177 | {1: {'id': 1, 'name': 'dog'}, 2: {'id': 2, 'name': 'cat'}, ...} 178 | """ 179 | label_map = load_labelmap(label_map_path) 180 | max_num_classes = max(item.id for item in label_map.item) 181 | categories = convert_label_map_to_categories(label_map, max_num_classes) 182 | return create_category_index(categories) 183 | 184 | 185 | def create_class_agnostic_category_index(): 186 | """Creates a category index with a single `object` class.""" 187 | return {1: {"id": 1, "name": "object"}} 188 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Functions for computing metrics like precision, recall, CorLoc and etc.""" 17 | from __future__ import division 18 | import numpy as np 19 | 20 | 21 | def compute_precision_recall(scores, labels, num_gt): 22 | """Compute precision and recall. 23 | 24 | Args: 25 | scores: A float numpy array representing detection score 26 | labels: A boolean numpy array representing true/false positive labels 27 | num_gt: Number of ground truth instances 28 | 29 | Raises: 30 | ValueError: if the input is not of the correct format 31 | 32 | Returns: 33 | precision: Fraction of positive instances over detected ones. This value is 34 | None if no ground truth labels are present. 
35 | recall: Fraction of detected positive instance over all positive instances. 36 | This value is None if no ground truth labels are present. 37 | 38 | """ 39 | if ( 40 | not isinstance(labels, np.ndarray) 41 | or labels.dtype != np.bool 42 | or len(labels.shape) != 1 43 | ): 44 | raise ValueError("labels must be single dimension bool numpy array") 45 | 46 | if not isinstance(scores, np.ndarray) or len(scores.shape) != 1: 47 | raise ValueError("scores must be single dimension numpy array") 48 | 49 | if num_gt < np.sum(labels): 50 | raise ValueError( 51 | "Number of true positives must be smaller than num_gt." 52 | ) 53 | 54 | if len(scores) != len(labels): 55 | raise ValueError("scores and labels must be of the same size.") 56 | 57 | if num_gt == 0: 58 | return None, None 59 | 60 | sorted_indices = np.argsort(scores) 61 | sorted_indices = sorted_indices[::-1] 62 | labels = labels.astype(int) 63 | true_positive_labels = labels[sorted_indices] 64 | false_positive_labels = 1 - true_positive_labels 65 | cum_true_positives = np.cumsum(true_positive_labels) 66 | cum_false_positives = np.cumsum(false_positive_labels) 67 | precision = cum_true_positives.astype(float) / ( 68 | cum_true_positives + cum_false_positives 69 | ) 70 | recall = cum_true_positives.astype(float) / num_gt 71 | return precision, recall 72 | 73 | 74 | def compute_average_precision(precision, recall): 75 | """Compute Average Precision according to the definition in VOCdevkit. 76 | 77 | Precision is modified to ensure that it does not decrease as recall 78 | decrease. 79 | 80 | Args: 81 | precision: A float [N, 1] numpy array of precisions 82 | recall: A float [N, 1] numpy array of recalls 83 | 84 | Raises: 85 | ValueError: if the input is not of the correct format 86 | 87 | Returns: 88 | average_precison: The area under the precision recall curve. NaN if 89 | precision and recall are None. 
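  Example:
    A minimal sketch with made-up precision/recall values; it assumes a NumPy
    version old enough that the `np.float` alias used in this file still
    exists.

      >>> import numpy as np
      >>> precision = np.array([1.0, 0.5])   # hypothetical detector output
      >>> recall = np.array([0.5, 1.0])
      >>> float(compute_average_precision(precision, recall))
      0.75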
90 | 91 | """ 92 | if precision is None: 93 | if recall is not None: 94 | raise ValueError("If precision is None, recall must also be None") 95 | return np.NAN 96 | 97 | if not isinstance(precision, np.ndarray) or not isinstance( 98 | recall, np.ndarray 99 | ): 100 | raise ValueError("precision and recall must be numpy array") 101 | if precision.dtype != np.float or recall.dtype != np.float: 102 | raise ValueError("input must be float numpy array.") 103 | if len(precision) != len(recall): 104 | raise ValueError("precision and recall must be of the same size.") 105 | if not precision.size: 106 | return 0.0 107 | if np.amin(precision) < 0 or np.amax(precision) > 1: 108 | raise ValueError("Precision must be in the range of [0, 1].") 109 | if np.amin(recall) < 0 or np.amax(recall) > 1: 110 | raise ValueError("recall must be in the range of [0, 1].") 111 | if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)): 112 | raise ValueError("recall must be a non-decreasing array") 113 | 114 | recall = np.concatenate([[0], recall, [1]]) 115 | precision = np.concatenate([[0], precision, [0]]) 116 | 117 | # Preprocess precision to be a non-decreasing array 118 | for i in range(len(precision) - 2, -1, -1): 119 | precision[i] = np.maximum(precision[i], precision[i + 1]) 120 | 121 | indices = np.where(recall[1:] != recall[:-1])[0] + 1 122 | average_precision = np.sum( 123 | (recall[indices] - recall[indices - 1]) * precision[indices] 124 | ) 125 | return average_precision 126 | 127 | 128 | def compute_cor_loc( 129 | num_gt_imgs_per_class, num_images_correctly_detected_per_class 130 | ): 131 | """Compute CorLoc according to the definition in the following paper. 132 | 133 | https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf 134 | 135 | Returns nans if there are no ground truth images for a class. 136 | 137 | Args: 138 | num_gt_imgs_per_class: 1D array, representing number of images containing 139 | at least one object instance of a particular class 140 | num_images_correctly_detected_per_class: 1D array, representing number of 141 | images that are correctly detected at least one object instance of a 142 | particular class 143 | 144 | Returns: 145 | corloc_per_class: A float numpy array represents the corloc score of each 146 | class 147 | """ 148 | # Divide by zero expected for classes with no gt examples. 149 | with np.errstate(divide="ignore", invalid="ignore"): 150 | return np.where( 151 | num_gt_imgs_per_class == 0, 152 | np.nan, 153 | num_images_correctly_detected_per_class / num_gt_imgs_per_class, 154 | ) 155 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/np_box_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """Numpy BoxList classes and functions.""" 17 | 18 | from __future__ import ( 19 | absolute_import, 20 | division, 21 | print_function, 22 | unicode_literals, 23 | ) 24 | import numpy as np 25 | 26 | 27 | class BoxList(object): 28 | """Box collection. 29 | 30 | BoxList represents a list of bounding boxes as numpy array, where each 31 | bounding box is represented as a row of 4 numbers, 32 | [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within a 33 | given list correspond to a single image. 34 | 35 | Optionally, users can add additional related fields (such as 36 | objectness/classification scores). 37 | """ 38 | 39 | def __init__(self, data): 40 | """Constructs box collection. 41 | 42 | Args: 43 | data: a numpy array of shape [N, 4] representing box coordinates 44 | 45 | Raises: 46 | ValueError: if bbox data is not a numpy array 47 | ValueError: if invalid dimensions for bbox data 48 | """ 49 | if not isinstance(data, np.ndarray): 50 | raise ValueError("data must be a numpy array.") 51 | if len(data.shape) != 2 or data.shape[1] != 4: 52 | raise ValueError("Invalid dimensions for box data.") 53 | if data.dtype != np.float32 and data.dtype != np.float64: 54 | raise ValueError( 55 | "Invalid data type for box data: float is required." 56 | ) 57 | if not self._is_valid_boxes(data): 58 | raise ValueError( 59 | "Invalid box data. data must be a numpy array of " 60 | "N*[y_min, x_min, y_max, x_max]" 61 | ) 62 | self.data = {"boxes": data} 63 | 64 | def num_boxes(self): 65 | """Return number of boxes held in collections.""" 66 | return self.data["boxes"].shape[0] 67 | 68 | def get_extra_fields(self): 69 | """Return all non-box fields.""" 70 | return [k for k in self.data.keys() if k != "boxes"] 71 | 72 | def has_field(self, field): 73 | return field in self.data 74 | 75 | def add_field(self, field, field_data): 76 | """Add data to a specified field. 77 | 78 | Args: 79 | field: a string parameter used to speficy a related field to be accessed. 80 | field_data: a numpy array of [N, ...] representing the data associated 81 | with the field. 82 | Raises: 83 | ValueError: if the field is already exist or the dimension of the field 84 | data does not matches the number of boxes. 85 | """ 86 | if self.has_field(field): 87 | raise ValueError("Field " + field + "already exists") 88 | if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes(): 89 | raise ValueError("Invalid dimensions for field data") 90 | self.data[field] = field_data 91 | 92 | def get(self): 93 | """Convenience function for accesssing box coordinates. 94 | 95 | Returns: 96 | a numpy array of shape [N, 4] representing box corners 97 | """ 98 | return self.get_field("boxes") 99 | 100 | def get_field(self, field): 101 | """Accesses data associated with the specified field in the box collection. 102 | 103 | Args: 104 | field: a string parameter used to speficy a related field to be accessed. 105 | 106 | Returns: 107 | a numpy 1-d array representing data of an associated field 108 | 109 | Raises: 110 | ValueError: if invalid field 111 | """ 112 | if not self.has_field(field): 113 | raise ValueError("field {} does not exist".format(field)) 114 | return self.data[field] 115 | 116 | def get_coordinates(self): 117 | """Get corner coordinates of boxes. 
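    Example (a minimal sketch with one made-up box in
    [y_min, x_min, y_max, x_max] order):

      >>> import numpy as np
      >>> boxes = BoxList(np.array([[0.25, 0.5, 0.75, 1.0]], dtype=np.float32))
      >>> [coord.tolist() for coord in boxes.get_coordinates()]
      [[0.25], [0.5], [0.75], [1.0]]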
118 | 119 | Returns: 120 | a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max] 121 | """ 122 | box_coordinates = self.get() 123 | y_min = box_coordinates[:, 0] 124 | x_min = box_coordinates[:, 1] 125 | y_max = box_coordinates[:, 2] 126 | x_max = box_coordinates[:, 3] 127 | return [y_min, x_min, y_max, x_max] 128 | 129 | def _is_valid_boxes(self, data): 130 | """Check whether data fullfills the format of N*[ymin, xmin, ymax, xmin]. 131 | 132 | Args: 133 | data: a numpy array of shape [N, 4] representing box coordinates 134 | 135 | Returns: 136 | a boolean indicating whether all ymax of boxes are equal or greater than 137 | ymin, and all xmax of boxes are equal or greater than xmin. 138 | """ 139 | if data.shape[0] > 0: 140 | for i in range(data.shape[0]): 141 | if data[i, 0] > data[i, 2] or data[i, 1] > data[i, 3]: 142 | return False 143 | return True 144 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/np_box_mask_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Numpy BoxMaskList classes and functions.""" 17 | 18 | from __future__ import ( 19 | absolute_import, 20 | division, 21 | print_function, 22 | unicode_literals, 23 | ) 24 | import numpy as np 25 | 26 | from . import np_box_list 27 | 28 | 29 | class BoxMaskList(np_box_list.BoxList): 30 | """Convenience wrapper for BoxList with masks. 31 | 32 | BoxMaskList extends the np_box_list.BoxList to contain masks as well. 33 | In particular, its constructor receives both boxes and masks. Note that the 34 | masks correspond to the full image. 35 | """ 36 | 37 | def __init__(self, box_data, mask_data): 38 | """Constructs box collection. 39 | 40 | Args: 41 | box_data: a numpy array of shape [N, 4] representing box coordinates 42 | mask_data: a numpy array of shape [N, height, width] representing masks 43 | with values are in {0,1}. The masks correspond to the full 44 | image. The height and the width will be equal to image height and width. 45 | 46 | Raises: 47 | ValueError: if bbox data is not a numpy array 48 | ValueError: if invalid dimensions for bbox data 49 | ValueError: if mask data is not a numpy array 50 | ValueError: if invalid dimension for mask data 51 | """ 52 | super(BoxMaskList, self).__init__(box_data) 53 | if not isinstance(mask_data, np.ndarray): 54 | raise ValueError("Mask data must be a numpy array.") 55 | if len(mask_data.shape) != 3: 56 | raise ValueError("Invalid dimensions for mask data.") 57 | if mask_data.dtype != np.uint8: 58 | raise ValueError( 59 | "Invalid data type for mask data: uint8 is required." 60 | ) 61 | if mask_data.shape[0] != box_data.shape[0]: 62 | raise ValueError( 63 | "There should be the same number of boxes and masks." 
64 | ) 65 | self.data["masks"] = mask_data 66 | 67 | def get_masks(self): 68 | """Convenience function for accessing masks. 69 | 70 | Returns: 71 | a numpy array of shape [N, height, width] representing masks 72 | """ 73 | return self.get_field("masks") 74 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/np_box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, 4] numpy arrays representing bounding boxes. 17 | 18 | Example box operations that are supported: 19 | * Areas: compute bounding box areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | from __future__ import ( 23 | absolute_import, 24 | division, 25 | print_function, 26 | unicode_literals, 27 | ) 28 | import numpy as np 29 | 30 | 31 | def area(boxes): 32 | """Computes area of boxes. 33 | 34 | Args: 35 | boxes: Numpy array with shape [N, 4] holding N boxes 36 | 37 | Returns: 38 | a numpy array with shape [N*1] representing box areas 39 | """ 40 | return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) 41 | 42 | 43 | def intersection(boxes1, boxes2): 44 | """Compute pairwise intersection areas between boxes. 45 | 46 | Args: 47 | boxes1: a numpy array with shape [N, 4] holding N boxes 48 | boxes2: a numpy array with shape [M, 4] holding M boxes 49 | 50 | Returns: 51 | a numpy array with shape [N*M] representing pairwise intersection area 52 | """ 53 | [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) 54 | [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) 55 | 56 | all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) 57 | all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) 58 | intersect_heights = np.maximum( 59 | np.zeros(all_pairs_max_ymin.shape), 60 | all_pairs_min_ymax - all_pairs_max_ymin, 61 | ) 62 | all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) 63 | all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) 64 | intersect_widths = np.maximum( 65 | np.zeros(all_pairs_max_xmin.shape), 66 | all_pairs_min_xmax - all_pairs_max_xmin, 67 | ) 68 | return intersect_heights * intersect_widths 69 | 70 | 71 | def iou(boxes1, boxes2): 72 | """Computes pairwise intersection-over-union between box collections. 73 | 74 | Args: 75 | boxes1: a numpy array with shape [N, 4] holding N boxes. 76 | boxes2: a numpy array with shape [M, 4] holding N boxes. 77 | 78 | Returns: 79 | a numpy array with shape [N, M] representing pairwise iou scores. 
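    Example (a minimal sketch with made-up boxes; the second box in `boxes2`
    is deliberately non-overlapping):

      >>> import numpy as np
      >>> boxes1 = np.array([[0.0, 0.0, 1.0, 1.0]], dtype=np.float32)
      >>> boxes2 = np.array(
      ...     [[0.0, 0.0, 1.0, 0.5], [2.0, 2.0, 3.0, 3.0]], dtype=np.float32
      ... )
      >>> iou(boxes1, boxes2).tolist()
      [[0.5, 0.0]]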
80 | """ 81 | intersect = intersection(boxes1, boxes2) 82 | area1 = area(boxes1) 83 | area2 = area(boxes2) 84 | union = ( 85 | np.expand_dims(area1, axis=1) 86 | + np.expand_dims(area2, axis=0) 87 | - intersect 88 | ) 89 | return intersect / union 90 | 91 | 92 | def ioa(boxes1, boxes2): 93 | """Computes pairwise intersection-over-area between box collections. 94 | 95 | Intersection-over-area (ioa) between two boxes box1 and box2 is defined as 96 | their intersection area over box2's area. Note that ioa is not symmetric, 97 | that is, IOA(box1, box2) != IOA(box2, box1). 98 | 99 | Args: 100 | boxes1: a numpy array with shape [N, 4] holding N boxes. 101 | boxes2: a numpy array with shape [M, 4] holding N boxes. 102 | 103 | Returns: 104 | a numpy array with shape [N, M] representing pairwise ioa scores. 105 | """ 106 | intersect = intersection(boxes1, boxes2) 107 | areas = np.expand_dims(area(boxes2), axis=0) 108 | return intersect / areas 109 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/np_mask_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, height, width] numpy arrays representing masks. 17 | 18 | Example mask operations that are supported: 19 | * Areas: compute mask areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | from __future__ import ( 23 | absolute_import, 24 | division, 25 | print_function, 26 | unicode_literals, 27 | ) 28 | import numpy as np 29 | 30 | EPSILON = 1e-7 31 | 32 | 33 | def area(masks): 34 | """Computes area of masks. 35 | 36 | Args: 37 | masks: Numpy array with shape [N, height, width] holding N masks. Masks 38 | values are of type np.uint8 and values are in {0,1}. 39 | 40 | Returns: 41 | a numpy array with shape [N*1] representing mask areas. 42 | 43 | Raises: 44 | ValueError: If masks.dtype is not np.uint8 45 | """ 46 | if masks.dtype != np.uint8: 47 | raise ValueError("Masks type should be np.uint8") 48 | return np.sum(masks, axis=(1, 2), dtype=np.float32) 49 | 50 | 51 | def intersection(masks1, masks2): 52 | """Compute pairwise intersection areas between masks. 53 | 54 | Args: 55 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 56 | values are of type np.uint8 and values are in {0,1}. 57 | masks2: a numpy array with shape [M, height, width] holding M masks. Masks 58 | values are of type np.uint8 and values are in {0,1}. 59 | 60 | Returns: 61 | a numpy array with shape [N*M] representing pairwise intersection area. 62 | 63 | Raises: 64 | ValueError: If masks1 and masks2 are not of type np.uint8. 
65 | """ 66 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 67 | raise ValueError("masks1 and masks2 should be of type np.uint8") 68 | n = masks1.shape[0] 69 | m = masks2.shape[0] 70 | answer = np.zeros([n, m], dtype=np.float32) 71 | for i in np.arange(n): 72 | for j in np.arange(m): 73 | answer[i, j] = np.sum( 74 | np.minimum(masks1[i], masks2[j]), dtype=np.float32 75 | ) 76 | return answer 77 | 78 | 79 | def iou(masks1, masks2): 80 | """Computes pairwise intersection-over-union between mask collections. 81 | 82 | Args: 83 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 84 | values are of type np.uint8 and values are in {0,1}. 85 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 86 | values are of type np.uint8 and values are in {0,1}. 87 | 88 | Returns: 89 | a numpy array with shape [N, M] representing pairwise iou scores. 90 | 91 | Raises: 92 | ValueError: If masks1 and masks2 are not of type np.uint8. 93 | """ 94 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 95 | raise ValueError("masks1 and masks2 should be of type np.uint8") 96 | intersect = intersection(masks1, masks2) 97 | area1 = area(masks1) 98 | area2 = area(masks2) 99 | union = ( 100 | np.expand_dims(area1, axis=1) 101 | + np.expand_dims(area2, axis=0) 102 | - intersect 103 | ) 104 | return intersect / np.maximum(union, EPSILON) 105 | 106 | 107 | def ioa(masks1, masks2): 108 | """Computes pairwise intersection-over-area between box collections. 109 | 110 | Intersection-over-area (ioa) between two masks, mask1 and mask2 is defined as 111 | their intersection area over mask2's area. Note that ioa is not symmetric, 112 | that is, IOA(mask1, mask2) != IOA(mask2, mask1). 113 | 114 | Args: 115 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 116 | values are of type np.uint8 and values are in {0,1}. 117 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 118 | values are of type np.uint8 and values are in {0,1}. 119 | 120 | Returns: 121 | a numpy array with shape [N, M] representing pairwise ioa scores. 122 | 123 | Raises: 124 | ValueError: If masks1 and masks2 are not of type np.uint8. 125 | """ 126 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 127 | raise ValueError("masks1 and masks2 should be of type np.uint8") 128 | intersect = intersection(masks1, masks2) 129 | areas = np.expand_dims(area(masks2), axis=0) 130 | return intersect / (areas + EPSILON) 131 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/standard_fields.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Contains classes specifying naming conventions used for object detection. 
17 | 18 | 19 | Specifies: 20 | InputDataFields: standard fields used by reader/preprocessor/batcher. 21 | DetectionResultFields: standard fields returned by object detector. 22 | BoxListFields: standard field used by BoxList 23 | TfExampleFields: standard fields for tf-example data format (go/tf-example). 24 | """ 25 | 26 | 27 | from __future__ import ( 28 | absolute_import, 29 | division, 30 | print_function, 31 | unicode_literals, 32 | ) 33 | 34 | 35 | class InputDataFields(object): 36 | """Names for the input tensors. 37 | 38 | Holds the standard data field names to use for identifying input tensors. This 39 | should be used by the decoder to identify keys for the returned tensor_dict 40 | containing input tensors. And it should be used by the model to identify the 41 | tensors it needs. 42 | 43 | Attributes: 44 | image: image. 45 | original_image: image in the original input size. 46 | key: unique key corresponding to image. 47 | source_id: source of the original image. 48 | filename: original filename of the dataset (without common path). 49 | groundtruth_image_classes: image-level class labels. 50 | groundtruth_boxes: coordinates of the ground truth boxes in the image. 51 | groundtruth_classes: box-level class labels. 52 | groundtruth_label_types: box-level label types (e.g. explicit negative). 53 | groundtruth_is_crowd: [DEPRECATED, use groundtruth_group_of instead] 54 | is the groundtruth a single object or a crowd. 55 | groundtruth_area: area of a groundtruth segment. 56 | groundtruth_difficult: is a `difficult` object 57 | groundtruth_group_of: is a `group_of` objects, e.g. multiple objects of the 58 | same class, forming a connected group, where instances are heavily 59 | occluding each other. 60 | proposal_boxes: coordinates of object proposal boxes. 61 | proposal_objectness: objectness score of each proposal. 62 | groundtruth_instance_masks: ground truth instance masks. 63 | groundtruth_instance_boundaries: ground truth instance boundaries. 64 | groundtruth_instance_classes: instance mask-level class labels. 65 | groundtruth_keypoints: ground truth keypoints. 66 | groundtruth_keypoint_visibilities: ground truth keypoint visibilities. 67 | groundtruth_label_scores: groundtruth label scores. 68 | groundtruth_weights: groundtruth weight factor for bounding boxes. 69 | num_groundtruth_boxes: number of groundtruth boxes. 70 | true_image_shapes: true shapes of images in the resized images, as resized 71 | images can be padded with zeros. 
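    Example (illustrative; each attribute is simply the string key used to
    index a tensor_dict):

      >>> InputDataFields.groundtruth_boxes
      'groundtruth_boxes'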
72 | """ 73 | 74 | image = "image" 75 | original_image = "original_image" 76 | key = "key" 77 | source_id = "source_id" 78 | filename = "filename" 79 | groundtruth_image_classes = "groundtruth_image_classes" 80 | groundtruth_boxes = "groundtruth_boxes" 81 | groundtruth_classes = "groundtruth_classes" 82 | groundtruth_label_types = "groundtruth_label_types" 83 | groundtruth_is_crowd = "groundtruth_is_crowd" 84 | groundtruth_area = "groundtruth_area" 85 | groundtruth_difficult = "groundtruth_difficult" 86 | groundtruth_group_of = "groundtruth_group_of" 87 | proposal_boxes = "proposal_boxes" 88 | proposal_objectness = "proposal_objectness" 89 | groundtruth_instance_masks = "groundtruth_instance_masks" 90 | groundtruth_instance_boundaries = "groundtruth_instance_boundaries" 91 | groundtruth_instance_classes = "groundtruth_instance_classes" 92 | groundtruth_keypoints = "groundtruth_keypoints" 93 | groundtruth_keypoint_visibilities = "groundtruth_keypoint_visibilities" 94 | groundtruth_label_scores = "groundtruth_label_scores" 95 | groundtruth_weights = "groundtruth_weights" 96 | num_groundtruth_boxes = "num_groundtruth_boxes" 97 | true_image_shape = "true_image_shape" 98 | 99 | 100 | class DetectionResultFields(object): 101 | """Naming conventions for storing the output of the detector. 102 | 103 | Attributes: 104 | source_id: source of the original image. 105 | key: unique key corresponding to image. 106 | detection_boxes: coordinates of the detection boxes in the image. 107 | detection_scores: detection scores for the detection boxes in the image. 108 | detection_classes: detection-level class labels. 109 | detection_masks: contains a segmentation mask for each detection box. 110 | detection_boundaries: contains an object boundary for each detection box. 111 | detection_keypoints: contains detection keypoints for each detection box. 112 | num_detections: number of detections in the batch. 113 | """ 114 | 115 | source_id = "source_id" 116 | key = "key" 117 | detection_boxes = "detection_boxes" 118 | detection_scores = "detection_scores" 119 | detection_classes = "detection_classes" 120 | detection_masks = "detection_masks" 121 | detection_boundaries = "detection_boundaries" 122 | detection_keypoints = "detection_keypoints" 123 | num_detections = "num_detections" 124 | 125 | 126 | class BoxListFields(object): 127 | """Naming conventions for BoxLists. 128 | 129 | Attributes: 130 | boxes: bounding box coordinates. 131 | classes: classes per bounding box. 132 | scores: scores per bounding box. 133 | weights: sample weights per bounding box. 134 | objectness: objectness score per bounding box. 135 | masks: masks per bounding box. 136 | boundaries: boundaries per bounding box. 137 | keypoints: keypoints per bounding box. 138 | keypoint_heatmaps: keypoint heatmaps per bounding box. 139 | """ 140 | 141 | boxes = "boxes" 142 | classes = "classes" 143 | scores = "scores" 144 | weights = "weights" 145 | objectness = "objectness" 146 | masks = "masks" 147 | boundaries = "boundaries" 148 | keypoints = "keypoints" 149 | keypoint_heatmaps = "keypoint_heatmaps" 150 | 151 | 152 | class TfExampleFields(object): 153 | """TF-example proto feature names for object detection. 154 | 155 | Holds the standard feature names to load from an Example proto for object 156 | detection. 157 | 158 | Attributes: 159 | image_encoded: JPEG encoded string 160 | image_format: image format, e.g. "JPEG" 161 | filename: filename 162 | channels: number of channels of image 163 | colorspace: colorspace, e.g. 
"RGB" 164 | height: height of image in pixels, e.g. 462 165 | width: width of image in pixels, e.g. 581 166 | source_id: original source of the image 167 | object_class_text: labels in text format, e.g. ["person", "cat"] 168 | object_class_label: labels in numbers, e.g. [16, 8] 169 | object_bbox_xmin: xmin coordinates of groundtruth box, e.g. 10, 30 170 | object_bbox_xmax: xmax coordinates of groundtruth box, e.g. 50, 40 171 | object_bbox_ymin: ymin coordinates of groundtruth box, e.g. 40, 50 172 | object_bbox_ymax: ymax coordinates of groundtruth box, e.g. 80, 70 173 | object_view: viewpoint of object, e.g. ["frontal", "left"] 174 | object_truncated: is object truncated, e.g. [true, false] 175 | object_occluded: is object occluded, e.g. [true, false] 176 | object_difficult: is object difficult, e.g. [true, false] 177 | object_group_of: is object a single object or a group of objects 178 | object_depiction: is object a depiction 179 | object_is_crowd: [DEPRECATED, use object_group_of instead] 180 | is the object a single object or a crowd 181 | object_segment_area: the area of the segment. 182 | object_weight: a weight factor for the object's bounding box. 183 | instance_masks: instance segmentation masks. 184 | instance_boundaries: instance boundaries. 185 | instance_classes: Classes for each instance segmentation mask. 186 | detection_class_label: class label in numbers. 187 | detection_bbox_ymin: ymin coordinates of a detection box. 188 | detection_bbox_xmin: xmin coordinates of a detection box. 189 | detection_bbox_ymax: ymax coordinates of a detection box. 190 | detection_bbox_xmax: xmax coordinates of a detection box. 191 | detection_score: detection score for the class label and box. 192 | """ 193 | 194 | image_encoded = "image/encoded" 195 | image_format = "image/format" # format is reserved keyword 196 | filename = "image/filename" 197 | channels = "image/channels" 198 | colorspace = "image/colorspace" 199 | height = "image/height" 200 | width = "image/width" 201 | source_id = "image/source_id" 202 | object_class_text = "image/object/class/text" 203 | object_class_label = "image/object/class/label" 204 | object_bbox_ymin = "image/object/bbox/ymin" 205 | object_bbox_xmin = "image/object/bbox/xmin" 206 | object_bbox_ymax = "image/object/bbox/ymax" 207 | object_bbox_xmax = "image/object/bbox/xmax" 208 | object_view = "image/object/view" 209 | object_truncated = "image/object/truncated" 210 | object_occluded = "image/object/occluded" 211 | object_difficult = "image/object/difficult" 212 | object_group_of = "image/object/group_of" 213 | object_depiction = "image/object/depiction" 214 | object_is_crowd = "image/object/is_crowd" 215 | object_segment_area = "image/object/segment/area" 216 | object_weight = "image/object/weight" 217 | instance_masks = "image/segmentation/object" 218 | instance_boundaries = "image/boundaries/object" 219 | instance_classes = "image/segmentation/object/class" 220 | detection_class_label = "image/detection/label" 221 | detection_bbox_ymin = "image/detection/bbox/ymin" 222 | detection_bbox_xmin = "image/detection/bbox/xmin" 223 | detection_bbox_ymax = "image/detection/bbox/ymax" 224 | detection_bbox_xmax = "image/detection/bbox/xmax" 225 | detection_score = "image/detection/score" 226 | -------------------------------------------------------------------------------- /timesformer/utils/benchmark.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | """ 3 | Functions for benchmarks. 4 | """ 5 | 6 | import numpy as np 7 | import pprint 8 | import torch 9 | import tqdm 10 | from fvcore.common.timer import Timer 11 | 12 | import timesformer.utils.logging as logging 13 | import timesformer.utils.misc as misc 14 | from timesformer.datasets import loader 15 | from timesformer.utils.env import setup_environment 16 | 17 | logger = logging.get_logger(__name__) 18 | 19 | 20 | def benchmark_data_loading(cfg): 21 | """ 22 | Benchmark the speed of data loading in PySlowFast. 23 | Args: 24 | 25 | cfg (CfgNode): configs. Details can be found in 26 | lib/config/defaults.py 27 | """ 28 | # Set up environment. 29 | setup_environment() 30 | # Set random seed from configs. 31 | np.random.seed(cfg.RNG_SEED) 32 | torch.manual_seed(cfg.RNG_SEED) 33 | 34 | # Setup logging format. 35 | logging.setup_logging(cfg.OUTPUT_DIR) 36 | 37 | # Print config. 38 | logger.info("Benchmark data loading with config:") 39 | logger.info(pprint.pformat(cfg)) 40 | 41 | timer = Timer() 42 | dataloader = loader.construct_loader(cfg, "train") 43 | logger.info( 44 | "Initialize loader using {:.2f} seconds.".format(timer.seconds()) 45 | ) 46 | # Total batch size across different machines. 47 | batch_size = cfg.TRAIN.BATCH_SIZE * cfg.NUM_SHARDS 48 | log_period = cfg.BENCHMARK.LOG_PERIOD 49 | epoch_times = [] 50 | # Test for a few epochs. 51 | for cur_epoch in range(cfg.BENCHMARK.NUM_EPOCHS): 52 | timer = Timer() 53 | timer_epoch = Timer() 54 | iter_times = [] 55 | if cfg.BENCHMARK.SHUFFLE: 56 | loader.shuffle_dataset(dataloader, cur_epoch) 57 | for cur_iter, _ in enumerate(tqdm.tqdm(dataloader)): 58 | if cur_iter > 0 and cur_iter % log_period == 0: 59 | iter_times.append(timer.seconds()) 60 | ram_usage, ram_total = misc.cpu_mem_usage() 61 | logger.info( 62 | "Epoch {}: {} iters ({} videos) in {:.2f} seconds. " 63 | "RAM Usage: {:.2f}/{:.2f} GB.".format( 64 | cur_epoch, 65 | log_period, 66 | log_period * batch_size, 67 | iter_times[-1], 68 | ram_usage, 69 | ram_total, 70 | ) 71 | ) 72 | timer.reset() 73 | epoch_times.append(timer_epoch.seconds()) 74 | ram_usage, ram_total = misc.cpu_mem_usage() 75 | logger.info( 76 | "Epoch {}: in total {} iters ({} videos) in {:.2f} seconds. " 77 | "RAM Usage: {:.2f}/{:.2f} GB.".format( 78 | cur_epoch, 79 | len(dataloader), 80 | len(dataloader) * batch_size, 81 | epoch_times[-1], 82 | ram_usage, 83 | ram_total, 84 | ) 85 | ) 86 | logger.info( 87 | "Epoch {}: on average every {} iters ({} videos) take {:.2f}/{:.2f} " 88 | "(avg/std) seconds.".format( 89 | cur_epoch, 90 | log_period, 91 | log_period * batch_size, 92 | np.mean(iter_times), 93 | np.std(iter_times), 94 | ) 95 | ) 96 | logger.info( 97 | "On average every epoch ({} videos) takes {:.2f}/{:.2f} " 98 | "(avg/std) seconds.".format( 99 | len(dataloader) * batch_size, 100 | np.mean(epoch_times), 101 | np.std(epoch_times), 102 | ) 103 | ) 104 | -------------------------------------------------------------------------------- /timesformer/utils/bn_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """bn helper.""" 4 | 5 | import itertools 6 | import torch 7 | 8 | 9 | @torch.no_grad() 10 | def compute_and_update_bn_stats(model, data_loader, num_batches=200): 11 | """ 12 | Compute and update the batch norm stats to make it more precise. 
During 13 | training both bn stats and the weight are changing after every iteration, 14 | so the bn can not precisely reflect the latest stats of the current model. 15 | Here the bn stats is recomputed without change of weights, to make the 16 | running mean and running var more precise. 17 | Args: 18 | model (model): the model using to compute and update the bn stats. 19 | data_loader (dataloader): dataloader using to provide inputs. 20 | num_batches (int): running iterations using to compute the stats. 21 | """ 22 | 23 | # Prepares all the bn layers. 24 | bn_layers = [ 25 | m 26 | for m in model.modules() 27 | if any( 28 | ( 29 | isinstance(m, bn_type) 30 | for bn_type in ( 31 | torch.nn.BatchNorm1d, 32 | torch.nn.BatchNorm2d, 33 | torch.nn.BatchNorm3d, 34 | ) 35 | ) 36 | ) 37 | ] 38 | 39 | # In order to make the running stats only reflect the current batch, the 40 | # momentum is disabled. 41 | # bn.running_mean = (1 - momentum) * bn.running_mean + momentum * batch_mean 42 | # Setting the momentum to 1.0 to compute the stats without momentum. 43 | momentum_actual = [bn.momentum for bn in bn_layers] 44 | for bn in bn_layers: 45 | bn.momentum = 1.0 46 | 47 | # Calculates the running iterations for precise stats computation. 48 | running_mean = [torch.zeros_like(bn.running_mean) for bn in bn_layers] 49 | running_square_mean = [torch.zeros_like(bn.running_var) for bn in bn_layers] 50 | 51 | for ind, (inputs, _, _) in enumerate( 52 | itertools.islice(data_loader, num_batches) 53 | ): 54 | # Forwards the model to update the bn stats. 55 | if isinstance(inputs, (list,)): 56 | for i in range(len(inputs)): 57 | inputs[i] = inputs[i].float().cuda(non_blocking=True) 58 | else: 59 | inputs = inputs.cuda(non_blocking=True) 60 | model(inputs) 61 | 62 | for i, bn in enumerate(bn_layers): 63 | # Accumulates the bn stats. 64 | running_mean[i] += (bn.running_mean - running_mean[i]) / (ind + 1) 65 | # $E(x^2) = Var(x) + E(x)^2$. 66 | cur_square_mean = bn.running_var + bn.running_mean ** 2 67 | running_square_mean[i] += ( 68 | cur_square_mean - running_square_mean[i] 69 | ) / (ind + 1) 70 | 71 | for i, bn in enumerate(bn_layers): 72 | bn.running_mean = running_mean[i] 73 | # Var(x) = $E(x^2) - E(x)^2$. 74 | bn.running_var = running_square_mean[i] - bn.running_mean ** 2 75 | # Sets the precise bn stats. 76 | bn.momentum = momentum_actual[i] 77 | -------------------------------------------------------------------------------- /timesformer/utils/c2_model_loading.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Caffe2 to PyTorch checkpoint name converting utility.""" 4 | 5 | import re 6 | 7 | 8 | def get_name_convert_func(): 9 | """ 10 | Get the function to convert Caffe2 layer names to PyTorch layer names. 11 | Returns: 12 | (func): function to convert parameter name from Caffe2 format to PyTorch 13 | format. 
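    Example (a minimal sketch; the Caffe2 name below is chosen to match the
    patterns documented in the comments inside `pairs`):

      >>> convert = get_name_convert_func()
      >>> convert("res_conv1_bn_s")
      's1.pathway0_stem.bn.weight'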
14 | """ 15 | pairs = [ 16 | # ------------------------------------------------------------ 17 | # 'nonlocal_conv3_1_theta_w' -> 's3.pathway0_nonlocal3.conv_g.weight' 18 | [ 19 | r"^nonlocal_conv([0-9]+)_([0-9]+)_(.*)", 20 | r"s\1.pathway0_nonlocal\2_\3", 21 | ], 22 | # 'theta' -> 'conv_theta' 23 | [r"^(.*)_nonlocal([0-9]+)_(theta)(.*)", r"\1_nonlocal\2.conv_\3\4"], 24 | # 'g' -> 'conv_g' 25 | [r"^(.*)_nonlocal([0-9]+)_(g)(.*)", r"\1_nonlocal\2.conv_\3\4"], 26 | # 'phi' -> 'conv_phi' 27 | [r"^(.*)_nonlocal([0-9]+)_(phi)(.*)", r"\1_nonlocal\2.conv_\3\4"], 28 | # 'out' -> 'conv_out' 29 | [r"^(.*)_nonlocal([0-9]+)_(out)(.*)", r"\1_nonlocal\2.conv_\3\4"], 30 | # 'nonlocal_conv4_5_bn_s' -> 's4.pathway0_nonlocal3.bn.weight' 31 | [r"^(.*)_nonlocal([0-9]+)_(bn)_(.*)", r"\1_nonlocal\2.\3.\4"], 32 | # ------------------------------------------------------------ 33 | # 't_pool1_subsample_bn' -> 's1_fuse.conv_f2s.bn.running_mean' 34 | [r"^t_pool1_subsample_bn_(.*)", r"s1_fuse.bn.\1"], 35 | # 't_pool1_subsample' -> 's1_fuse.conv_f2s' 36 | [r"^t_pool1_subsample_(.*)", r"s1_fuse.conv_f2s.\1"], 37 | # 't_res4_5_branch2c_bn_subsample_bn_rm' -> 's4_fuse.conv_f2s.bias' 38 | [ 39 | r"^t_res([0-9]+)_([0-9]+)_branch2c_bn_subsample_bn_(.*)", 40 | r"s\1_fuse.bn.\3", 41 | ], 42 | # 't_pool1_subsample' -> 's1_fuse.conv_f2s' 43 | [ 44 | r"^t_res([0-9]+)_([0-9]+)_branch2c_bn_subsample_(.*)", 45 | r"s\1_fuse.conv_f2s.\3", 46 | ], 47 | # ------------------------------------------------------------ 48 | # 'res4_4_branch_2c_bn_b' -> 's4.pathway0_res4.branch2.c_bn_b' 49 | [ 50 | r"^res([0-9]+)_([0-9]+)_branch([0-9]+)([a-z])_(.*)", 51 | r"s\1.pathway0_res\2.branch\3.\4_\5", 52 | ], 53 | # 'res_conv1_bn_' -> 's1.pathway0_stem.bn.' 54 | [r"^res_conv1_bn_(.*)", r"s1.pathway0_stem.bn.\1"], 55 | # 'conv1_xy_w_momentum' -> 's1.pathway0_stem.conv_xy.' 56 | [r"^conv1_xy(.*)", r"s1.pathway0_stem.conv_xy\1"], 57 | # 'conv1_w_momentum' -> 's1.pathway0_stem.conv.' 58 | [r"^conv1_(.*)", r"s1.pathway0_stem.conv.\1"], 59 | # 'res4_0_branch1_w' -> 'S4.pathway0_res0.branch1.weight' 60 | [ 61 | r"^res([0-9]+)_([0-9]+)_branch([0-9]+)_(.*)", 62 | r"s\1.pathway0_res\2.branch\3_\4", 63 | ], 64 | # 'res_conv1_' -> 's1.pathway0_stem.conv.' 65 | [r"^res_conv1_(.*)", r"s1.pathway0_stem.conv.\1"], 66 | # ------------------------------------------------------------ 67 | # 'res4_4_branch_2c_bn_b' -> 's4.pathway0_res4.branch2.c_bn_b' 68 | [ 69 | r"^t_res([0-9]+)_([0-9]+)_branch([0-9]+)([a-z])_(.*)", 70 | r"s\1.pathway1_res\2.branch\3.\4_\5", 71 | ], 72 | # 'res_conv1_bn_' -> 's1.pathway0_stem.bn.' 73 | [r"^t_res_conv1_bn_(.*)", r"s1.pathway1_stem.bn.\1"], 74 | # 'conv1_w_momentum' -> 's1.pathway0_stem.conv.' 75 | [r"^t_conv1_(.*)", r"s1.pathway1_stem.conv.\1"], 76 | # 'res4_0_branch1_w' -> 'S4.pathway0_res0.branch1.weight' 77 | [ 78 | r"^t_res([0-9]+)_([0-9]+)_branch([0-9]+)_(.*)", 79 | r"s\1.pathway1_res\2.branch\3_\4", 80 | ], 81 | # 'res_conv1_' -> 's1.pathway0_stem.conv.' 82 | [r"^t_res_conv1_(.*)", r"s1.pathway1_stem.conv.\1"], 83 | # ------------------------------------------------------------ 84 | # pred_ -> head.projection. 85 | [r"pred_(.*)", r"head.projection.\1"], 86 | # '.b_bn_fc' -> '.se.fc' 87 | [r"(.*)b_bn_fc(.*)", r"\1se.fc\2"], 88 | # conv_5 -> head.conv_5. 89 | [r"conv_5(.*)", r"head.conv_5\1"], 90 | # conv_5 -> head.conv_5. 
91 | [r"lin_5(.*)", r"head.lin_5\1"], 92 | # '.bn_b' -> '.weight' 93 | [r"(.*)bn.b\Z", r"\1bn.bias"], 94 | # '.bn_s' -> '.weight' 95 | [r"(.*)bn.s\Z", r"\1bn.weight"], 96 | # '_bn_rm' -> '.running_mean' 97 | [r"(.*)bn.rm\Z", r"\1bn.running_mean"], 98 | # '_bn_riv' -> '.running_var' 99 | [r"(.*)bn.riv\Z", r"\1bn.running_var"], 100 | # '_b' -> '.bias' 101 | [r"(.*)[\._]b\Z", r"\1.bias"], 102 | # '_w' -> '.weight' 103 | [r"(.*)[\._]w\Z", r"\1.weight"], 104 | ] 105 | 106 | def convert_caffe2_name_to_pytorch(caffe2_layer_name): 107 | """ 108 | Convert the caffe2_layer_name to pytorch format by apply the list of 109 | regular expressions. 110 | Args: 111 | caffe2_layer_name (str): caffe2 layer name. 112 | Returns: 113 | (str): pytorch layer name. 114 | """ 115 | for source, dest in pairs: 116 | caffe2_layer_name = re.sub(source, dest, caffe2_layer_name) 117 | return caffe2_layer_name 118 | 119 | return convert_caffe2_name_to_pytorch 120 | -------------------------------------------------------------------------------- /timesformer/utils/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Distributed helpers.""" 4 | 5 | import functools 6 | import logging 7 | import pickle 8 | import torch 9 | import torch.distributed as dist 10 | 11 | _LOCAL_PROCESS_GROUP = None 12 | 13 | 14 | def all_gather(tensors): 15 | """ 16 | All gathers the provided tensors from all processes across machines. 17 | Args: 18 | tensors (list): tensors to perform all gather across all processes in 19 | all machines. 20 | """ 21 | 22 | gather_list = [] 23 | output_tensor = [] 24 | world_size = dist.get_world_size() 25 | for tensor in tensors: 26 | tensor_placeholder = [ 27 | torch.ones_like(tensor) for _ in range(world_size) 28 | ] 29 | dist.all_gather(tensor_placeholder, tensor, async_op=False) 30 | gather_list.append(tensor_placeholder) 31 | for gathered_tensor in gather_list: 32 | output_tensor.append(torch.cat(gathered_tensor, dim=0)) 33 | return output_tensor 34 | 35 | 36 | def all_reduce(tensors, average=True): 37 | """ 38 | All reduce the provided tensors from all processes across machines. 39 | Args: 40 | tensors (list): tensors to perform all reduce across all processes in 41 | all machines. 42 | average (bool): scales the reduced tensor by the number of overall 43 | processes across all machines. 44 | """ 45 | 46 | for tensor in tensors: 47 | dist.all_reduce(tensor, async_op=False) 48 | if average: 49 | world_size = dist.get_world_size() 50 | for tensor in tensors: 51 | tensor.mul_(1.0 / world_size) 52 | return tensors 53 | 54 | 55 | def init_process_group( 56 | local_rank, 57 | local_world_size, 58 | shard_id, 59 | num_shards, 60 | init_method, 61 | dist_backend="nccl", 62 | ): 63 | """ 64 | Initializes the default process group. 65 | Args: 66 | local_rank (int): the rank on the current local machine. 67 | local_world_size (int): the world size (number of processes running) on 68 | the current local machine. 69 | shard_id (int): the shard index (machine rank) of the current machine. 70 | num_shards (int): number of shards for distributed training. 71 | init_method (string): supporting three different methods for 72 | initializing process groups: 73 | "file": use shared file system to initialize the groups across 74 | different processes. 75 | "tcp": use tcp address to initialize the groups across different 76 | dist_backend (string): backend to use for distributed training. 
Options 77 | includes gloo, mpi and nccl, the details can be found here: 78 | https://pytorch.org/docs/stable/distributed.html 79 | """ 80 | # Sets the GPU to use. 81 | torch.cuda.set_device(local_rank) 82 | # Initialize the process group. 83 | proc_rank = local_rank + shard_id * local_world_size 84 | world_size = local_world_size * num_shards 85 | dist.init_process_group( 86 | backend=dist_backend, 87 | init_method=init_method, 88 | world_size=world_size, 89 | rank=proc_rank, 90 | ) 91 | 92 | 93 | def is_master_proc(num_gpus=8): 94 | """ 95 | Determines if the current process is the master process. 96 | """ 97 | if torch.distributed.is_initialized(): 98 | return dist.get_rank() % num_gpus == 0 99 | else: 100 | return True 101 | 102 | 103 | def is_root_proc(): 104 | """ 105 | Determines if the current process is the root process. 106 | """ 107 | if torch.distributed.is_initialized(): 108 | return dist.get_rank() == 0 109 | else: 110 | return True 111 | 112 | 113 | def get_world_size(): 114 | """ 115 | Get the size of the world. 116 | """ 117 | if not dist.is_available(): 118 | return 1 119 | if not dist.is_initialized(): 120 | return 1 121 | return dist.get_world_size() 122 | 123 | 124 | def get_rank(): 125 | """ 126 | Get the rank of the current process. 127 | """ 128 | if not dist.is_available(): 129 | return 0 130 | if not dist.is_initialized(): 131 | return 0 132 | return dist.get_rank() 133 | 134 | 135 | def synchronize(): 136 | """ 137 | Helper function to synchronize (barrier) among all processes when 138 | using distributed training 139 | """ 140 | if not dist.is_available(): 141 | return 142 | if not dist.is_initialized(): 143 | return 144 | world_size = dist.get_world_size() 145 | if world_size == 1: 146 | return 147 | dist.barrier() 148 | 149 | 150 | @functools.lru_cache() 151 | def _get_global_gloo_group(): 152 | """ 153 | Return a process group based on gloo backend, containing all the ranks 154 | The result is cached. 155 | Returns: 156 | (group): pytorch dist group. 157 | """ 158 | if dist.get_backend() == "nccl": 159 | return dist.new_group(backend="gloo") 160 | else: 161 | return dist.group.WORLD 162 | 163 | 164 | def _serialize_to_tensor(data, group): 165 | """ 166 | Seriialize the tensor to ByteTensor. Note that only `gloo` and `nccl` 167 | backend is supported. 168 | Args: 169 | data (data): data to be serialized. 170 | group (group): pytorch dist group. 171 | Returns: 172 | tensor (ByteTensor): tensor that serialized. 173 | """ 174 | 175 | backend = dist.get_backend(group) 176 | assert backend in ["gloo", "nccl"] 177 | device = torch.device("cpu" if backend == "gloo" else "cuda") 178 | 179 | buffer = pickle.dumps(data) 180 | if len(buffer) > 1024 ** 3: 181 | logger = logging.getLogger(__name__) 182 | logger.warning( 183 | "Rank {} trying to all-gather {:.2f} GB of data on device {}".format( 184 | get_rank(), len(buffer) / (1024 ** 3), device 185 | ) 186 | ) 187 | storage = torch.ByteStorage.from_buffer(buffer) 188 | tensor = torch.ByteTensor(storage).to(device=device) 189 | return tensor 190 | 191 | 192 | def _pad_to_largest_tensor(tensor, group): 193 | """ 194 | Padding all the tensors from different GPUs to the largest ones. 195 | Args: 196 | tensor (tensor): tensor to pad. 197 | group (group): pytorch dist group. 
198 | Returns: 199 | list[int]: size of the tensor, on each rank 200 | Tensor: padded tensor that has the max size 201 | """ 202 | world_size = dist.get_world_size(group=group) 203 | assert ( 204 | world_size >= 1 205 | ), "comm.gather/all_gather must be called from ranks within the given group!" 206 | local_size = torch.tensor( 207 | [tensor.numel()], dtype=torch.int64, device=tensor.device 208 | ) 209 | size_list = [ 210 | torch.zeros([1], dtype=torch.int64, device=tensor.device) 211 | for _ in range(world_size) 212 | ] 213 | dist.all_gather(size_list, local_size, group=group) 214 | size_list = [int(size.item()) for size in size_list] 215 | 216 | max_size = max(size_list) 217 | 218 | # we pad the tensor because torch all_gather does not support 219 | # gathering tensors of different shapes 220 | if local_size != max_size: 221 | padding = torch.zeros( 222 | (max_size - local_size,), dtype=torch.uint8, device=tensor.device 223 | ) 224 | tensor = torch.cat((tensor, padding), dim=0) 225 | return size_list, tensor 226 | 227 | 228 | def all_gather_unaligned(data, group=None): 229 | """ 230 | Run all_gather on arbitrary picklable data (not necessarily tensors). 231 | 232 | Args: 233 | data: any picklable object 234 | group: a torch process group. By default, will use a group which 235 | contains all ranks on gloo backend. 236 | 237 | Returns: 238 | list[data]: list of data gathered from each rank 239 | """ 240 | if get_world_size() == 1: 241 | return [data] 242 | if group is None: 243 | group = _get_global_gloo_group() 244 | if dist.get_world_size(group) == 1: 245 | return [data] 246 | 247 | tensor = _serialize_to_tensor(data, group) 248 | 249 | size_list, tensor = _pad_to_largest_tensor(tensor, group) 250 | max_size = max(size_list) 251 | 252 | # receiving Tensor from all ranks 253 | tensor_list = [ 254 | torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) 255 | for _ in size_list 256 | ] 257 | dist.all_gather(tensor_list, tensor, group=group) 258 | 259 | data_list = [] 260 | for size, tensor in zip(size_list, tensor_list): 261 | buffer = tensor.cpu().numpy().tobytes()[:size] 262 | data_list.append(pickle.loads(buffer)) 263 | 264 | return data_list 265 | 266 | 267 | def init_distributed_training(cfg): 268 | """ 269 | Initialize variables needed for distributed training. 270 | """ 271 | if cfg.NUM_GPUS <= 1: 272 | return 273 | num_gpus_per_machine = cfg.NUM_GPUS 274 | num_machines = dist.get_world_size() // num_gpus_per_machine 275 | for i in range(num_machines): 276 | ranks_on_i = list( 277 | range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine) 278 | ) 279 | pg = dist.new_group(ranks_on_i) 280 | if i == cfg.SHARD_ID: 281 | global _LOCAL_PROCESS_GROUP 282 | _LOCAL_PROCESS_GROUP = pg 283 | 284 | 285 | def get_local_size() -> int: 286 | """ 287 | Returns: 288 | The size of the per-machine process group, 289 | i.e. the number of processes per machine. 290 | """ 291 | if not dist.is_available(): 292 | return 1 293 | if not dist.is_initialized(): 294 | return 1 295 | return dist.get_world_size(group=_LOCAL_PROCESS_GROUP) 296 | 297 | 298 | def get_local_rank() -> int: 299 | """ 300 | Returns: 301 | The rank of the current process within the local (per-machine) process group. 
302 | """ 303 | if not dist.is_available(): 304 | return 0 305 | if not dist.is_initialized(): 306 | return 0 307 | assert _LOCAL_PROCESS_GROUP is not None 308 | return dist.get_rank(group=_LOCAL_PROCESS_GROUP) 309 | -------------------------------------------------------------------------------- /timesformer/utils/env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Set up Environment.""" 4 | 5 | import timesformer.utils.logging as logging 6 | 7 | _ENV_SETUP_DONE = False 8 | 9 | 10 | def setup_environment(): 11 | global _ENV_SETUP_DONE 12 | if _ENV_SETUP_DONE: 13 | return 14 | _ENV_SETUP_DONE = True 15 | -------------------------------------------------------------------------------- /timesformer/utils/logging.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Logging.""" 4 | 5 | import atexit 6 | import builtins 7 | import decimal 8 | import functools 9 | import logging 10 | import os 11 | import sys 12 | import simplejson 13 | from fvcore.common.file_io import PathManager 14 | 15 | import timesformer.utils.distributed as du 16 | 17 | 18 | def _suppress_print(): 19 | """ 20 | Suppresses printing from the current process. 21 | """ 22 | 23 | def print_pass(*objects, sep=" ", end="\n", file=sys.stdout, flush=False): 24 | pass 25 | 26 | builtins.print = print_pass 27 | 28 | 29 | @functools.lru_cache(maxsize=None) 30 | def _cached_log_stream(filename): 31 | io = PathManager.open(filename, "a", buffering=1024) 32 | atexit.register(io.close) 33 | return io 34 | 35 | 36 | def setup_logging(output_dir=None): 37 | """ 38 | Sets up the logging for multiple processes. Only enable the logging for the 39 | master process, and suppress logging for the non-master processes. 40 | """ 41 | # Set up logging format. 42 | _FORMAT = "[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s" 43 | 44 | if du.is_master_proc(): 45 | # Enable logging for the master process. 46 | logging.root.handlers = [] 47 | else: 48 | # Suppress logging for non-master processes. 49 | _suppress_print() 50 | 51 | logger = logging.getLogger() 52 | logger.setLevel(logging.DEBUG) 53 | logger.propagate = False 54 | plain_formatter = logging.Formatter( 55 | "[%(asctime)s][%(levelname)s] %(filename)s: %(lineno)3d: %(message)s", 56 | datefmt="%m/%d %H:%M:%S", 57 | ) 58 | 59 | if du.is_master_proc(): 60 | ch = logging.StreamHandler(stream=sys.stdout) 61 | ch.setLevel(logging.DEBUG) 62 | ch.setFormatter(plain_formatter) 63 | logger.addHandler(ch) 64 | 65 | if output_dir is not None and du.is_master_proc(du.get_world_size()): 66 | filename = os.path.join(output_dir, "stdout.log") 67 | fh = logging.StreamHandler(_cached_log_stream(filename)) 68 | fh.setLevel(logging.DEBUG) 69 | fh.setFormatter(plain_formatter) 70 | logger.addHandler(fh) 71 | 72 | 73 | def get_logger(name): 74 | """ 75 | Retrieve the logger with the specified name or, if name is None, return a 76 | logger which is the root logger of the hierarchy. 77 | Args: 78 | name (string): name of the logger. 79 | """ 80 | return logging.getLogger(name) 81 | 82 | 83 | def log_json_stats(stats): 84 | """ 85 | Logs json stats. 86 | Args: 87 | stats (dict): a dictionary of statistical information to log. 
88 | """ 89 | stats = { 90 | k: decimal.Decimal("{:.5f}".format(v)) if isinstance(v, float) else v 91 | for k, v in stats.items() 92 | } 93 | json_stats = simplejson.dumps(stats, sort_keys=True, use_decimal=True) 94 | logger = get_logger(__name__) 95 | logger.info("json_stats: {:s}".format(json_stats)) 96 | -------------------------------------------------------------------------------- /timesformer/utils/lr_policy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Learning rate policy.""" 4 | 5 | import math 6 | 7 | 8 | def get_lr_at_epoch(cfg, cur_epoch): 9 | """ 10 | Retrieve the learning rate of the current epoch with the option to perform 11 | warm up in the beginning of the training stage. 12 | Args: 13 | cfg (CfgNode): configs. Details can be found in 14 | slowfast/config/defaults.py 15 | cur_epoch (float): the number of epoch of the current training stage. 16 | """ 17 | lr = get_lr_func(cfg.SOLVER.LR_POLICY)(cfg, cur_epoch) 18 | # Perform warm up. 19 | if cur_epoch < cfg.SOLVER.WARMUP_EPOCHS: 20 | lr_start = cfg.SOLVER.WARMUP_START_LR 21 | lr_end = get_lr_func(cfg.SOLVER.LR_POLICY)( 22 | cfg, cfg.SOLVER.WARMUP_EPOCHS 23 | ) 24 | alpha = (lr_end - lr_start) / cfg.SOLVER.WARMUP_EPOCHS 25 | lr = cur_epoch * alpha + lr_start 26 | return lr 27 | 28 | 29 | def lr_func_cosine(cfg, cur_epoch): 30 | """ 31 | Retrieve the learning rate to specified values at specified epoch with the 32 | cosine learning rate schedule. Details can be found in: 33 | Ilya Loshchilov, and Frank Hutter 34 | SGDR: Stochastic Gradient Descent With Warm Restarts. 35 | Args: 36 | cfg (CfgNode): configs. Details can be found in 37 | slowfast/config/defaults.py 38 | cur_epoch (float): the number of epoch of the current training stage. 39 | """ 40 | assert cfg.SOLVER.COSINE_END_LR < cfg.SOLVER.BASE_LR 41 | return ( 42 | cfg.SOLVER.COSINE_END_LR 43 | + (cfg.SOLVER.BASE_LR - cfg.SOLVER.COSINE_END_LR) 44 | * (math.cos(math.pi * cur_epoch / cfg.SOLVER.MAX_EPOCH) + 1.0) 45 | * 0.5 46 | ) 47 | 48 | 49 | def lr_func_steps_with_relative_lrs(cfg, cur_epoch): 50 | """ 51 | Retrieve the learning rate to specified values at specified epoch with the 52 | steps with relative learning rate schedule. 53 | Args: 54 | cfg (CfgNode): configs. Details can be found in 55 | slowfast/config/defaults.py 56 | cur_epoch (float): the number of epoch of the current training stage. 57 | """ 58 | ind = get_step_index(cfg, cur_epoch) 59 | return cfg.SOLVER.LRS[ind] * cfg.SOLVER.BASE_LR 60 | 61 | 62 | def get_step_index(cfg, cur_epoch): 63 | """ 64 | Retrieves the lr step index for the given epoch. 65 | Args: 66 | cfg (CfgNode): configs. Details can be found in 67 | slowfast/config/defaults.py 68 | cur_epoch (float): the number of epoch of the current training stage. 69 | """ 70 | steps = cfg.SOLVER.STEPS + [cfg.SOLVER.MAX_EPOCH] 71 | for ind, step in enumerate(steps): # NoQA 72 | if cur_epoch < step: 73 | break 74 | return ind - 1 75 | 76 | 77 | def get_lr_func(lr_policy): 78 | """ 79 | Given the configs, retrieve the specified lr policy function. 80 | Args: 81 | lr_policy (string): the learning rate policy to use for the job. 
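
    For example, lr_policy="cosine" resolves to the lr_func_cosine function
    defined above: the lookup below simply prepends "lr_func_" to the policy
    name and searches this module's globals for a matching function.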
82 |     """
83 |     policy = "lr_func_" + lr_policy
84 |     if policy not in globals():
85 |         raise NotImplementedError("Unknown LR policy: {}".format(lr_policy))
86 |     else:
87 |         return globals()[policy]
88 | 
--------------------------------------------------------------------------------
/timesformer/utils/metrics.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 | 
3 | """Functions for computing metrics."""
4 | 
5 | import torch
6 | import numpy as np
7 | 
8 | def topks_correct(preds, labels, ks):
9 |     """
10 |     Given the predictions, labels, and a list of top-k values, compute the
11 |     number of correct predictions for each top-k value.
12 | 
13 |     Args:
14 |         preds (array): array of predictions. Dimension is batchsize
15 |             N x ClassNum.
16 |         labels (array): array of labels. Dimension is batchsize N.
17 |         ks (list): list of top-k values. For example, ks = [1, 5] corresponds
18 |             to top-1 and top-5.
19 | 
20 |     Returns:
21 |         topks_correct (list): list of numbers, where the `i`-th entry
22 |             corresponds to the number of top-`ks[i]` correct predictions.
23 |     """
24 |     assert preds.size(0) == labels.size(
25 |         0
26 |     ), "Batch dim of predictions and labels must match"
27 |     # Find the top max_k predictions for each sample
28 |     _top_max_k_vals, top_max_k_inds = torch.topk(
29 |         preds, max(ks), dim=1, largest=True, sorted=True
30 |     )
31 |     # (batch_size, max_k) -> (max_k, batch_size).
32 |     top_max_k_inds = top_max_k_inds.t()
33 |     # (batch_size, ) -> (max_k, batch_size).
34 |     rep_max_k_labels = labels.view(1, -1).expand_as(top_max_k_inds)
35 |     # (i, j) = 1 if top i-th prediction for the j-th sample is correct.
36 |     top_max_k_correct = top_max_k_inds.eq(rep_max_k_labels)
37 |     # Compute the number of topk correct predictions for each k.
38 |     topks_correct = [top_max_k_correct[:k, :].float().sum() for k in ks]
39 |     return topks_correct
40 | 
41 | 
42 | def topk_errors(preds, labels, ks):
43 |     """
44 |     Computes the top-k error for each k.
45 |     Args:
46 |         preds (array): array of predictions. Dimension is N.
47 |         labels (array): array of labels. Dimension is N.
48 |         ks (list): list of ks to calculate the top accuracies.
49 |     """
50 |     num_topks_correct = topks_correct(preds, labels, ks)
51 |     return [(1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct]
52 | 
53 | 
54 | def topk_accuracies(preds, labels, ks):
55 |     """
56 |     Computes the top-k accuracy for each k.
57 |     Args:
58 |         preds (array): array of predictions. Dimension is N.
59 |         labels (array): array of labels. Dimension is N.
60 |         ks (list): list of ks to calculate the top accuracies.
61 |     """
62 |     num_topks_correct = topks_correct(preds, labels, ks)
63 |     return [(x / preds.size(0)) * 100.0 for x in num_topks_correct]
64 | 
65 | def multitask_topks_correct(preds, labels, ks=(1,)):
66 |     """
67 |     Args:
68 |         preds: tuple(torch.FloatTensor), each tensor should be of shape
69 |             [batch_size, class_count], class_count can vary on a per task basis, i.e.
70 |             outputs[i].shape[1] can be different from outputs[j].shape[1].
71 |         labels: tuple(torch.LongTensor), each tensor should be of shape [batch_size]
72 |         ks: tuple(int), compute accuracy at top-k for the values of k specified
73 |             in this parameter.
74 |     Returns:
75 |         list(float), same length as ks, where the i-th entry is the number of
            samples whose label is within the top-ks[i] predictions for every task.
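
    Illustrative example (hypothetical values, not from the repository): with
    two tasks and ks=(1,), a sample counts toward the top-1 total only if the
    label of every task is that task's highest-scoring prediction; if task 0
    is correct but task 1's label is only ranked second, the sample contributes
    nothing.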
76 | """ 77 | max_k = int(np.max(ks)) 78 | task_count = len(preds) 79 | batch_size = labels[0].size(0) 80 | all_correct = torch.zeros(max_k, batch_size).type(torch.ByteTensor) 81 | if torch.cuda.is_available(): 82 | all_correct = all_correct.cuda() 83 | for output, label in zip(preds, labels): 84 | _, max_k_idx = output.topk(max_k, dim=1, largest=True, sorted=True) 85 | # Flip batch_size, class_count as .view doesn't work on non-contiguous 86 | max_k_idx = max_k_idx.t() 87 | correct_for_task = max_k_idx.eq(label.view(1, -1).expand_as(max_k_idx)) 88 | all_correct.add_(correct_for_task) 89 | 90 | multitask_topks_correct = [ 91 | torch.ge(all_correct[:k].float().sum(0), task_count).float().sum(0) for k in ks 92 | ] 93 | 94 | return multitask_topks_correct 95 | -------------------------------------------------------------------------------- /timesformer/utils/multigrid.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Helper functions for multigrid training.""" 4 | 5 | import numpy as np 6 | 7 | import timesformer.utils.logging as logging 8 | 9 | logger = logging.get_logger(__name__) 10 | 11 | 12 | class MultigridSchedule(object): 13 | """ 14 | This class defines multigrid training schedule and update cfg accordingly. 15 | """ 16 | 17 | def init_multigrid(self, cfg): 18 | """ 19 | Update cfg based on multigrid settings. 20 | Args: 21 | cfg (configs): configs that contains training and multigrid specific 22 | hyperparameters. Details can be seen in 23 | slowfast/config/defaults.py. 24 | Returns: 25 | cfg (configs): the updated cfg. 26 | """ 27 | self.schedule = None 28 | # We may modify cfg.TRAIN.BATCH_SIZE, cfg.DATA.NUM_FRAMES, and 29 | # cfg.DATA.TRAIN_CROP_SIZE during training, so we store their original 30 | # value in cfg and use them as global variables. 31 | cfg.MULTIGRID.DEFAULT_B = cfg.TRAIN.BATCH_SIZE 32 | cfg.MULTIGRID.DEFAULT_T = cfg.DATA.NUM_FRAMES 33 | cfg.MULTIGRID.DEFAULT_S = cfg.DATA.TRAIN_CROP_SIZE 34 | 35 | if cfg.MULTIGRID.LONG_CYCLE: 36 | self.schedule = self.get_long_cycle_schedule(cfg) 37 | cfg.SOLVER.STEPS = [0] + [s[-1] for s in self.schedule] 38 | # Fine-tuning phase. 39 | cfg.SOLVER.STEPS[-1] = ( 40 | cfg.SOLVER.STEPS[-2] + cfg.SOLVER.STEPS[-1] 41 | ) // 2 42 | cfg.SOLVER.LRS = [ 43 | cfg.SOLVER.GAMMA ** s[0] * s[1][0] for s in self.schedule 44 | ] 45 | # Fine-tuning phase. 46 | cfg.SOLVER.LRS = cfg.SOLVER.LRS[:-1] + [ 47 | cfg.SOLVER.LRS[-2], 48 | cfg.SOLVER.LRS[-1], 49 | ] 50 | 51 | cfg.SOLVER.MAX_EPOCH = self.schedule[-1][-1] 52 | 53 | elif cfg.MULTIGRID.SHORT_CYCLE: 54 | cfg.SOLVER.STEPS = [ 55 | int(s * cfg.MULTIGRID.EPOCH_FACTOR) for s in cfg.SOLVER.STEPS 56 | ] 57 | cfg.SOLVER.MAX_EPOCH = int( 58 | cfg.SOLVER.MAX_EPOCH * cfg.MULTIGRID.EPOCH_FACTOR 59 | ) 60 | return cfg 61 | 62 | def update_long_cycle(self, cfg, cur_epoch): 63 | """ 64 | Before every epoch, check if long cycle shape should change. If it 65 | should, update cfg accordingly. 66 | Args: 67 | cfg (configs): configs that contains training and multigrid specific 68 | hyperparameters. Details can be seen in 69 | slowfast/config/defaults.py. 70 | cur_epoch (int): current epoch index. 71 | Returns: 72 | cfg (configs): the updated cfg. 73 | changed (bool): do we change long cycle shape at this epoch? 
74 | """ 75 | base_b, base_t, base_s = get_current_long_cycle_shape( 76 | self.schedule, cur_epoch 77 | ) 78 | if base_s != cfg.DATA.TRAIN_CROP_SIZE or base_t != cfg.DATA.NUM_FRAMES: 79 | 80 | cfg.DATA.NUM_FRAMES = base_t 81 | cfg.DATA.TRAIN_CROP_SIZE = base_s 82 | cfg.TRAIN.BATCH_SIZE = base_b * cfg.MULTIGRID.DEFAULT_B 83 | 84 | bs_factor = ( 85 | float(cfg.TRAIN.BATCH_SIZE / cfg.NUM_GPUS) 86 | / cfg.MULTIGRID.BN_BASE_SIZE 87 | ) 88 | 89 | if bs_factor < 1: 90 | cfg.BN.NORM_TYPE = "sync_batchnorm" 91 | cfg.BN.NUM_SYNC_DEVICES = int(1.0 / bs_factor) 92 | elif bs_factor > 1: 93 | cfg.BN.NORM_TYPE = "sub_batchnorm" 94 | cfg.BN.NUM_SPLITS = int(bs_factor) 95 | else: 96 | cfg.BN.NORM_TYPE = "batchnorm" 97 | 98 | cfg.MULTIGRID.LONG_CYCLE_SAMPLING_RATE = cfg.DATA.SAMPLING_RATE * ( 99 | cfg.MULTIGRID.DEFAULT_T // cfg.DATA.NUM_FRAMES 100 | ) 101 | logger.info("Long cycle updates:") 102 | logger.info("\tBN.NORM_TYPE: {}".format(cfg.BN.NORM_TYPE)) 103 | if cfg.BN.NORM_TYPE == "sync_batchnorm": 104 | logger.info( 105 | "\tBN.NUM_SYNC_DEVICES: {}".format(cfg.BN.NUM_SYNC_DEVICES) 106 | ) 107 | elif cfg.BN.NORM_TYPE == "sub_batchnorm": 108 | logger.info("\tBN.NUM_SPLITS: {}".format(cfg.BN.NUM_SPLITS)) 109 | logger.info("\tTRAIN.BATCH_SIZE: {}".format(cfg.TRAIN.BATCH_SIZE)) 110 | logger.info( 111 | "\tDATA.NUM_FRAMES x LONG_CYCLE_SAMPLING_RATE: {}x{}".format( 112 | cfg.DATA.NUM_FRAMES, cfg.MULTIGRID.LONG_CYCLE_SAMPLING_RATE 113 | ) 114 | ) 115 | logger.info( 116 | "\tDATA.TRAIN_CROP_SIZE: {}".format(cfg.DATA.TRAIN_CROP_SIZE) 117 | ) 118 | return cfg, True 119 | else: 120 | return cfg, False 121 | 122 | def get_long_cycle_schedule(self, cfg): 123 | """ 124 | Based on multigrid hyperparameters, define the schedule of a long cycle. 125 | Args: 126 | cfg (configs): configs that contains training and multigrid specific 127 | hyperparameters. Details can be seen in 128 | slowfast/config/defaults.py. 129 | Returns: 130 | schedule (list): Specifies a list long cycle base shapes and their 131 | corresponding training epochs. 132 | """ 133 | 134 | steps = cfg.SOLVER.STEPS 135 | 136 | default_size = float( 137 | cfg.DATA.NUM_FRAMES * cfg.DATA.TRAIN_CROP_SIZE ** 2 138 | ) 139 | default_iters = steps[-1] 140 | 141 | # Get shapes and average batch size for each long cycle shape. 142 | avg_bs = [] 143 | all_shapes = [] 144 | for t_factor, s_factor in cfg.MULTIGRID.LONG_CYCLE_FACTORS: 145 | base_t = int(round(cfg.DATA.NUM_FRAMES * t_factor)) 146 | base_s = int(round(cfg.DATA.TRAIN_CROP_SIZE * s_factor)) 147 | if cfg.MULTIGRID.SHORT_CYCLE: 148 | shapes = [ 149 | [ 150 | base_t, 151 | cfg.MULTIGRID.DEFAULT_S 152 | * cfg.MULTIGRID.SHORT_CYCLE_FACTORS[0], 153 | ], 154 | [ 155 | base_t, 156 | cfg.MULTIGRID.DEFAULT_S 157 | * cfg.MULTIGRID.SHORT_CYCLE_FACTORS[1], 158 | ], 159 | [base_t, base_s], 160 | ] 161 | else: 162 | shapes = [[base_t, base_s]] 163 | 164 | # (T, S) -> (B, T, S) 165 | shapes = [ 166 | [int(round(default_size / (s[0] * s[1] * s[1]))), s[0], s[1]] 167 | for s in shapes 168 | ] 169 | avg_bs.append(np.mean([s[0] for s in shapes])) 170 | all_shapes.append(shapes) 171 | 172 | # Get schedule regardless of cfg.MULTIGRID.EPOCH_FACTOR. 
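        # Within each LR step, every long-cycle shape is allotted epochs in
        # proportion to its average batch size, so cur_epochs / avg_bs (the
        # iteration count) comes out the same for every shape: the shapes split
        # each step's iterations roughly evenly.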
173 | total_iters = 0 174 | schedule = [] 175 | for step_index in range(len(steps) - 1): 176 | step_epochs = steps[step_index + 1] - steps[step_index] 177 | 178 | for long_cycle_index, shapes in enumerate(all_shapes): 179 | cur_epochs = ( 180 | step_epochs * avg_bs[long_cycle_index] / sum(avg_bs) 181 | ) 182 | 183 | cur_iters = cur_epochs / avg_bs[long_cycle_index] 184 | total_iters += cur_iters 185 | schedule.append((step_index, shapes[-1], cur_epochs)) 186 | 187 | iter_saving = default_iters / total_iters 188 | 189 | final_step_epochs = cfg.SOLVER.MAX_EPOCH - steps[-1] 190 | 191 | # We define the fine-tuning phase to have the same amount of iteration 192 | # saving as the rest of the training. 193 | ft_epochs = final_step_epochs / iter_saving * avg_bs[-1] 194 | 195 | schedule.append((step_index + 1, all_shapes[-1][2], ft_epochs)) 196 | 197 | # Obtrain final schedule given desired cfg.MULTIGRID.EPOCH_FACTOR. 198 | x = ( 199 | cfg.SOLVER.MAX_EPOCH 200 | * cfg.MULTIGRID.EPOCH_FACTOR 201 | / sum(s[-1] for s in schedule) 202 | ) 203 | 204 | final_schedule = [] 205 | total_epochs = 0 206 | for s in schedule: 207 | epochs = s[2] * x 208 | total_epochs += epochs 209 | final_schedule.append((s[0], s[1], int(round(total_epochs)))) 210 | print_schedule(final_schedule) 211 | return final_schedule 212 | 213 | 214 | def print_schedule(schedule): 215 | """ 216 | Log schedule. 217 | """ 218 | logger.info("Long cycle index\tBase shape\tEpochs") 219 | for s in schedule: 220 | logger.info("{}\t{}\t{}".format(s[0], s[1], s[2])) 221 | 222 | 223 | def get_current_long_cycle_shape(schedule, epoch): 224 | """ 225 | Given a schedule and epoch index, return the long cycle base shape. 226 | Args: 227 | schedule (configs): configs that contains training and multigrid specific 228 | hyperparameters. Details can be seen in 229 | slowfast/config/defaults.py. 230 | cur_epoch (int): current epoch index. 231 | Returns: 232 | shapes (list): A list describing the base shape in a long cycle: 233 | [batch size relative to default, 234 | number of frames, spatial dimension]. 235 | """ 236 | for s in schedule: 237 | if epoch < s[-1]: 238 | return s[1] 239 | return schedule[-1][1] 240 | -------------------------------------------------------------------------------- /timesformer/utils/multiprocessing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Multiprocessing helpers.""" 4 | 5 | import torch 6 | 7 | 8 | def run( 9 | local_rank, 10 | num_proc, 11 | func, 12 | init_method, 13 | shard_id, 14 | num_shards, 15 | backend, 16 | cfg, 17 | output_queue=None, 18 | ): 19 | """ 20 | Runs a function from a child process. 21 | Args: 22 | local_rank (int): rank of the current process on the current machine. 23 | num_proc (int): number of processes per machine. 24 | func (function): function to execute on each of the process. 25 | init_method (string): method to initialize the distributed training. 26 | TCP initialization: equiring a network address reachable from all 27 | processes followed by the port. 28 | Shared file-system initialization: makes use of a file system that 29 | is shared and visible from all machines. The URL should start with 30 | file:// and contain a path to a non-existent file on a shared file 31 | system. 32 | shard_id (int): the rank of the current machine. 33 | num_shards (int): number of overall machines for the distributed 34 | training job. 
35 | backend (string): three distributed backends ('nccl', 'gloo', 'mpi') are 36 | supports, each with different capabilities. Details can be found 37 | here: 38 | https://pytorch.org/docs/stable/distributed.html 39 | cfg (CfgNode): configs. Details can be found in 40 | slowfast/config/defaults.py 41 | output_queue (queue): can optionally be used to return values from the 42 | master process. 43 | """ 44 | # Initialize the process group. 45 | world_size = num_proc * num_shards 46 | rank = shard_id * num_proc + local_rank 47 | 48 | try: 49 | torch.distributed.init_process_group( 50 | backend=backend, 51 | init_method=init_method, 52 | world_size=world_size, 53 | rank=rank, 54 | ) 55 | except Exception as e: 56 | raise e 57 | 58 | torch.cuda.set_device(local_rank) 59 | ret = func(cfg) 60 | if output_queue is not None and local_rank == 0: 61 | output_queue.put(ret) 62 | -------------------------------------------------------------------------------- /timesformer/utils/parser.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Argument parser functions.""" 4 | 5 | import argparse 6 | import sys 7 | 8 | import timesformer.utils.checkpoint as cu 9 | from timesformer.config.defaults import get_cfg 10 | 11 | 12 | def parse_args(): 13 | """ 14 | Parse the following arguments for a default parser for PySlowFast users. 15 | Args: 16 | shard_id (int): shard id for the current machine. Starts from 0 to 17 | num_shards - 1. If single machine is used, then set shard id to 0. 18 | num_shards (int): number of shards using by the job. 19 | init_method (str): initialization method to launch the job with multiple 20 | devices. Options includes TCP or shared file-system for 21 | initialization. details can be find in 22 | https://pytorch.org/docs/stable/distributed.html#tcp-initialization 23 | cfg (str): path to the config file. 24 | opts (argument): provide addtional options from the command line, it 25 | overwrites the config loaded from file. 26 | """ 27 | parser = argparse.ArgumentParser( 28 | description="Provide SlowFast video training and testing pipeline." 29 | ) 30 | parser.add_argument( 31 | "--shard_id", 32 | help="The shard id of current node, Starts from 0 to num_shards - 1", 33 | default=0, 34 | type=int, 35 | ) 36 | parser.add_argument( 37 | "--num_shards", 38 | help="Number of shards using by the job", 39 | default=1, 40 | type=int, 41 | ) 42 | parser.add_argument( 43 | "--init_method", 44 | help="Initialization method, includes TCP or shared file-system", 45 | default="tcp://localhost:9999", 46 | type=str, 47 | ) 48 | parser.add_argument( 49 | "--cfg", 50 | dest="cfg_file", 51 | help="Path to the config file", 52 | default="configs/Kinetics/SLOWFAST_4x16_R50.yaml", 53 | type=str, 54 | ) 55 | parser.add_argument( 56 | "opts", 57 | help="See slowfast/config/defaults.py for all options", 58 | default=None, 59 | nargs=argparse.REMAINDER, 60 | ) 61 | if len(sys.argv) == 1: 62 | parser.print_help() 63 | return parser.parse_args() 64 | 65 | 66 | def load_config(args): 67 | """ 68 | Given the arguemnts, load and initialize the configs. 69 | Args: 70 | args (argument): arguments includes `shard_id`, `num_shards`, 71 | `init_method`, `cfg_file`, and `opts`. 72 | """ 73 | # Setup cfg. 74 | cfg = get_cfg() 75 | # Load config from cfg. 76 | if args.cfg_file is not None: 77 | cfg.merge_from_file(args.cfg_file) 78 | # Load config from command line, overwrite config from opts. 
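    # `opts` is a flat list of KEY VALUE pairs (yacs `merge_from_list` format);
    # e.g. a command line ending in `TRAIN.ENABLE False TEST.ENABLE True`
    # flips those two settings without editing the YAML config file.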
79 | if args.opts is not None: 80 | cfg.merge_from_list(args.opts) 81 | 82 | # Inherit parameters from args. 83 | if hasattr(args, "num_shards") and hasattr(args, "shard_id"): 84 | cfg.NUM_SHARDS = args.num_shards 85 | cfg.SHARD_ID = args.shard_id 86 | if hasattr(args, "rng_seed"): 87 | cfg.RNG_SEED = args.rng_seed 88 | if hasattr(args, "output_dir"): 89 | cfg.OUTPUT_DIR = args.output_dir 90 | 91 | # Create the checkpoint dir. 92 | cu.make_checkpoint_dir(cfg.OUTPUT_DIR) 93 | return cfg 94 | -------------------------------------------------------------------------------- /timesformer/utils/weight_init_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Utility function for weight initialization""" 4 | 5 | import torch.nn as nn 6 | from fvcore.nn.weight_init import c2_msra_fill 7 | 8 | 9 | def init_weights(model, fc_init_std=0.01, zero_init_final_bn=True): 10 | """ 11 | Performs ResNet style weight initialization. 12 | Args: 13 | fc_init_std (float): the expected standard deviation for fc layer. 14 | zero_init_final_bn (bool): if True, zero initialize the final bn for 15 | every bottleneck. 16 | """ 17 | for m in model.modules(): 18 | if isinstance(m, nn.Conv3d): 19 | """ 20 | Follow the initialization method proposed in: 21 | {He, Kaiming, et al. 22 | "Delving deep into rectifiers: Surpassing human-level 23 | performance on imagenet classification." 24 | arXiv preprint arXiv:1502.01852 (2015)} 25 | """ 26 | c2_msra_fill(m) 27 | elif isinstance(m, nn.BatchNorm3d): 28 | if ( 29 | hasattr(m, "transform_final_bn") 30 | and m.transform_final_bn 31 | and zero_init_final_bn 32 | ): 33 | batchnorm_weight = 0.0 34 | else: 35 | batchnorm_weight = 1.0 36 | if m.weight is not None: 37 | m.weight.data.fill_(batchnorm_weight) 38 | if m.bias is not None: 39 | m.bias.data.zero_() 40 | if isinstance(m, nn.Linear): 41 | m.weight.data.normal_(mean=0.0, std=fc_init_std) 42 | if m.bias is not None: 43 | m.bias.data.zero_() 44 | -------------------------------------------------------------------------------- /timesformer/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /tools/benchmark.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | A script to benchmark data loading. 4 | """ 5 | 6 | import timesformer.utils.logging as logging 7 | from timesformer.utils.benchmark import benchmark_data_loading 8 | from timesformer.utils.misc import launch_job 9 | from timesformer.utils.parser import load_config, parse_args 10 | 11 | logger = logging.get_logger(__name__) 12 | 13 | 14 | def main(): 15 | args = parse_args() 16 | cfg = load_config(args) 17 | 18 | launch_job( 19 | cfg=cfg, init_method=args.init_method, func=benchmark_data_loading 20 | ) 21 | 22 | 23 | if __name__ == "__main__": 24 | main() 25 | -------------------------------------------------------------------------------- /tools/run_net.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
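# Example invocation (illustrative; substitute any config from configs/):
#   python tools/run_net.py \
#     --cfg configs/Kinetics/TimeSformer_divST_8x32_224.yaml \
#     NUM_GPUS 8 TRAIN.ENABLE True TEST.ENABLE False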
2 | 3 | """Wrapper to train and test a video classification model.""" 4 | from timesformer.utils.misc import launch_job 5 | from timesformer.utils.parser import load_config, parse_args 6 | 7 | from tools.test_net import test 8 | from tools.train_net import train 9 | 10 | 11 | def get_func(cfg): 12 | train_func = train 13 | test_func = test 14 | return train_func, test_func 15 | 16 | def main(): 17 | """ 18 | Main function to spawn the train and test process. 19 | """ 20 | args = parse_args() 21 | if args.num_shards > 1: 22 | args.output_dir = str(args.job_dir) 23 | cfg = load_config(args) 24 | 25 | train, test = get_func(cfg) 26 | 27 | # Perform training. 28 | if cfg.TRAIN.ENABLE: 29 | launch_job(cfg=cfg, init_method=args.init_method, func=train) 30 | 31 | # Perform multi-clip testing. 32 | if cfg.TEST.ENABLE: 33 | launch_job(cfg=cfg, init_method=args.init_method, func=test) 34 | 35 | # Perform model visualization. 36 | if cfg.TENSORBOARD.ENABLE and ( 37 | cfg.TENSORBOARD.MODEL_VIS.ENABLE 38 | or cfg.TENSORBOARD.WRONG_PRED_VIS.ENABLE 39 | ): 40 | launch_job(cfg=cfg, init_method=args.init_method, func=visualize) 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /tools/submit.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from pathlib import Path 4 | import shutil 5 | import submitit 6 | import multiprocessing 7 | import sys 8 | 9 | import torch 10 | import timesformer.utils.checkpoint as cu 11 | import timesformer.utils.multiprocessing as mpu 12 | from timesformer.utils.misc import launch_job 13 | from timesformer.utils.parser import load_config 14 | 15 | from tools.run_net import get_func 16 | 17 | def parse_args(): 18 | parser = argparse.ArgumentParser( 19 | "Submitit for onestage training", add_help=False 20 | ) 21 | parser.add_argument( 22 | "--num_gpus", 23 | help="Number of GPUs", 24 | default=8, 25 | type=int, 26 | ) 27 | parser.add_argument( 28 | "--num_shards", 29 | help="Number of Nodes", 30 | default=1, 31 | type=int, 32 | ) 33 | parser.add_argument( 34 | "--partition", default="learnfair", type=str, help="Partition where to submit" 35 | ) 36 | parser.add_argument("--timeout", default=60 * 72, type=int, help="Duration of the job") 37 | parser.add_argument("--cfg", dest="cfg_file", help="Path to the config file", 38 | default="configs/test_R50_8GPU.yaml", type=str) 39 | parser.add_argument( 40 | "--job_dir", default="", type=str, help="Job dir. Leave empty for automatic." 41 | ) 42 | parser.add_argument( 43 | "--name", default="", type=str, help="Job dir. Leave empty for automatic." 44 | ) 45 | parser.add_argument( 46 | "--resume-from", 47 | default="", 48 | type=str, 49 | help=( 50 | "Weights to resume from (.*pth file) or a file (last_checkpoint) that contains " 51 | + "weight file name from the same directory" 52 | ), 53 | ) 54 | parser.add_argument("--resume-job", default="", type=str, help="resume training from the job") 55 | parser.add_argument("--use_volta32", action='store_true', help="Big models? Use this") 56 | parser.add_argument("--postfix", default="experiment", type=str, help="Postfix of the jobs") 57 | parser.add_argument("--mail", default="", type=str, 58 | help="Email this user when the job finishes if specified") 59 | parser.add_argument('--comment', default="", type=str, 60 | help='Comment to pass to scheduler, e.g. 
priority message') 61 | parser.add_argument( 62 | "opts", 63 | help="See lib/config/defaults.py for all options", 64 | default=None, 65 | nargs=argparse.REMAINDER, 66 | ) 67 | return parser.parse_args() 68 | 69 | 70 | def get_shared_folder() -> Path: 71 | user = os.getenv("USER") 72 | if Path("/checkpoint/").is_dir(): 73 | p = Path(f"/checkpoint/{user}/experiments") 74 | p.mkdir(exist_ok=True) 75 | return p 76 | raise RuntimeError("No shared folder available") 77 | 78 | 79 | def launch(shard_id, num_shards, cfg, init_method): 80 | os.environ["NCCL_MIN_NRINGS"] = "8" 81 | 82 | print ("Pytorch version: ", torch.__version__) 83 | 84 | cfg.SHARD_ID = shard_id 85 | cfg.NUM_SHARDS = num_shards 86 | 87 | print([ 88 | shard_id, num_shards, cfg 89 | ]) 90 | 91 | train, test = get_func(cfg) 92 | # Launch job. 93 | if cfg.TRAIN.ENABLE: 94 | launch_job(cfg=cfg, init_method=init_method, func=train) 95 | 96 | if cfg.TEST.ENABLE: 97 | launch_job(cfg=cfg, init_method=init_method, func=test) 98 | 99 | 100 | class Trainer(object): 101 | def __init__(self, args): 102 | self.args = args 103 | 104 | def __call__(self): 105 | 106 | socket_name = os.popen("ip r | grep default | awk '{print $5}'").read().strip('\n') 107 | print("Setting GLOO and NCCL sockets IFNAME to: {}".format(socket_name)) 108 | os.environ["GLOO_SOCKET_IFNAME"] = socket_name 109 | # not sure if the next line is really affect anything 110 | os.environ["NCCL_SOCKET_IFNAME"] = socket_name 111 | 112 | 113 | hostname_first_node = os.popen( 114 | "scontrol show hostnames $SLURM_JOB_NODELIST" 115 | ).read().split("\n")[0] 116 | dist_url = "tcp://{}:12399".format(hostname_first_node) 117 | print("We will use the following dist url: {}".format(dist_url)) 118 | 119 | self._setup_gpu_args() 120 | results = launch( 121 | shard_id=self.args.machine_rank, 122 | num_shards=self.args.num_shards, 123 | cfg=load_config(self.args), 124 | init_method=dist_url, 125 | ) 126 | return results 127 | 128 | def checkpoint(self): 129 | import submitit 130 | 131 | job_env = submitit.JobEnvironment() 132 | slurm_job_id = job_env.job_id 133 | if self.args.resume_job == "": 134 | self.args.resume_job = slurm_job_id 135 | print("Requeuing ", self.args) 136 | empty_trainer = type(self)(self.args) 137 | return submitit.helpers.DelayedSubmission(empty_trainer) 138 | 139 | def _setup_gpu_args(self): 140 | import submitit 141 | 142 | job_env = submitit.JobEnvironment() 143 | print(self.args) 144 | 145 | self.args.machine_rank = job_env.global_rank 146 | print(f"Process rank: {job_env.global_rank}") 147 | 148 | 149 | def main(): 150 | args = parse_args() 151 | 152 | if args.name == "": 153 | cfg_name = os.path.splitext(os.path.basename(args.cfg_file))[0] 154 | args.name = '_'.join([cfg_name, args.postfix]) 155 | 156 | assert args.job_dir != "" 157 | 158 | args.output_dir = str(args.job_dir) 159 | args.job_dir = Path(args.job_dir) / "%j" 160 | 161 | # Note that the folder will depend on the job_id, to easily track experiments 162 | #executor = submitit.AutoExecutor(folder=Path(args.job_dir) / "%j", slurm_max_num_timeout=30) 163 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) 164 | 165 | # cluster setup is defined by environment variables 166 | num_gpus_per_node = args.num_gpus 167 | nodes = args.num_shards 168 | partition = args.partition 169 | timeout_min = args.timeout 170 | kwargs = {} 171 | if args.use_volta32: 172 | kwargs['slurm_constraint'] = 'volta32gb,ib4' 173 | if args.comment: 174 | kwargs['slurm_comment'] = args.comment 175 | 176 | 
executor.update_parameters( 177 | mem_gb=60 * num_gpus_per_node, 178 | gpus_per_node=num_gpus_per_node, 179 | tasks_per_node=1, 180 | cpus_per_task=10 * num_gpus_per_node, 181 | nodes=nodes, 182 | timeout_min=timeout_min, # max is 60 * 72 183 | slurm_partition=partition, 184 | slurm_signal_delay_s=120, 185 | **kwargs 186 | ) 187 | 188 | 189 | print(args.name) 190 | executor.update_parameters(name=args.name) 191 | 192 | trainer = Trainer(args) 193 | job = executor.submit(trainer) 194 | 195 | print("Submitted job_id:", job.job_id) 196 | 197 | 198 | if __name__ == "__main__": 199 | main() 200 | -------------------------------------------------------------------------------- /tools/test_net.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Multi-view test a video classification model.""" 4 | 5 | import numpy as np 6 | import os 7 | import pickle 8 | import torch 9 | from fvcore.common.file_io import PathManager 10 | import cv2 11 | from einops import rearrange, reduce, repeat 12 | import scipy.io 13 | 14 | import timesformer.utils.checkpoint as cu 15 | import timesformer.utils.distributed as du 16 | import timesformer.utils.logging as logging 17 | import timesformer.utils.misc as misc 18 | import timesformer.visualization.tensorboard_vis as tb 19 | from timesformer.datasets import loader 20 | from timesformer.models import build_model 21 | from timesformer.utils.meters import TestMeter 22 | 23 | logger = logging.get_logger(__name__) 24 | 25 | 26 | @torch.no_grad() 27 | def perform_test(test_loader, model, test_meter, cfg, writer=None): 28 | """ 29 | For classification: 30 | Perform mutli-view testing that uniformly samples N clips from a video along 31 | its temporal axis. For each clip, it takes 3 crops to cover the spatial 32 | dimension, followed by averaging the softmax scores across all Nx3 views to 33 | form a video-level prediction. All video predictions are compared to 34 | ground-truth labels and the final testing performance is logged. 35 | For detection: 36 | Perform fully-convolutional testing on the full frames without crop. 37 | Args: 38 | test_loader (loader): video testing loader. 39 | model (model): the pretrained video model to test. 40 | test_meter (TestMeter): testing meters to log and ensemble the testing 41 | results. 42 | cfg (CfgNode): configs. Details can be found in 43 | slowfast/config/defaults.py 44 | writer (TensorboardWriter object, optional): TensorboardWriter object 45 | to writer Tensorboard log. 46 | """ 47 | # Enable eval mode. 48 | model.eval() 49 | test_meter.iter_tic() 50 | 51 | for cur_iter, (inputs, labels, video_idx, meta) in enumerate(test_loader): 52 | if cfg.NUM_GPUS: 53 | # Transfer the data to the current GPU device. 54 | if isinstance(inputs, (list,)): 55 | for i in range(len(inputs)): 56 | inputs[i] = inputs[i].cuda(non_blocking=True) 57 | else: 58 | inputs = inputs.cuda(non_blocking=True) 59 | 60 | # Transfer the data to the current GPU device. 61 | labels = labels.cuda() 62 | video_idx = video_idx.cuda() 63 | for key, val in meta.items(): 64 | if isinstance(val, (list,)): 65 | for i in range(len(val)): 66 | val[i] = val[i].cuda(non_blocking=True) 67 | else: 68 | meta[key] = val.cuda(non_blocking=True) 69 | test_meter.data_toc() 70 | 71 | if cfg.DETECTION.ENABLE: 72 | # Compute the predictions. 
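            # In detection mode the model also consumes the RoI proposal boxes
            # from the loader; predictions are made per box, and the unaligned
            # gather below is used because the number of boxes can differ
            # across GPUs.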
73 |             preds = model(inputs, meta["boxes"])
74 |             ori_boxes = meta["ori_boxes"]
75 |             metadata = meta["metadata"]
76 | 
77 |             preds = preds.detach().cpu() if cfg.NUM_GPUS else preds.detach()
78 |             ori_boxes = (
79 |                 ori_boxes.detach().cpu() if cfg.NUM_GPUS else ori_boxes.detach()
80 |             )
81 |             metadata = (
82 |                 metadata.detach().cpu() if cfg.NUM_GPUS else metadata.detach()
83 |             )
84 | 
85 |             if cfg.NUM_GPUS > 1:
86 |                 preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
87 |                 ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes), dim=0)
88 |                 metadata = torch.cat(du.all_gather_unaligned(metadata), dim=0)
89 | 
90 |             test_meter.iter_toc()
91 |             # Update and log stats.
92 |             test_meter.update_stats(preds, ori_boxes, metadata)
93 |             test_meter.log_iter_stats(None, cur_iter)
94 |         else:
95 |             # Perform the forward pass.
96 |             preds = model(inputs)
97 | 
98 |             # Gather all the predictions across all the devices to perform ensemble.
99 |             if cfg.NUM_GPUS > 1:
100 |                 preds, labels, video_idx = du.all_gather(
101 |                     [preds, labels, video_idx]
102 |                 )
103 |             if cfg.NUM_GPUS:
104 |                 preds = preds.cpu()
105 |                 labels = labels.cpu()
106 |                 video_idx = video_idx.cpu()
107 | 
108 |             test_meter.iter_toc()
109 |             # Update and log stats.
110 |             test_meter.update_stats(
111 |                 preds.detach(), labels.detach(), video_idx.detach()
112 |             )
113 |             test_meter.log_iter_stats(cur_iter)
114 | 
115 |         test_meter.iter_tic()
116 | 
117 |     # Log epoch stats and print the final testing results.
118 |     if not cfg.DETECTION.ENABLE:
119 |         all_preds = test_meter.video_preds.clone().detach()
120 |         all_labels = test_meter.video_labels
121 |         if cfg.NUM_GPUS:
122 |             all_preds = all_preds.cpu()
123 |             all_labels = all_labels.cpu()
124 |         if writer is not None:
125 |             writer.plot_eval(preds=all_preds, labels=all_labels)
126 | 
127 |         if cfg.TEST.SAVE_RESULTS_PATH != "":
128 |             save_path = os.path.join(cfg.OUTPUT_DIR, cfg.TEST.SAVE_RESULTS_PATH)
129 | 
130 |             with PathManager.open(save_path, "wb") as f:
131 |                 pickle.dump([all_preds, all_labels], f)
132 | 
133 |             logger.info(
134 |                 "Successfully saved prediction results to {}".format(save_path)
135 |             )
136 | 
137 |     test_meter.finalize_metrics()
138 |     return test_meter
139 | 
140 | 
141 | def test(cfg):
142 |     """
143 |     Perform multi-view testing on the pretrained video model.
144 |     Args:
145 |         cfg (CfgNode): configs. Details can be found in
146 |             slowfast/config/defaults.py
147 |     """
148 |     # Set up environment.
149 |     du.init_distributed_training(cfg)
150 |     # Set random seed from configs.
151 |     np.random.seed(cfg.RNG_SEED)
152 |     torch.manual_seed(cfg.RNG_SEED)
153 | 
154 |     # Setup logging format.
155 |     logging.setup_logging(cfg.OUTPUT_DIR)
156 | 
157 |     # Print config.
158 |     logger.info("Test with config:")
159 |     logger.info(cfg)
160 | 
161 |     # Build the video model and print model statistics.
162 |     model = build_model(cfg)
163 |     if du.is_master_proc() and cfg.LOG_MODEL_INFO:
164 |         misc.log_model_info(model, cfg, use_train_input=False)
165 | 
166 |     cu.load_test_checkpoint(cfg, model)
167 | 
168 |     # Create video testing loaders.
169 |     test_loader = loader.construct_loader(cfg, "test")
170 |     logger.info("Testing model for {} iterations".format(len(test_loader)))
171 | 
172 |     assert (
173 |         len(test_loader.dataset)
174 |         % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS)
175 |         == 0
176 |     )
177 |     # Create meters for multi-view testing.
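    # Each video contributes NUM_ENSEMBLE_VIEWS x NUM_SPATIAL_CROPS clips
    # (e.g. 10 x 3 = 30 with the common defaults), so the meter tracks
    # len(test_loader.dataset) // (views * crops) distinct videos and
    # ensembles that many clip predictions per video.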
178 |     test_meter = TestMeter(
179 |         len(test_loader.dataset)
180 |         // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS),
181 |         cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS,
182 |         cfg.MODEL.NUM_CLASSES,
183 |         len(test_loader),
184 |         cfg.DATA.MULTI_LABEL,
185 |         cfg.DATA.ENSEMBLE_METHOD,
186 |     )
187 | 
188 |     # Set up writer for logging to Tensorboard format.
189 |     if cfg.TENSORBOARD.ENABLE and du.is_master_proc(
190 |         cfg.NUM_GPUS * cfg.NUM_SHARDS
191 |     ):
192 |         writer = tb.TensorboardWriter(cfg)
193 |     else:
194 |         writer = None
195 | 
196 |     # Perform multi-view test on the entire dataset.
197 |     test_meter = perform_test(test_loader, model, test_meter, cfg, writer)
198 |     if writer is not None:
199 |         writer.close()
200 | 
--------------------------------------------------------------------------------
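The clip-to-video ensembling described in perform_test above (uniformly sampled clips, three spatial crops, softmax scores averaged into a single video-level prediction) can be summarized with the short sketch below. It is illustrative only: the names ensemble_video_prediction and clip_logits are not part of the repository, and the 30-view / 400-class shapes are example values.

import torch


def ensemble_video_prediction(clip_logits: torch.Tensor) -> torch.Tensor:
    """Average softmax scores over all views of a single video.

    clip_logits: (num_views, num_classes) raw logits, one row per sampled
    clip/crop of the same video (e.g. 10 temporal clips x 3 spatial crops).
    Returns a (num_classes,) tensor of video-level scores.
    """
    # Softmax each view independently, then average across views.
    return torch.softmax(clip_logits, dim=1).mean(dim=0)


# Example: 30 views of one video for a 400-class (Kinetics-400-like) model.
scores = ensemble_video_prediction(torch.randn(30, 400))
print(scores.topk(5).indices)  # indices of the five highest-scoring classes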