├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── configs ├── EK │ ├── divided_224_16x4.yaml │ ├── joint_224_16x4.yaml │ ├── motionformer_224_16x4.yaml │ ├── motionformer_224_32x3.yaml │ └── motionformer_336_16x4.yaml ├── K400 │ ├── divided_224_16x4.yaml │ ├── joint_224_16x4.yaml │ ├── motionformer_224_16x4.yaml │ ├── motionformer_224_32x3.yaml │ └── motionformer_336_16x8.yaml ├── K600 │ ├── divided_224_16x4.yaml │ ├── joint_224_16x4.yaml │ ├── motionformer_224_16x4.yaml │ ├── motionformer_224_32x3.yaml │ └── motionformer_336_16x4.yaml └── SSV2 │ ├── divided_224_16x4.yaml │ ├── joint_224_16x4.yaml │ ├── motionformer_224_16x4.yaml │ ├── motionformer_224_32x3.yaml │ └── motionformer_336_16x4.yaml ├── data ├── kinetics_400 │ └── preprocess.py └── kinetics_600 │ └── preprocess.py ├── environment.yml ├── figs ├── firstpage.png ├── qual_results.png ├── splash.png └── traj_attn_fig.png ├── index.html ├── run_with_submitit.py ├── setup.cfg ├── setup.py ├── slowfast ├── __init__.py ├── config │ ├── __init__.py │ ├── custom_config.py │ └── defaults.py ├── datasets │ ├── DATASET.md │ ├── __init__.py │ ├── autoaugment.py │ ├── build.py │ ├── cv2_transform.py │ ├── decoder.py │ ├── epickitchens.py │ ├── epickitchens_record.py │ ├── frame_loader.py │ ├── kinetics.py │ ├── loader.py │ ├── multigrid_helper.py │ ├── random_erasing.py │ ├── samplers.py │ ├── ssv2.py │ ├── transform.py │ ├── utils.py │ ├── video_container.py │ └── video_record.py ├── models │ ├── __init__.py │ ├── adamw.py │ ├── batchnorm_helper.py │ ├── build.py │ ├── losses.py │ ├── nystrom_helper.py │ ├── optimizer.py │ ├── orthoformer_helper.py │ ├── performer_helper.py │ ├── video_model_builder.py │ └── vit_helper.py ├── utils │ ├── __init__.py │ ├── benchmark.py │ ├── bn_helper.py │ ├── c2_model_loading.py │ ├── checkpoint.py │ ├── distributed.py │ ├── env.py │ ├── logging.py │ ├── lr_policy.py │ ├── meters.py │ ├── metrics.py │ ├── misc.py │ ├── multigrid.py │ ├── multiprocessing.py │ ├── parser.py │ └── weight_init_helper.py └── visualization │ ├── __init__.py │ ├── async_predictor.py │ ├── ava_demo_precomputed_boxes.py │ ├── demo_loader.py │ ├── gradcam_utils.py │ ├── prediction_vis.py │ ├── predictor.py │ ├── tensorboard_vis.py │ ├── utils.py │ └── video_visualizer.py ├── slurm_scripts ├── run_multi_node_job.sh ├── run_single_node_job.sh └── test.sh └── tools ├── benchmark.py ├── run_net.py ├── test_net.py └── train_net.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | bin/ 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | *.pkl 23 | *.json 24 | *.npy 25 | *.csv 26 | 27 | # Installer logs 28 | pip-log.txt 29 | pip-delete-this-directory.txt 30 | 31 | # Unit test / coverage reports 32 | .tox/ 33 | .coverage 34 | .cache 35 | nosetests.xml 36 | coverage.xml 37 | 38 | # Translations 39 | *.mo 40 | 41 | # Mr Developer 42 | .mr.developer.cfg 43 | .project 44 | .pydevproject 45 | 46 | # Rope 47 | .ropeproject 48 | 49 | # Django stuff: 50 | *.log 51 | *.pot 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code 
of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Motionformer 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `master`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to Motionformer, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2019, Facebook, Inc 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /configs/EK/divided_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Epickitchens 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: data/epic_kitchens/ 20 | USE_RAND_AUGMENT: True 21 | RE_PROB: 0.0 22 | USE_REPEATED_AUG: False 23 | USE_RANDOM_RESIZE_CROPS: False 24 | COLORJITTER: False 25 | GRAYSCALE: False 26 | GAUSSIAN: False 27 | SOLVER: 28 | BASE_LR: 1e-4 29 | LR_POLICY: steps_with_relative_lrs 30 | LRS: [1, 0.1, 0.01] 31 | STEPS: [0, 30, 40] 32 | MAX_EPOCH: 50 33 | MOMENTUM: 0.9 34 | WEIGHT_DECAY: 5e-2 35 | WARMUP_EPOCHS: 0.0 36 | OPTIMIZING_METHOD: adamw 37 | USE_MIXED_PRECISION: True 38 | SMOOTHING: 0.2 39 | SLOWFAST: 40 | ALPHA: 8 41 | VIT: 42 | PATCH_SIZE: 16 43 | PATCH_SIZE_TEMP: 2 44 | CHANNELS: 3 45 | EMBED_DIM: 768 46 | DEPTH: 12 47 | NUM_HEADS: 12 48 | MLP_RATIO: 4 49 | QKV_BIAS: True 50 | VIDEO_INPUT: True 51 | TEMPORAL_RESOLUTION: 8 52 | USE_MLP: True 53 | DROP: 0.0 54 | POS_DROPOUT: 0.0 55 | DROP_PATH: 0.2 56 | IM_PRETRAINED: True 57 | HEAD_DROPOUT: 0.0 58 | HEAD_ACT: tanh 59 | PRETRAINED_WEIGHTS: vit_1k 60 | ATTN_LAYER: divided 61 | MODEL: 62 | NUM_CLASSES: 97 63 | ARCH: slow 64 | MODEL_NAME: VisionTransformer 65 | LOSS_FUNC: cross_entropy 66 | TEST: 67 | ENABLE: True 68 | DATASET: Epickitchens 69 | BATCH_SIZE: 64 70 | NUM_ENSEMBLE_VIEWS: 10 71 | NUM_SPATIAL_CROPS: 3 72 | DATA_LOADER: 73 | NUM_WORKERS: 8 74 | PIN_MEMORY: True 75 | NUM_GPUS: 8 76 | NUM_SHARDS: 4 77 | RNG_SEED: 0 78 | OUTPUT_DIR: . 
79 | TENSORBOARD: 80 | ENABLE: True 81 | -------------------------------------------------------------------------------- /configs/EK/joint_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Epickitchens 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: data/epic_kitchens/ 20 | USE_RAND_AUGMENT: True 21 | RE_PROB: 0.0 22 | USE_REPEATED_AUG: False 23 | USE_RANDOM_RESIZE_CROPS: False 24 | COLORJITTER: False 25 | GRAYSCALE: False 26 | GAUSSIAN: False 27 | SOLVER: 28 | BASE_LR: 1e-4 29 | LR_POLICY: steps_with_relative_lrs 30 | LRS: [1, 0.1, 0.01] 31 | STEPS: [0, 30, 40] 32 | MAX_EPOCH: 50 33 | MOMENTUM: 0.9 34 | WEIGHT_DECAY: 5e-2 35 | WARMUP_EPOCHS: 0.0 36 | OPTIMIZING_METHOD: adamw 37 | USE_MIXED_PRECISION: True 38 | SMOOTHING: 0.2 39 | SLOWFAST: 40 | ALPHA: 8 41 | VIT: 42 | PATCH_SIZE: 16 43 | PATCH_SIZE_TEMP: 2 44 | CHANNELS: 3 45 | EMBED_DIM: 768 46 | DEPTH: 12 47 | NUM_HEADS: 12 48 | MLP_RATIO: 4 49 | QKV_BIAS: True 50 | VIDEO_INPUT: True 51 | TEMPORAL_RESOLUTION: 8 52 | USE_MLP: True 53 | DROP: 0.0 54 | POS_DROPOUT: 0.0 55 | DROP_PATH: 0.2 56 | IM_PRETRAINED: True 57 | HEAD_DROPOUT: 0.0 58 | HEAD_ACT: tanh 59 | PRETRAINED_WEIGHTS: vit_1k 60 | ATTN_LAYER: joint 61 | MODEL: 62 | NUM_CLASSES: 97 63 | ARCH: slow 64 | MODEL_NAME: VisionTransformer 65 | LOSS_FUNC: cross_entropy 66 | TEST: 67 | ENABLE: True 68 | DATASET: Epickitchens 69 | BATCH_SIZE: 64 70 | NUM_ENSEMBLE_VIEWS: 10 71 | NUM_SPATIAL_CROPS: 3 72 | DATA_LOADER: 73 | NUM_WORKERS: 8 74 | PIN_MEMORY: True 75 | NUM_GPUS: 8 76 | NUM_SHARDS: 4 77 | RNG_SEED: 0 78 | OUTPUT_DIR: . 
79 | TENSORBOARD: 80 | ENABLE: True 81 | -------------------------------------------------------------------------------- /configs/EK/motionformer_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Epickitchens 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: data/epic_kitchens/ 20 | USE_RAND_AUGMENT: True 21 | RE_PROB: 0.0 22 | USE_REPEATED_AUG: False 23 | USE_RANDOM_RESIZE_CROPS: False 24 | COLORJITTER: False 25 | GRAYSCALE: False 26 | GAUSSIAN: False 27 | SOLVER: 28 | BASE_LR: 1e-4 29 | LR_POLICY: steps_with_relative_lrs 30 | LRS: [1, 0.1, 0.01] 31 | STEPS: [0, 30, 40] 32 | MAX_EPOCH: 50 33 | MOMENTUM: 0.9 34 | WEIGHT_DECAY: 5e-2 35 | WARMUP_EPOCHS: 0.0 36 | OPTIMIZING_METHOD: adamw 37 | USE_MIXED_PRECISION: True 38 | SMOOTHING: 0.2 39 | SLOWFAST: 40 | ALPHA: 8 41 | VIT: 42 | PATCH_SIZE: 16 43 | PATCH_SIZE_TEMP: 2 44 | CHANNELS: 3 45 | EMBED_DIM: 768 46 | DEPTH: 12 47 | NUM_HEADS: 12 48 | MLP_RATIO: 4 49 | QKV_BIAS: True 50 | VIDEO_INPUT: True 51 | TEMPORAL_RESOLUTION: 8 52 | USE_MLP: True 53 | DROP: 0.0 54 | POS_DROPOUT: 0.0 55 | DROP_PATH: 0.2 56 | IM_PRETRAINED: True 57 | HEAD_DROPOUT: 0.0 58 | HEAD_ACT: tanh 59 | PRETRAINED_WEIGHTS: vit_1k 60 | ATTN_LAYER: trajectory 61 | MODEL: 62 | NUM_CLASSES: 97 63 | ARCH: slow 64 | MODEL_NAME: VisionTransformer 65 | LOSS_FUNC: cross_entropy 66 | TEST: 67 | ENABLE: True 68 | DATASET: Epickitchens 69 | BATCH_SIZE: 64 70 | NUM_ENSEMBLE_VIEWS: 10 71 | NUM_SPATIAL_CROPS: 3 72 | DATA_LOADER: 73 | NUM_WORKERS: 8 74 | PIN_MEMORY: True 75 | NUM_GPUS: 8 76 | NUM_SHARDS: 4 77 | RNG_SEED: 0 78 | OUTPUT_DIR: . 
79 | TENSORBOARD: 80 | ENABLE: True 81 | -------------------------------------------------------------------------------- /configs/EK/motionformer_224_32x3.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Epickitchens 4 | BATCH_SIZE: 12 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 32 12 | SAMPLING_RATE: 3 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: data/epic_kitchens/ 20 | USE_RAND_AUGMENT: True 21 | RE_PROB: 0.0 22 | USE_REPEATED_AUG: False 23 | USE_RANDOM_RESIZE_CROPS: False 24 | COLORJITTER: False 25 | GRAYSCALE: False 26 | GAUSSIAN: False 27 | SOLVER: 28 | BASE_LR: 0.375e-4 29 | LR_POLICY: steps_with_relative_lrs 30 | LRS: [1, 0.1, 0.01] 31 | STEPS: [0, 30, 40] 32 | MAX_EPOCH: 50 33 | MOMENTUM: 0.9 34 | WEIGHT_DECAY: 5e-2 35 | WARMUP_EPOCHS: 0.0 36 | OPTIMIZING_METHOD: adamw 37 | USE_MIXED_PRECISION: True 38 | SMOOTHING: 0.2 39 | SLOWFAST: 40 | ALPHA: 8 41 | VIT: 42 | PATCH_SIZE: 16 43 | PATCH_SIZE_TEMP: 2 44 | CHANNELS: 3 45 | EMBED_DIM: 768 46 | DEPTH: 12 47 | NUM_HEADS: 12 48 | MLP_RATIO: 4 49 | QKV_BIAS: True 50 | VIDEO_INPUT: True 51 | TEMPORAL_RESOLUTION: 16 52 | USE_MLP: True 53 | DROP: 0.0 54 | POS_DROPOUT: 0.0 55 | DROP_PATH: 0.2 56 | IM_PRETRAINED: True 57 | HEAD_DROPOUT: 0.0 58 | HEAD_ACT: tanh 59 | PRETRAINED_WEIGHTS: vit_1k 60 | ATTN_LAYER: trajectory 61 | MODEL: 62 | NUM_CLASSES: 97 63 | ARCH: slow 64 | MODEL_NAME: VisionTransformer 65 | LOSS_FUNC: cross_entropy 66 | TEST: 67 | ENABLE: True 68 | DATASET: Epickitchens 69 | BATCH_SIZE: 64 70 | NUM_ENSEMBLE_VIEWS: 10 71 | NUM_SPATIAL_CROPS: 3 72 | DATA_LOADER: 73 | NUM_WORKERS: 8 74 | PIN_MEMORY: True 75 | NUM_GPUS: 8 76 | NUM_SHARDS: 4 77 | RNG_SEED: 0 78 | OUTPUT_DIR: . 
79 | TENSORBOARD: 80 | ENABLE: True 81 | -------------------------------------------------------------------------------- /configs/EK/motionformer_336_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Epickitchens 4 | BATCH_SIZE: 12 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [384, 480] 14 | TRAIN_CROP_SIZE: 336 15 | TEST_CROP_SIZE: 336 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: data/epic_kitchens/ 20 | USE_RAND_AUGMENT: True 21 | RE_PROB: 0.0 22 | USE_REPEATED_AUG: False 23 | USE_RANDOM_RESIZE_CROPS: False 24 | COLORJITTER: False 25 | GRAYSCALE: False 26 | GAUSSIAN: False 27 | SOLVER: 28 | BASE_LR: 0.375e-4 29 | LR_POLICY: steps_with_relative_lrs 30 | LRS: [1, 0.1, 0.01] 31 | STEPS: [0, 30, 40] 32 | MAX_EPOCH: 50 33 | MOMENTUM: 0.9 34 | WEIGHT_DECAY: 5e-2 35 | WARMUP_EPOCHS: 0.0 36 | OPTIMIZING_METHOD: adamw 37 | USE_MIXED_PRECISION: True 38 | SMOOTHING: 0.2 39 | SLOWFAST: 40 | ALPHA: 8 41 | VIT: 42 | PATCH_SIZE: 16 43 | PATCH_SIZE_TEMP: 2 44 | CHANNELS: 3 45 | EMBED_DIM: 768 46 | DEPTH: 12 47 | NUM_HEADS: 12 48 | MLP_RATIO: 4 49 | QKV_BIAS: True 50 | VIDEO_INPUT: True 51 | TEMPORAL_RESOLUTION: 8 52 | USE_MLP: True 53 | DROP: 0.0 54 | POS_DROPOUT: 0.0 55 | DROP_PATH: 0.2 56 | IM_PRETRAINED: True 57 | HEAD_DROPOUT: 0.0 58 | HEAD_ACT: tanh 59 | PRETRAINED_WEIGHTS: vit_1k 60 | ATTN_LAYER: trajectory 61 | MODEL: 62 | NUM_CLASSES: 97 63 | ARCH: slow 64 | MODEL_NAME: VisionTransformer 65 | LOSS_FUNC: cross_entropy 66 | TEST: 67 | ENABLE: True 68 | DATASET: Epickitchens 69 | BATCH_SIZE: 64 70 | NUM_ENSEMBLE_VIEWS: 10 71 | NUM_SPATIAL_CROPS: 3 72 | DATA_LOADER: 73 | NUM_WORKERS: 8 74 | PIN_MEMORY: True 75 | NUM_GPUS: 8 76 | NUM_SHARDS: 4 77 | RNG_SEED: 0 78 | OUTPUT_DIR: . 
79 | TENSORBOARD: 80 | ENABLE: True 81 | -------------------------------------------------------------------------------- /configs/K400/divided_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 16 10 | SAMPLING_RATE: 4 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 224 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_400/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 1e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 8 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: divided 59 | MODEL: 60 | NUM_CLASSES: 400 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K400/joint_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 16 10 | SAMPLING_RATE: 4 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 224 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_400/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 1e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 8 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: joint 59 | MODEL: 60 | NUM_CLASSES: 400 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K400/motionformer_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 16 10 | SAMPLING_RATE: 4 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 224 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_400/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 1e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 8 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: trajectory 59 | MODEL: 60 | NUM_CLASSES: 400 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K400/motionformer_224_32x3.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 12 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 32 10 | SAMPLING_RATE: 3 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 224 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_400/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: True 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 0.375e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 16 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: trajectory 59 | MODEL: 60 | NUM_CLASSES: 400 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K400/motionformer_336_16x8.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 12 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 16 10 | SAMPLING_RATE: 8 11 | TRAIN_JITTER_SCALES: [384, 480] 12 | TRAIN_CROP_SIZE: 336 13 | TEST_CROP_SIZE: 336 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_400/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 0.375e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 8 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: trajectory 59 | MODEL: 60 | NUM_CLASSES: 400 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K600/divided_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 16 10 | SAMPLING_RATE: 4 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 224 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_600/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 1e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 8 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: divided 59 | MODEL: 60 | NUM_CLASSES: 600 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K600/joint_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 16 10 | SAMPLING_RATE: 4 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 224 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_600/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 1e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 8 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: joint 59 | MODEL: 60 | NUM_CLASSES: 600 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K600/motionformer_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 16 10 | SAMPLING_RATE: 4 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 224 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_600/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 1e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 8 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: trajectory 59 | MODEL: 60 | NUM_CLASSES: 600 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K600/motionformer_224_32x3.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 12 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 32 10 | SAMPLING_RATE: 3 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 224 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_600/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 0.375e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 16 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: trajectory 59 | MODEL: 60 | NUM_CLASSES: 600 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K600/motionformer_336_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 12 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 16 10 | SAMPLING_RATE: 4 11 | TRAIN_JITTER_SCALES: [384, 480] 12 | TRAIN_CROP_SIZE: 336 13 | TEST_CROP_SIZE: 336 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_600/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 0.375e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 8 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: trajectory 59 | MODEL: 60 | NUM_CLASSES: 600 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/SSV2/divided_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Ssv2 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2 20 | PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames 21 | INV_UNIFORM_SAMPLE: True 22 | RANDOM_FLIP: False 23 | REVERSE_INPUT_CHANNEL: True 24 | USE_RAND_AUGMENT: True 25 | RE_PROB: 0.0 26 | USE_REPEATED_AUG: False 27 | USE_RANDOM_RESIZE_CROPS: False 28 | COLORJITTER: False 29 | GRAYSCALE: False 30 | GAUSSIAN: False 31 | SOLVER: 32 | BASE_LR: 1e-4 33 | LR_POLICY: steps_with_relative_lrs 34 | LRS: [1, 0.1, 0.01] 35 | STEPS: [0, 20, 30] 36 | MAX_EPOCH: 35 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 5e-2 39 | WARMUP_EPOCHS: 0.0 40 | OPTIMIZING_METHOD: adamw 41 | USE_MIXED_PRECISION: True 42 | SMOOTHING: 0.2 43 | SLOWFAST: 44 | ALPHA: 8 45 | VIT: 46 | PATCH_SIZE: 16 47 | PATCH_SIZE_TEMP: 2 48 | CHANNELS: 3 49 | EMBED_DIM: 768 50 | DEPTH: 12 51 | NUM_HEADS: 12 52 | MLP_RATIO: 4 53 | QKV_BIAS: True 54 | VIDEO_INPUT: True 55 | TEMPORAL_RESOLUTION: 8 56 | USE_MLP: True 57 | DROP: 0.0 58 | POS_DROPOUT: 0.0 59 | DROP_PATH: 0.2 60 | IM_PRETRAINED: True 61 | HEAD_DROPOUT: 0.0 62 | HEAD_ACT: tanh 63 | PRETRAINED_WEIGHTS: vit_1k 64 | ATTN_LAYER: divided 65 | MODEL: 66 | NUM_CLASSES: 174 67 | ARCH: slow 68 | MODEL_NAME: VisionTransformer 69 | LOSS_FUNC: cross_entropy 70 | TEST: 71 | ENABLE: True 72 | DATASET: Ssv2 73 | BATCH_SIZE: 64 74 | NUM_ENSEMBLE_VIEWS: 1 75 | NUM_SPATIAL_CROPS: 3 76 | DATA_LOADER: 77 | NUM_WORKERS: 4 78 | PIN_MEMORY: True 79 | NUM_GPUS: 8 80 | NUM_SHARDS: 4 81 | RNG_SEED: 0 82 | OUTPUT_DIR: . 
83 | TENSORBOARD: 84 | ENABLE: True 85 | -------------------------------------------------------------------------------- /configs/SSV2/joint_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Ssv2 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2 20 | PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames 21 | INV_UNIFORM_SAMPLE: True 22 | RANDOM_FLIP: False 23 | REVERSE_INPUT_CHANNEL: True 24 | USE_RAND_AUGMENT: False 25 | RE_PROB: 0.0 26 | USE_REPEATED_AUG: False 27 | USE_RANDOM_RESIZE_CROPS: False 28 | COLORJITTER: False 29 | GRAYSCALE: False 30 | GAUSSIAN: False 31 | SOLVER: 32 | BASE_LR: 1e-4 33 | LR_POLICY: steps_with_relative_lrs 34 | LRS: [1, 0.1, 0.01] 35 | STEPS: [0, 20, 30] 36 | MAX_EPOCH: 35 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 5e-2 39 | WARMUP_EPOCHS: 0.0 40 | OPTIMIZING_METHOD: adamw 41 | USE_MIXED_PRECISION: True 42 | SMOOTHING: 0.2 43 | SLOWFAST: 44 | ALPHA: 8 45 | VIT: 46 | PATCH_SIZE: 16 47 | PATCH_SIZE_TEMP: 2 48 | CHANNELS: 3 49 | EMBED_DIM: 768 50 | DEPTH: 12 51 | NUM_HEADS: 12 52 | MLP_RATIO: 4 53 | QKV_BIAS: True 54 | VIDEO_INPUT: True 55 | TEMPORAL_RESOLUTION: 8 56 | USE_MLP: True 57 | DROP: 0.0 58 | POS_DROPOUT: 0.0 59 | DROP_PATH: 0.2 60 | IM_PRETRAINED: True 61 | HEAD_DROPOUT: 0.0 62 | HEAD_ACT: tanh 63 | PRETRAINED_WEIGHTS: vit_1k 64 | ATTN_LAYER: joint 65 | MODEL: 66 | NUM_CLASSES: 174 67 | ARCH: slow 68 | MODEL_NAME: VisionTransformer 69 | LOSS_FUNC: cross_entropy 70 | TEST: 71 | ENABLE: True 72 | DATASET: Ssv2 73 | BATCH_SIZE: 64 74 | NUM_ENSEMBLE_VIEWS: 1 75 | NUM_SPATIAL_CROPS: 3 76 | DATA_LOADER: 77 | NUM_WORKERS: 4 78 | PIN_MEMORY: True 79 | NUM_GPUS: 8 80 | NUM_SHARDS: 4 81 | RNG_SEED: 0 82 | OUTPUT_DIR: . 
83 | TENSORBOARD: 84 | ENABLE: True 85 | -------------------------------------------------------------------------------- /configs/SSV2/motionformer_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Ssv2 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2 20 | PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames 21 | INV_UNIFORM_SAMPLE: True 22 | RANDOM_FLIP: False 23 | REVERSE_INPUT_CHANNEL: True 24 | USE_RAND_AUGMENT: True 25 | RE_PROB: 0.0 26 | USE_REPEATED_AUG: False 27 | USE_RANDOM_RESIZE_CROPS: False 28 | COLORJITTER: False 29 | GRAYSCALE: False 30 | GAUSSIAN: False 31 | SOLVER: 32 | BASE_LR: 1e-4 33 | LR_POLICY: steps_with_relative_lrs 34 | LRS: [1, 0.1, 0.01] 35 | STEPS: [0, 20, 30] 36 | MAX_EPOCH: 35 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 5e-2 39 | WARMUP_EPOCHS: 0.0 40 | OPTIMIZING_METHOD: adamw 41 | USE_MIXED_PRECISION: True 42 | SMOOTHING: 0.2 43 | SLOWFAST: 44 | ALPHA: 8 45 | VIT: 46 | PATCH_SIZE: 16 47 | PATCH_SIZE_TEMP: 2 48 | CHANNELS: 3 49 | EMBED_DIM: 768 50 | DEPTH: 12 51 | NUM_HEADS: 12 52 | MLP_RATIO: 4 53 | QKV_BIAS: True 54 | VIDEO_INPUT: True 55 | TEMPORAL_RESOLUTION: 8 56 | USE_MLP: True 57 | DROP: 0.0 58 | POS_DROPOUT: 0.0 59 | DROP_PATH: 0.2 60 | IM_PRETRAINED: True 61 | HEAD_DROPOUT: 0.0 62 | HEAD_ACT: tanh 63 | PRETRAINED_WEIGHTS: vit_1k 64 | ATTN_LAYER: trajectory 65 | MODEL: 66 | NUM_CLASSES: 174 67 | ARCH: slow 68 | MODEL_NAME: VisionTransformer 69 | LOSS_FUNC: cross_entropy 70 | TEST: 71 | ENABLE: True 72 | DATASET: Ssv2 73 | BATCH_SIZE: 64 74 | NUM_ENSEMBLE_VIEWS: 1 75 | NUM_SPATIAL_CROPS: 3 76 | DATA_LOADER: 77 | NUM_WORKERS: 4 78 | PIN_MEMORY: True 79 | NUM_GPUS: 8 80 | NUM_SHARDS: 4 81 | RNG_SEED: 0 82 | OUTPUT_DIR: . 
83 | TENSORBOARD: 84 | ENABLE: True 85 | -------------------------------------------------------------------------------- /configs/SSV2/motionformer_224_32x3.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Ssv2 4 | BATCH_SIZE: 12 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 32 12 | SAMPLING_RATE: 3 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2 20 | PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames 21 | INV_UNIFORM_SAMPLE: True 22 | RANDOM_FLIP: False 23 | REVERSE_INPUT_CHANNEL: True 24 | USE_RAND_AUGMENT: True 25 | RE_PROB: 0.0 26 | USE_REPEATED_AUG: False 27 | USE_RANDOM_RESIZE_CROPS: False 28 | COLORJITTER: False 29 | GRAYSCALE: False 30 | GAUSSIAN: False 31 | SOLVER: 32 | BASE_LR: 0.375e-4 33 | LR_POLICY: steps_with_relative_lrs 34 | LRS: [1, 0.1, 0.01] 35 | STEPS: [0, 20, 30] 36 | MAX_EPOCH: 35 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 5e-2 39 | WARMUP_EPOCHS: 0.0 40 | OPTIMIZING_METHOD: adamw 41 | USE_MIXED_PRECISION: True 42 | SMOOTHING: 0.2 43 | SLOWFAST: 44 | ALPHA: 8 45 | VIT: 46 | PATCH_SIZE: 16 47 | PATCH_SIZE_TEMP: 2 48 | CHANNELS: 3 49 | EMBED_DIM: 768 50 | DEPTH: 12 51 | NUM_HEADS: 12 52 | MLP_RATIO: 4 53 | QKV_BIAS: True 54 | VIDEO_INPUT: True 55 | TEMPORAL_RESOLUTION: 16 56 | USE_MLP: True 57 | DROP: 0.0 58 | POS_DROPOUT: 0.0 59 | DROP_PATH: 0.2 60 | IM_PRETRAINED: True 61 | HEAD_DROPOUT: 0.0 62 | HEAD_ACT: tanh 63 | PRETRAINED_WEIGHTS: vit_1k 64 | ATTN_LAYER: trajectory 65 | MODEL: 66 | NUM_CLASSES: 174 67 | ARCH: slow 68 | MODEL_NAME: VisionTransformer 69 | LOSS_FUNC: cross_entropy 70 | TEST: 71 | ENABLE: True 72 | DATASET: Ssv2 73 | BATCH_SIZE: 64 74 | NUM_ENSEMBLE_VIEWS: 1 75 | NUM_SPATIAL_CROPS: 3 76 | DATA_LOADER: 77 | NUM_WORKERS: 4 78 | PIN_MEMORY: True 79 | NUM_GPUS: 8 80 | NUM_SHARDS: 4 81 | RNG_SEED: 0 82 | OUTPUT_DIR: . 
83 | TENSORBOARD: 84 | ENABLE: True 85 | -------------------------------------------------------------------------------- /configs/SSV2/motionformer_336_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Ssv2 4 | BATCH_SIZE: 12 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [384, 480] 14 | TRAIN_CROP_SIZE: 336 15 | TEST_CROP_SIZE: 336 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2 20 | PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames 21 | INV_UNIFORM_SAMPLE: True 22 | RANDOM_FLIP: False 23 | REVERSE_INPUT_CHANNEL: True 24 | USE_RAND_AUGMENT: True 25 | RE_PROB: 0.0 26 | USE_REPEATED_AUG: False 27 | USE_RANDOM_RESIZE_CROPS: False 28 | COLORJITTER: False 29 | GRAYSCALE: False 30 | GAUSSIAN: False 31 | SOLVER: 32 | BASE_LR: 0.375e-4 33 | LR_POLICY: steps_with_relative_lrs 34 | LRS: [1, 0.1, 0.01] 35 | STEPS: [0, 20, 30] 36 | MAX_EPOCH: 35 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 5e-2 39 | WARMUP_EPOCHS: 0.0 40 | OPTIMIZING_METHOD: adamw 41 | USE_MIXED_PRECISION: True 42 | SMOOTHING: 0.2 43 | SLOWFAST: 44 | ALPHA: 8 45 | VIT: 46 | PATCH_SIZE: 16 47 | PATCH_SIZE_TEMP: 2 48 | CHANNELS: 3 49 | EMBED_DIM: 768 50 | DEPTH: 12 51 | NUM_HEADS: 12 52 | MLP_RATIO: 4 53 | QKV_BIAS: True 54 | VIDEO_INPUT: True 55 | TEMPORAL_RESOLUTION: 8 56 | USE_MLP: True 57 | DROP: 0.0 58 | POS_DROPOUT: 0.0 59 | DROP_PATH: 0.2 60 | IM_PRETRAINED: True 61 | HEAD_DROPOUT: 0.0 62 | HEAD_ACT: tanh 63 | PRETRAINED_WEIGHTS: vit_1k 64 | ATTN_LAYER: trajectory 65 | MODEL: 66 | NUM_CLASSES: 174 67 | ARCH: slow 68 | MODEL_NAME: VisionTransformer 69 | LOSS_FUNC: cross_entropy 70 | TEST: 71 | ENABLE: True 72 | DATASET: Ssv2 73 | BATCH_SIZE: 64 74 | NUM_ENSEMBLE_VIEWS: 1 75 | NUM_SPATIAL_CROPS: 3 76 | DATA_LOADER: 77 | NUM_WORKERS: 4 78 | PIN_MEMORY: True 79 | NUM_GPUS: 8 80 | NUM_SHARDS: 4 81 | RNG_SEED: 0 82 | OUTPUT_DIR: . 83 | TENSORBOARD: 84 | ENABLE: True 85 | -------------------------------------------------------------------------------- /data/kinetics_400/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
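# This script generates the {train,val,test}.csv lists that DATASET.md describes for
# Kinetics: it walks <root_dir>/<split_dir>/<class_name>/<video>, assigns every class
# directory an integer id in sorted order, and writes one space-separated
# "path_to_video label" row per file, e.g. (illustrative path only):
#
#   /datasets01/kinetics/070618/400/train_avi-288p/abseiling/clip_0001.avi 0
#
# Run it once per split (train/val/test) as shown in slowfast/datasets/DATASET.md.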
3 | import csv 4 | import glob 5 | import os 6 | 7 | 8 | def k400_preprocess( 9 | root_dir='/datasets01/kinetics/070618/400/', split_dir='train_avi-288p', mode='train' 10 | ): 11 | data_prefix = os.path.join(root_dir, split_dir) 12 | files = list(sorted(glob.glob(os.path.join(data_prefix, '*', '*')))) 13 | classes = list(sorted(glob.glob(os.path.join(data_prefix, '*')))) 14 | classes = [os.path.basename(i) for i in classes] 15 | class_to_idx = {classes[i]: i for i in range(len(classes))} 16 | 17 | with open(f'{mode}.csv', mode='w') as csv_file: 18 | csv_writer = csv.writer(csv_file, delimiter=' ', quotechar='"', quoting=csv.QUOTE_MINIMAL) 19 | for path in files: 20 | class_name = path.split('/')[-2] 21 | class_idx = class_to_idx[class_name] 22 | csv_writer.writerow([path, class_idx]) 23 | 24 | 25 | if __name__ == '__main__': 26 | import argparse 27 | parser = argparse.ArgumentParser(description='K-400 preprocessing') 28 | 29 | parser.add_argument( 30 | '--root_dir', 31 | default='/datasets01/kinetics/070618/400/', 32 | type=str, 33 | help='root dir of K-400 folder' 34 | ) 35 | parser.add_argument( 36 | '--split_dir', 37 | default='train_avi-288p', 38 | type=str, 39 | help='name of the split directory' 40 | ) 41 | parser.add_argument( 42 | '--mode', 43 | default='train', 44 | type=str, 45 | help='dataset split to generate the csv for (train, val or test)' 46 | ) 47 | args = parser.parse_args() 48 | k400_preprocess( 49 | root_dir=args.root_dir, 50 | split_dir=args.split_dir, 51 | mode=args.mode 52 | ) -------------------------------------------------------------------------------- /data/kinetics_600/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | import csv 4 | import glob 5 | import os 6 | 7 | 8 | def k600_preprocess( 9 | root_dir='/datasets01/kinetics/070618/600/', split_dir='train_avi-288p', mode='train' 10 | ): 11 | data_prefix = os.path.join(root_dir, split_dir) 12 | files = list(sorted(glob.glob(os.path.join(data_prefix, '*', '*')))) 13 | classes = list(sorted(glob.glob(os.path.join(data_prefix, '*')))) 14 | classes = [os.path.basename(i) for i in classes] 15 | class_to_idx = {classes[i]: i for i in range(len(classes))} 16 | 17 | with open(f'{mode}.csv', mode='w') as csv_file: 18 | csv_writer = csv.writer(csv_file, delimiter=' ', quotechar='"', quoting=csv.QUOTE_MINIMAL) 19 | for path in files: 20 | class_name = path.split('/')[-2] 21 | class_idx = class_to_idx[class_name] 22 | csv_writer.writerow([path, class_idx]) 23 | 24 | 25 | if __name__ == '__main__': 26 | import argparse 27 | parser = argparse.ArgumentParser(description='K-600 preprocessing') 28 | 29 | parser.add_argument( 30 | '--root_dir', 31 | default='/datasets01/kinetics/070618/600/', 32 | type=str, 33 | help='root dir of K-600 folder' 34 | ) 35 | parser.add_argument( 36 | '--split_dir', 37 | default='train_avi-288p', 38 | type=str, 39 | help='name of the split directory' 40 | ) 41 | parser.add_argument( 42 | '--mode', 43 | default='train', 44 | type=str, 45 | help='dataset split to generate the csv for (train, val or test)' 46 | ) 47 | args = parser.parse_args() 48 | k600_preprocess( 49 | root_dir=args.root_dir, 50 | split_dir=args.split_dir, 51 | mode=args.mode 52 | ) -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: motionformer 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - 
_libgcc_mutex=0.1=conda_forge 8 | - _openmp_mutex=4.5=1_gnu 9 | - av=8.0.3=py38hcaf3a0b_0 10 | - blas=1.0=mkl 11 | - bzip2=1.0.8=h7b6447c_0 12 | - ca-certificates=2021.5.30=ha878542_0 13 | - certifi=2021.5.30=py38h578d9bd_0 14 | - cudatoolkit=10.2.89=hfd86e86_1 15 | - ffmpeg=4.3.1=h167e202_0 16 | - freetype=2.10.4=h5ab3b9f_0 17 | - gmp=6.2.1=h2531618_2 18 | - gnutls=3.6.15=he1e5248_0 19 | - intel-openmp=2021.2.0=h06a4308_610 20 | - jpeg=9b=h024ee3a_2 21 | - lame=3.100=h7b6447c_0 22 | - lcms2=2.12=h3be6417_0 23 | - ld_impl_linux-64=2.33.1=h53a641e_7 24 | - libffi=3.3=he6710b0_2 25 | - libgcc-ng=9.3.0=h2828fa1_19 26 | - libgomp=9.3.0=h2828fa1_19 27 | - libiconv=1.15=h63c8f33_5 28 | - libidn2=2.3.1=h27cfd23_0 29 | - libpng=1.6.37=hbc83047_0 30 | - libstdcxx-ng=9.1.0=hdf63c60_0 31 | - libtasn1=4.16.0=h27cfd23_0 32 | - libtiff=4.2.0=h85742a9_0 33 | - libunistring=0.9.10=h27cfd23_0 34 | - libuv=1.40.0=h7b6447c_0 35 | - libwebp-base=1.2.0=h27cfd23_0 36 | - lz4-c=1.9.3=h2531618_0 37 | - mkl=2021.2.0=h06a4308_296 38 | - mkl-service=2.3.0=py38h27cfd23_1 39 | - mkl_fft=1.3.0=py38h42c9631_2 40 | - mkl_random=1.2.1=py38ha9443f7_2 41 | - ncurses=6.2=he6710b0_1 42 | - nettle=3.7.2=hbbd107a_1 43 | - ninja=1.10.2=hff7bd54_1 44 | - numpy=1.20.2=py38h2d18471_0 45 | - numpy-base=1.20.2=py38hfae3a4d_0 46 | - olefile=0.46=py_0 47 | - openh264=2.1.1=h8b12597_0 48 | - openssl=1.1.1k=h7f98852_0 49 | - pillow=8.2.0=py38he98fc37_0 50 | - pip=21.1.1=py38h06a4308_0 51 | - python=3.8.5=h7579374_1 52 | - python_abi=3.8=1_cp38 53 | - pytorch=1.8.1=py3.8_cuda10.2_cudnn7.6.5_0 54 | - readline=8.1=h27cfd23_0 55 | - setuptools=52.0.0=py38h06a4308_0 56 | - six=1.15.0=py38h06a4308_0 57 | - sqlite=3.35.4=hdfb4753_0 58 | - tk=8.6.10=hbc83047_0 59 | - torchvision=0.9.1=py38_cu102 60 | - typing_extensions=3.7.4.3=pyha847dfd_0 61 | - wheel=0.36.2=pyhd3eb1b0_0 62 | - x264=1!152.20180806=h14c3975_0 63 | - xz=5.2.5=h7b6447c_0 64 | - zlib=1.2.11=h7b6447c_3 65 | - zstd=1.4.9=haebb681_0 66 | - pip: 67 | - chardet==4.0.0 68 | - cloudpickle==1.6.0 69 | - cycler==0.10.0 70 | - ffmpeg-python==0.2.0 71 | - future==0.18.2 72 | - fvcore==0.1.5 73 | - idna==2.10 74 | - iopath==0.1.8 75 | - joblib==1.0.1 76 | - kiwisolver==1.3.1 77 | - matplotlib==3.4.2 78 | - pandas==1.2.4 79 | - psutil==5.8.0 80 | - pyparsing==2.4.7 81 | - python-dateutil==2.8.1 82 | - pytz==2021.1 83 | - pyyaml==5.4.1 84 | - requests==2.25.1 85 | - scikit-learn==0.24.2 86 | - scipy==1.6.3 87 | - sklearn==0.0 88 | - threadpoolctl==2.1.0 89 | - timm==0.4.9 90 | - tqdm==4.61.0 91 | - urllib3==1.26.5 92 | - werkzeug==2.0.1 93 | prefix: /private/home/mandelapatrick/.conda/envs/motionformer 94 | -------------------------------------------------------------------------------- /figs/firstpage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/Motionformer/bf43d50cc0e13ba1240a4acf2a6800ccdbe1bb49/figs/firstpage.png -------------------------------------------------------------------------------- /figs/qual_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/Motionformer/bf43d50cc0e13ba1240a4acf2a6800ccdbe1bb49/figs/qual_results.png -------------------------------------------------------------------------------- /figs/splash.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/facebookresearch/Motionformer/bf43d50cc0e13ba1240a4acf2a6800ccdbe1bb49/figs/splash.png -------------------------------------------------------------------------------- /figs/traj_attn_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/Motionformer/bf43d50cc0e13ba1240a4acf2a6800ccdbe1bb49/figs/traj_attn_fig.png -------------------------------------------------------------------------------- /run_with_submitit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | import argparse 5 | import os 6 | from pathlib import Path 7 | import shutil 8 | import submitit 9 | import multiprocessing 10 | import sys 11 | import uuid 12 | 13 | import torch 14 | import slowfast.utils.checkpoint as cu 15 | import slowfast.utils.multiprocessing as mpu 16 | from slowfast.utils.misc import launch_job 17 | from slowfast.utils.parser import load_config 18 | 19 | from tools.test_net import test 20 | from tools.train_net import train 21 | 22 | def parse_args(): 23 | parser = argparse.ArgumentParser( 24 | "Submitit for onestage training", add_help=False 25 | ) 26 | parser.add_argument( 27 | "--num_gpus", 28 | help="Number of GPUs", 29 | default=8, 30 | type=int, 31 | ) 32 | parser.add_argument( 33 | "--num_shards", 34 | help="Number of Nodes", 35 | default=1, 36 | type=int, 37 | ) 38 | parser.add_argument( 39 | "--partition", 40 | default="learnfair", 41 | type=str, 42 | help="Partition where to submit" 43 | ) 44 | parser.add_argument( 45 | "--timeout", 46 | default=60 * 72, 47 | type=int, 48 | help="Duration of the job" 49 | ) 50 | parser.add_argument( 51 | "--cfg", 52 | dest="cfg_file", 53 | help="Path to the config file", 54 | default="configs/test_R50_8GPU.yaml", type=str 55 | ) 56 | parser.add_argument( 57 | "--job_dir", 58 | default="/checkpoint/mandelapatrick/slowfast_ssv2", 59 | type=str, 60 | help="Job dir. Leave empty for automatic." 61 | ) 62 | parser.add_argument( 63 | "--name", 64 | default="", 65 | type=str, 66 | help="Job dir. Leave empty for automatic." 67 | ) 68 | parser.add_argument( 69 | "--resume-from", 70 | default="", 71 | type=str, 72 | help=( 73 | "Weights to resume from (.*pth file) or a file (last_checkpoint) that contains " 74 | + "weight file name from the same directory" 75 | ), 76 | ) 77 | parser.add_argument( 78 | "--resume-job", 79 | default="", 80 | type=str, 81 | help="resume training from the job") 82 | parser.add_argument( 83 | "--use_volta32", 84 | action='store_true', 85 | help="Big models? Use this") 86 | parser.add_argument( 87 | "--postfix", 88 | default="experiment", 89 | type=str, 90 | help="Postfix of the jobs" 91 | ) 92 | parser.add_argument( 93 | "--mail", 94 | default="", 95 | type=str, 96 | help="Email this user when the job finishes if specified" 97 | ) 98 | parser.add_argument( 99 | '--comment', 100 | default="", 101 | type=str, 102 | help='Comment to pass to scheduler, e.g. 
priority message' 103 | ) 104 | parser.add_argument( 105 | "opts", 106 | help="See slowfast/config/defaults.py for all options", 107 | default=None, 108 | nargs=argparse.REMAINDER, 109 | ) 110 | return parser.parse_args() 111 | 112 | 113 | def get_shared_folder() -> Path: 114 | user = os.getenv("USER") 115 | if Path("/checkpoint/").is_dir(): 116 | p = Path(f"/checkpoint/{user}/slowfast") 117 | p.mkdir(exist_ok=True) 118 | return p 119 | raise RuntimeError("No shared folder available") 120 | 121 | 122 | def get_init_file(): 123 | # Init file must not exist, but it's parent dir must exist. 124 | os.makedirs(str(get_shared_folder()), exist_ok=True) 125 | init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" 126 | if init_file.exists(): 127 | os.remove(str(init_file)) 128 | return init_file 129 | 130 | 131 | def launch(shard_id, num_shards, cfg, init_method): 132 | os.environ["NCCL_MIN_NRINGS"] = "8" 133 | 134 | print ("Pytorch version: ", torch.__version__) 135 | cfg.SHARD_ID = shard_id 136 | cfg.NUM_SHARDS = num_shards 137 | cfg.USE_SBATCH = False 138 | 139 | print([ 140 | shard_id, num_shards, cfg 141 | ]) 142 | 143 | # train, test = get_func(cfg) 144 | # Launch job. 145 | if cfg.TRAIN.ENABLE: 146 | launch_job(cfg=cfg, init_method=init_method, func=train) 147 | 148 | if cfg.TEST.ENABLE: 149 | launch_job(cfg=cfg, init_method=init_method, func=test) 150 | 151 | 152 | class Trainer(object): 153 | def __init__(self, args): 154 | self.args = args 155 | 156 | def __call__(self): 157 | 158 | socket_name = os.popen("ip r | grep default | awk '{print $5}'").read().strip('\n') 159 | print("Setting GLOO and NCCL sockets IFNAME to: {}".format(socket_name)) 160 | os.environ["GLOO_SOCKET_IFNAME"] = socket_name 161 | os.environ["NCCL_SOCKET_IFNAME"] = socket_name 162 | 163 | hostname_first_node = os.popen( 164 | "scontrol show hostnames $SLURM_JOB_NODELIST" 165 | ).read().split("\n")[0] 166 | dist_url = "tcp://{}:12399".format(hostname_first_node) 167 | print("We will use the following dist url: {}".format(dist_url)) 168 | 169 | self._setup_gpu_args() 170 | results = launch( 171 | shard_id=self.args.machine_rank, 172 | num_shards=self.args.num_shards, 173 | cfg=load_config(self.args), 174 | init_method=dist_url, 175 | ) 176 | return results 177 | 178 | def checkpoint(self): 179 | import submitit 180 | 181 | job_env = submitit.JobEnvironment() 182 | slurm_job_id = job_env.job_id 183 | if self.args.resume_job == "": 184 | self.args.resume_job = slurm_job_id 185 | print("Requeuing ", self.args) 186 | empty_trainer = type(self)(self.args) 187 | return submitit.helpers.DelayedSubmission(empty_trainer) 188 | 189 | def _setup_gpu_args(self): 190 | import submitit 191 | 192 | job_env = submitit.JobEnvironment() 193 | print(self.args) 194 | 195 | self.args.machine_rank = job_env.global_rank 196 | self.args.output_dir = str(self.args.output_dir).replace("%j", str(job_env.job_id)) 197 | print(f"Process rank: {job_env.global_rank}") 198 | 199 | 200 | def main(): 201 | args = parse_args() 202 | 203 | if args.name == "": 204 | cfg_name = os.path.splitext(os.path.basename(args.cfg_file))[0] 205 | args.name = '_'.join([cfg_name, args.postfix]) 206 | 207 | assert args.job_dir != "" 208 | 209 | args.job_dir = Path(args.job_dir) / "%j" 210 | args.output_dir = args.job_dir 211 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) 212 | 213 | # cluster setup is defined by environment variables 214 | num_gpus_per_node = args.num_gpus 215 | nodes = args.num_shards 216 | partition = 
args.partition 217 | timeout_min = args.timeout 218 | kwargs = {} 219 | if args.use_volta32: 220 | kwargs['slurm_constraint'] = 'volta32gb' 221 | if args.comment: 222 | kwargs['slurm_comment'] = args.comment 223 | 224 | executor.update_parameters( 225 | mem_gb=60 * num_gpus_per_node, 226 | gpus_per_node=num_gpus_per_node, 227 | tasks_per_node=1, 228 | cpus_per_task=10 * num_gpus_per_node, 229 | nodes=nodes, 230 | timeout_min=timeout_min, # max is 60 * 72 231 | slurm_partition=partition, 232 | slurm_signal_delay_s=120, 233 | **kwargs 234 | ) 235 | 236 | 237 | print(args.name) 238 | executor.update_parameters(name=args.name) 239 | 240 | trainer = Trainer(args) 241 | job = executor.submit(trainer) 242 | print("Submitted job_id:", job.job_id) 243 | 244 | 245 | if __name__ == "__main__": 246 | main() 247 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=4 4 | known_standard_library=numpy,setuptools 5 | known_myself=slowfast 6 | known_third_party=fvcore,iopath,av,torch,pycocotools,yacs,termcolor,scipy,simplejson,matplotlib,detectron2,torchvision,yaml,tqdm,psutil,opencv-python,pandas,tensorboard,moviepy,sklearn,cv2 7 | no_lines_before=STDLIB,THIRDPARTY 8 | sections=FUTURE,STDLIB,THIRDPARTY,myself,FIRSTPARTY,LOCALFOLDER 9 | default_section=FIRSTPARTY 10 | 11 | [mypy] 12 | python_version=3.6 13 | ignore_missing_imports = True 14 | warn_unused_configs = True 15 | disallow_untyped_defs = True 16 | check_untyped_defs = True 17 | warn_unused_ignores = True 18 | warn_redundant_casts = True 19 | show_column_numbers = True 20 | follow_imports = silent 21 | allow_redefinition = True 22 | ; Require all functions to be annotated 23 | disallow_incomplete_defs = True 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from setuptools import find_packages, setup 5 | 6 | setup( 7 | name="slowfast", 8 | version="1.0", 9 | author="FAIR", 10 | url="unknown", 11 | description="SlowFast Video Understanding", 12 | install_requires=[ 13 | "yacs>=0.1.6", 14 | "pyyaml>=5.1", 15 | "av", 16 | "matplotlib", 17 | "termcolor>=1.1", 18 | "simplejson", 19 | "tqdm", 20 | "psutil", 21 | "matplotlib", 22 | "detectron2", 23 | "opencv-python", 24 | "pandas", 25 | "torchvision>=0.4.2", 26 | "sklearn", 27 | "tensorboard", 28 | ], 29 | extras_require={"tensorboard_video_visualization": ["moviepy"]}, 30 | packages=find_packages(exclude=("configs", "tests")), 31 | ) 32 | -------------------------------------------------------------------------------- /slowfast/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from slowfast.utils.env import setup_environment 5 | 6 | setup_environment() 7 | -------------------------------------------------------------------------------- /slowfast/config/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
3 | -------------------------------------------------------------------------------- /slowfast/config/custom_config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Add custom configs and default values""" 5 | 6 | 7 | def add_custom_config(_C): 8 | # Add your own customized configs. 9 | pass 10 | -------------------------------------------------------------------------------- /slowfast/datasets/DATASET.md: -------------------------------------------------------------------------------- 1 | # Dataset Preparation 2 | 3 | ## Kinetics 4 | 5 | The Kinetics Dataset could be downloaded via the code released by ActivityNet: 6 | 7 | 1. Download the videos via the official [scripts](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). 8 | 9 | 2. After all the videos were downloaded, resize the video to the short edge size of 256, then prepare the csv files for training, validation, and testing set as `train.csv`, `val.csv`, `test.csv`. The format of the csv file is: 10 | 11 | ``` 12 | path_to_video_1 label_1 13 | path_to_video_2 label_2 14 | path_to_video_3 label_3 15 | ... 16 | path_to_video_N label_N 17 | ``` 18 | 19 | You can use provided helper functions to create csv files: 20 | ``` 21 | cd data/kinetics_400 22 | python3 preprocess.py --root_dir $PATH_TO_ROOT_DIR --split_dir $SPLIT_DIR --mode $MODE 23 | ``` 24 | 25 | For example: 26 | 27 | ``` 28 | cd data/kinetics_400 29 | python3 preprocess.py --root_dir /datasets01/kinetics/070618/400/ --split_dir train_avi-288p --mode train 30 | python3 preprocess.py --root_dir /datasets01/kinetics/070618/400/ --split_dir val_avi-288p --mode val 31 | python3 preprocess.py --root_dir /datasets01/kinetics/070618/400/ --split_dir val_avi-288p --mode test 32 | ``` 33 | 34 | ## Something-Something V2 35 | 1. Please download the dataset and annotations from [dataset provider](https://20bn.com/datasets/something-something). 36 | 37 | 2. Download the *frame list* from the following links: ([train](https://dl.fbaipublicfiles.com/pyslowfast/dataset/ssv2/frame_lists/train.csv), [val](https://dl.fbaipublicfiles.com/pyslowfast/dataset/ssv2/frame_lists/val.csv)). 38 | 39 | 3. Extract the frames at 30 FPS. (We used ffmpeg-4.1.3 with command 40 | `ffmpeg -i "${video}" -r 30 -q:v 1 "${out_name}"` 41 | in experiments.) Please put the frames in a structure consistent with the frame lists. 42 | 43 | 44 | Please put all annotation json files and the frame lists in the same folder, and set `DATA.PATH_TO_DATA_DIR` to the path. Set `DATA.PATH_PREFIX` to be the path to the folder containing extracted frames. 45 | 46 | ## Epic-Kitchens-100 47 | 48 | Follow instructions from [dataset provider](https://github.com/epic-kitchens/epic-kitchens-100-annotations). -------------------------------------------------------------------------------- /slowfast/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
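# Importing the dataset modules below registers each class (Epickitchens, Kinetics,
# Ssv2) in DATASET_REGISTRY, which is how the TRAIN.DATASET / TEST.DATASET strings in
# the configs are resolved by build_dataset() in build.py. A new dataset would follow
# the same pattern (sketch only; "Mydataset" is a made-up name):
#
#   from .build import DATASET_REGISTRY
#
#   @DATASET_REGISTRY.register()
#   class Mydataset(torch.utils.data.Dataset):
#       def __init__(self, cfg, split):
#           ...
#
# and would then be imported in this file so the registration side effect runs.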
3 | 4 | from .build import DATASET_REGISTRY, build_dataset # noqa 5 | from .epickitchens import Epickitchens 6 | from .kinetics import Kinetics # noqa 7 | from .ssv2 import Ssv2 # noqa 8 | -------------------------------------------------------------------------------- /slowfast/datasets/build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from fvcore.common.registry import Registry 5 | 6 | DATASET_REGISTRY = Registry("DATASET") 7 | DATASET_REGISTRY.__doc__ = """ 8 | Registry for dataset. 9 | 10 | The registered object will be called with `obj(cfg, split)`. 11 | The call should return a `torch.utils.data.Dataset` object. 12 | """ 13 | 14 | 15 | def build_dataset(dataset_name, cfg, split): 16 | """ 17 | Build a dataset, defined by `dataset_name`. 18 | Args: 19 | dataset_name (str): the name of the dataset to be constructed. 20 | cfg (CfgNode): configs. Details can be found in 21 | slowfast/config/defaults.py 22 | split (str): the split of the data loader. Options include `train`, 23 | `val`, and `test`. 24 | Returns: 25 | Dataset: a constructed dataset specified by dataset_name. 26 | """ 27 | # Capitalize the the first letter of the dataset_name since the dataset_name 28 | # in configs may be in lowercase but the name of dataset class should always 29 | # start with an uppercase letter. 30 | name = dataset_name.capitalize() 31 | return DATASET_REGISTRY.get(name)(cfg, split) 32 | -------------------------------------------------------------------------------- /slowfast/datasets/epickitchens_record.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
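# EpicKitchensVideoRecord wraps one row of the EPIC-KITCHENS annotation table: the
# "HH:MM:SS.ff" start/stop timestamps are converted to seconds by timestamp_to_sec()
# and then to frame indices via the per-video fps, which is inferred from the video id
# (a three-digit suffix such as P01_101 marks an EPIC-KITCHENS-100 extension video at
# 50 fps, otherwise 60 fps is assumed).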
3 | 4 | from .video_record import VideoRecord 5 | from datetime import timedelta 6 | import time 7 | 8 | 9 | def timestamp_to_sec(timestamp): 10 | x = time.strptime(timestamp, '%H:%M:%S.%f') 11 | sec = float(timedelta(hours=x.tm_hour, 12 | minutes=x.tm_min, 13 | seconds=x.tm_sec).total_seconds()) + float( 14 | timestamp.split('.')[-1]) / 100 15 | return sec 16 | 17 | 18 | class EpicKitchensVideoRecord(VideoRecord): 19 | def __init__(self, tup): 20 | self._index = str(tup[0]) 21 | self._series = tup[1] 22 | 23 | @property 24 | def participant(self): 25 | return self._series['participant_id'] 26 | 27 | @property 28 | def untrimmed_video_name(self): 29 | return self._series['video_id'] 30 | 31 | @property 32 | def start_frame(self): 33 | return int(round(timestamp_to_sec(self._series['start_timestamp']) * self.fps)) 34 | 35 | @property 36 | def end_frame(self): 37 | return int(round(timestamp_to_sec(self._series['stop_timestamp']) * self.fps)) 38 | 39 | @property 40 | def fps(self): 41 | is_100 = len(self.untrimmed_video_name.split('_')[1]) == 3 42 | return 50 if is_100 else 60 43 | 44 | @property 45 | def num_frames(self): 46 | return self.end_frame - self.start_frame 47 | 48 | @property 49 | def label(self): 50 | return {'verb': self._series['verb_class'] if 'verb_class' in self._series else -1, 51 | 'noun': self._series['noun_class'] if 'noun_class' in self._series else -1} 52 | 53 | @property 54 | def metadata(self): 55 | return {'narration_id': self._index} -------------------------------------------------------------------------------- /slowfast/datasets/frame_loader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | import os 5 | import torch 6 | from . import utils as utils 7 | from .decoder import get_start_end_idx 8 | 9 | 10 | def temporal_sampling( 11 | num_frames, start_idx, end_idx, num_samples, start_frame=0 12 | ): 13 | """ 14 | Given the start and end frame index, sample num_samples frames between 15 | the start and end with equal interval. 16 | Args: 17 | num_frames (int): number of frames of the trimmed action clip 18 | start_idx (int): the index of the start frame. 19 | end_idx (int): the index of the end frame. 20 | num_samples (int): number of frames to sample. 21 | start_frame (int): starting frame of the action clip in the untrimmed video 22 | Returns: 23 | frames (tersor): a tensor of temporal sampled video frames, dimension is 24 | `num clip frames` x `channel` x `height` x `width`. 
25 | """ 26 | index = torch.linspace(start_idx, end_idx, num_samples) 27 | index = torch.clamp(index, 0, num_frames - 1).long() 28 | return start_frame + index 29 | 30 | 31 | def pack_frames_to_video_clip( 32 | cfg, video_record, temporal_sample_index, target_fps=60 33 | ): 34 | # Load video by loading its extracted frames 35 | path_to_video = '{}/{}/rgb_frames/{}'.format( 36 | cfg.EPICKITCHENS.VISUAL_DATA_DIR, 37 | video_record.participant, 38 | video_record.untrimmed_video_name 39 | 40 | ) 41 | img_tmpl = "frame_{:010d}.jpg" 42 | fps = video_record.fps 43 | sampling_rate = cfg.DATA.SAMPLING_RATE 44 | num_samples = cfg.DATA.NUM_FRAMES 45 | start_idx, end_idx = get_start_end_idx( 46 | video_record.num_frames, 47 | num_samples * sampling_rate * fps / target_fps, 48 | temporal_sample_index, 49 | cfg.TEST.NUM_ENSEMBLE_VIEWS, 50 | ) 51 | start_idx, end_idx = start_idx + 1, end_idx + 1 52 | frame_idx = temporal_sampling( 53 | video_record.num_frames, 54 | start_idx, end_idx, num_samples, 55 | start_frame=video_record.start_frame 56 | ) 57 | img_paths = [ 58 | os.path.join( 59 | path_to_video, 60 | img_tmpl.format(idx.item() 61 | )) for idx in frame_idx] 62 | frames = utils.retry_load_images(img_paths) 63 | return frames -------------------------------------------------------------------------------- /slowfast/datasets/loader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Data loader.""" 5 | 6 | import itertools 7 | import numpy as np 8 | import torch 9 | from torch.utils.data._utils.collate import default_collate 10 | from torch.utils.data.distributed import DistributedSampler 11 | from torch.utils.data.sampler import RandomSampler 12 | 13 | from slowfast.datasets.multigrid_helper import ShortCycleBatchSampler 14 | 15 | from . import utils as utils 16 | from .build import build_dataset 17 | from .samplers import RASampler 18 | 19 | 20 | def detection_collate(batch): 21 | """ 22 | Collate function for detection task. Concatanate bboxes, labels and 23 | metadata from different samples in the first dimension instead of 24 | stacking them to have a batch-size dimension. 25 | Args: 26 | batch (tuple or list): data batch to collate. 27 | Returns: 28 | (tuple): collated detection data batch. 29 | """ 30 | inputs, labels, video_idx, extra_data = zip(*batch) 31 | inputs, video_idx = default_collate(inputs), default_collate(video_idx) 32 | labels = torch.tensor(np.concatenate(labels, axis=0)).float() 33 | 34 | collated_extra_data = {} 35 | for key in extra_data[0].keys(): 36 | data = [d[key] for d in extra_data] 37 | if key == "boxes" or key == "ori_boxes": 38 | # Append idx info to the bboxes before concatenating them. 39 | bboxes = [ 40 | np.concatenate( 41 | [np.full((data[i].shape[0], 1), float(i)), data[i]], axis=1 42 | ) 43 | for i in range(len(data)) 44 | ] 45 | bboxes = np.concatenate(bboxes, axis=0) 46 | collated_extra_data[key] = torch.tensor(bboxes).float() 47 | elif key == "metadata": 48 | collated_extra_data[key] = torch.tensor( 49 | list(itertools.chain(*data)) 50 | ).view(-1, 2) 51 | else: 52 | collated_extra_data[key] = default_collate(data) 53 | 54 | return inputs, labels, video_idx, collated_extra_data 55 | 56 | 57 | def construct_loader(cfg, split, is_precise_bn=False): 58 | """ 59 | Constructs the data loader for the given dataset. 60 | Args: 61 | cfg (CfgNode): configs. 
Details can be found in 62 | slowfast/config/defaults.py 63 | split (str): the split of the data loader. Options include `train`, 64 | `val`, and `test`. 65 | """ 66 | assert split in ["train", "val", "test"] 67 | if split in ["train"]: 68 | dataset_name = cfg.TRAIN.DATASET 69 | batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 70 | shuffle = True 71 | drop_last = True 72 | elif split in ["val"]: 73 | dataset_name = cfg.TRAIN.DATASET 74 | batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 75 | shuffle = False 76 | drop_last = False 77 | elif split in ["test"]: 78 | dataset_name = cfg.TEST.DATASET 79 | batch_size = int(cfg.TEST.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 80 | shuffle = False 81 | drop_last = False 82 | 83 | # Construct the dataset 84 | dataset = build_dataset(dataset_name, cfg, split) 85 | 86 | if isinstance(dataset, torch.utils.data.IterableDataset): 87 | loader = torch.utils.data.DataLoader( 88 | dataset, 89 | batch_size=batch_size, 90 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 91 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 92 | drop_last=drop_last, 93 | collate_fn=detection_collate if cfg.DETECTION.ENABLE else None, 94 | worker_init_fn=utils.loader_worker_init_fn(dataset), 95 | ) 96 | else: 97 | if ( 98 | cfg.MULTIGRID.SHORT_CYCLE 99 | and split in ["train"] 100 | and not is_precise_bn 101 | ): 102 | # Create a sampler for multi-process training 103 | sampler = utils.create_sampler(dataset, shuffle, cfg) 104 | batch_sampler = ShortCycleBatchSampler( 105 | sampler, batch_size=batch_size, drop_last=drop_last, cfg=cfg 106 | ) 107 | # Create a loader 108 | loader = torch.utils.data.DataLoader( 109 | dataset, 110 | batch_sampler=batch_sampler, 111 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 112 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 113 | worker_init_fn=utils.loader_worker_init_fn(dataset), 114 | ) 115 | else: 116 | # Create a sampler for multi-process training 117 | sampler = utils.create_sampler(dataset, shuffle, cfg) 118 | # Create a loader 119 | loader = torch.utils.data.DataLoader( 120 | dataset, 121 | batch_size=batch_size, 122 | shuffle=(False if sampler else shuffle), 123 | sampler=sampler, 124 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 125 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 126 | drop_last=drop_last, 127 | collate_fn=detection_collate if cfg.DETECTION.ENABLE else None, 128 | worker_init_fn=utils.loader_worker_init_fn(dataset), 129 | ) 130 | return loader 131 | 132 | 133 | def shuffle_dataset(loader, cur_epoch): 134 | """ " 135 | Shuffles the data. 136 | Args: 137 | loader (loader): data loader to perform shuffle. 138 | cur_epoch (int): number of the current epoch. 
139 | """ 140 | if ( 141 | loader._dataset_kind 142 | == torch.utils.data.dataloader._DatasetKind.Iterable 143 | ): 144 | if hasattr(loader.dataset, "sampler"): 145 | sampler = loader.dataset.sampler 146 | else: 147 | raise RuntimeError( 148 | "Unknown sampler for IterableDataset when shuffling dataset" 149 | ) 150 | else: 151 | sampler = ( 152 | loader.batch_sampler.sampler 153 | if isinstance(loader.batch_sampler, ShortCycleBatchSampler) 154 | else loader.sampler 155 | ) 156 | assert isinstance( 157 | sampler, (RandomSampler, DistributedSampler, RASampler) 158 | ), "Sampler type '{}' not supported".format(type(sampler)) 159 | # RandomSampler handles shuffling automatically 160 | if isinstance(sampler, DistributedSampler) or isinstance(sampler, RASampler): 161 | # DistributedSampler shuffles data based on epoch 162 | sampler.set_epoch(cur_epoch) 163 | -------------------------------------------------------------------------------- /slowfast/datasets/multigrid_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Helper functions for multigrid training.""" 5 | 6 | import numpy as np 7 | import torch 8 | from torch.utils.data.sampler import Sampler 9 | 10 | TORCH_MAJOR = int(torch.__version__.split('.')[0]) 11 | TORCH_MINOR = int(torch.__version__.split('.')[1]) 12 | 13 | if TORCH_MAJOR >= 1 and TORCH_MINOR >= 8: 14 | _int_classes = int 15 | else: 16 | from torch._six import int_classes as _int_classes 17 | 18 | 19 | class ShortCycleBatchSampler(Sampler): 20 | """ 21 | Extend Sampler to support "short cycle" sampling. 22 | See paper "A Multigrid Method for Efficiently Training Video Models", 23 | Wu et al., 2019 (https://arxiv.org/abs/1912.00998) for details. 
24 | """ 25 | 26 | def __init__(self, sampler, batch_size, drop_last, cfg): 27 | if not isinstance(sampler, Sampler): 28 | raise ValueError( 29 | "sampler should be an instance of " 30 | "torch.utils.data.Sampler, but got sampler={}".format(sampler) 31 | ) 32 | if ( 33 | not isinstance(batch_size, _int_classes) 34 | or isinstance(batch_size, bool) 35 | or batch_size <= 0 36 | ): 37 | raise ValueError( 38 | "batch_size should be a positive integer value, " 39 | "but got batch_size={}".format(batch_size) 40 | ) 41 | if not isinstance(drop_last, bool): 42 | raise ValueError( 43 | "drop_last should be a boolean value, but got " 44 | "drop_last={}".format(drop_last) 45 | ) 46 | self.sampler = sampler 47 | self.drop_last = drop_last 48 | 49 | bs_factor = [ 50 | int( 51 | round( 52 | ( 53 | float(cfg.DATA.TRAIN_CROP_SIZE) 54 | / (s * cfg.MULTIGRID.DEFAULT_S) 55 | ) 56 | ** 2 57 | ) 58 | ) 59 | for s in cfg.MULTIGRID.SHORT_CYCLE_FACTORS 60 | ] 61 | 62 | self.batch_sizes = [ 63 | batch_size * bs_factor[0], 64 | batch_size * bs_factor[1], 65 | batch_size, 66 | ] 67 | 68 | def __iter__(self): 69 | counter = 0 70 | batch_size = self.batch_sizes[0] 71 | batch = [] 72 | for idx in self.sampler: 73 | batch.append((idx, counter % 3)) 74 | if len(batch) == batch_size: 75 | yield batch 76 | counter += 1 77 | batch_size = self.batch_sizes[counter % 3] 78 | batch = [] 79 | if len(batch) > 0 and not self.drop_last: 80 | yield batch 81 | 82 | def __len__(self): 83 | avg_batch_size = sum(self.batch_sizes) / 3.0 84 | if self.drop_last: 85 | return int(np.floor(len(self.sampler) / avg_batch_size)) 86 | else: 87 | return int(np.ceil(len(self.sampler) / avg_batch_size)) 88 | -------------------------------------------------------------------------------- /slowfast/datasets/random_erasing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | # Copyright 2020 Ross Wightman 4 | # Modified 5 | 6 | import random 7 | import math 8 | import torch 9 | 10 | 11 | def _get_pixels(per_pixel, rand_color, patch_size, dtype=torch.float32, device='cuda'): 12 | # NOTE I've seen CUDA illegal memory access errors being caused by the normal_() 13 | # paths, flip the order so normal is run on CPU if this becomes a problem 14 | # Issue has been fixed in master https://github.com/pytorch/pytorch/issues/19508 15 | if per_pixel: 16 | return torch.empty(patch_size, dtype=dtype, device=device).normal_() 17 | elif rand_color: 18 | return torch.empty((patch_size[0], 1, 1), dtype=dtype, device=device).normal_() 19 | else: 20 | return torch.zeros((patch_size[0], 1, 1), dtype=dtype, device=device) 21 | 22 | 23 | class RandomErasing: 24 | """ Randomly selects a rectangle region in an image and erases its pixels. 25 | 'Random Erasing Data Augmentation' by Zhong et al. 26 | See https://arxiv.org/pdf/1708.04896.pdf 27 | 28 | This variant of RandomErasing is intended to be applied to either a batch 29 | or single image tensor after it has been normalized by dataset mean and std. 30 | Args: 31 | probability: Probability that the Random Erasing operation will be performed. 32 | min_area: Minimum percentage of erased area wrt input image area. 33 | max_area: Maximum percentage of erased area wrt input image area. 34 | min_aspect: Minimum aspect ratio of erased area. 
35 | mode: pixel color mode, one of 'const', 'rand', or 'pixel' 36 | 'const' - erase block is constant color of 0 for all channels 37 | 'rand' - erase block is same per-channel random (normal) color 38 | 'pixel' - erase block is per-pixel random (normal) color 39 | max_count: maximum number of erasing blocks per image, area per box is scaled by count. 40 | per-image count is randomly chosen between 1 and this value. 41 | """ 42 | 43 | def __init__( 44 | self, 45 | probability=0.5, min_area=0.02, max_area=1/3, min_aspect=0.3, max_aspect=None, 46 | mode='const', min_count=1, max_count=None, num_splits=0, device='cuda', seed=None): 47 | self.probability = probability 48 | self.min_area = min_area 49 | self.max_area = max_area 50 | max_aspect = max_aspect or 1 / min_aspect 51 | self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) 52 | self.min_count = min_count 53 | self.max_count = max_count or min_count 54 | self.num_splits = num_splits 55 | mode = mode.lower() 56 | self.rand_color = False 57 | self.per_pixel = False 58 | if mode == 'rand': 59 | self.rand_color = True # per block random normal 60 | elif mode == 'pixel': 61 | self.per_pixel = True # per pixel random normal 62 | else: 63 | assert not mode or mode == 'const' 64 | self.device = device 65 | self.seed = seed 66 | 67 | def _erase(self, img, chan, img_h, img_w, dtype): 68 | if self.seed is not None: 69 | random.seed(self.seed) 70 | 71 | if random.random() > self.probability: 72 | return 73 | area = img_h * img_w 74 | count = self.min_count if self.min_count == self.max_count else \ 75 | random.randint(self.min_count, self.max_count) 76 | for _ in range(count): 77 | for attempt in range(10): 78 | target_area = random.uniform(self.min_area, self.max_area) * area / count 79 | aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) 80 | h = int(round(math.sqrt(target_area * aspect_ratio))) 81 | w = int(round(math.sqrt(target_area / aspect_ratio))) 82 | if w < img_w and h < img_h: 83 | top = random.randint(0, img_h - h) 84 | left = random.randint(0, img_w - w) 85 | img[:, top:top + h, left:left + w] = _get_pixels( 86 | self.per_pixel, self.rand_color, (chan, h, w), 87 | dtype=dtype, device=self.device) 88 | break 89 | 90 | def __call__(self, input): 91 | if len(input.size()) == 3: 92 | self._erase(input, *input.size(), input.dtype) 93 | else: 94 | batch_size, chan, img_h, img_w = input.size() 95 | # skip first slice of batch if num_splits is set (for clean portion of samples) 96 | batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 97 | for i in range(batch_start, batch_size): 98 | self._erase(input[i], chan, img_h, img_w, input.dtype) 99 | return input -------------------------------------------------------------------------------- /slowfast/datasets/samplers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2015-present, Facebook, Inc. 3 | # All rights reserved. 4 | import torch 5 | import torch.distributed as dist 6 | import math 7 | 8 | 9 | class RASampler(torch.utils.data.Sampler): 10 | """Sampler that restricts data loading to a subset of the dataset for distributed, 11 | with repeated augmentation. 
12 | It ensures that different each augmented version of a sample will be visible to a 13 | different process (GPU) 14 | Heavily based on torch.utils.data.DistributedSampler 15 | """ 16 | 17 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 18 | if num_replicas is None: 19 | if not dist.is_available(): 20 | raise RuntimeError("Requires distributed package to be available") 21 | num_replicas = dist.get_world_size() 22 | if rank is None: 23 | if not dist.is_available(): 24 | raise RuntimeError("Requires distributed package to be available") 25 | rank = dist.get_rank() 26 | self.dataset = dataset 27 | self.num_replicas = num_replicas 28 | self.rank = rank 29 | self.epoch = 0 30 | self.num_samples = int(math.ceil(len(self.dataset) * 3.0 / self.num_replicas)) 31 | self.total_size = self.num_samples * self.num_replicas 32 | # self.num_selected_samples = int(math.ceil(len(self.dataset) / self.num_replicas)) 33 | self.num_selected_samples = int(math.floor(len(self.dataset) // 256 * 256 / self.num_replicas)) 34 | self.shuffle = shuffle 35 | 36 | def __iter__(self): 37 | # deterministically shuffle based on epoch 38 | g = torch.Generator() 39 | g.manual_seed(self.epoch) 40 | if self.shuffle: 41 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 42 | else: 43 | indices = list(range(len(self.dataset))) 44 | 45 | # add extra samples to make it evenly divisible 46 | indices = [ele for ele in indices for i in range(3)] 47 | indices += indices[:(self.total_size - len(indices))] 48 | assert len(indices) == self.total_size 49 | 50 | # subsample 51 | indices = indices[self.rank:self.total_size:self.num_replicas] 52 | assert len(indices) == self.num_samples 53 | 54 | return iter(indices[:self.num_selected_samples]) 55 | 56 | def __len__(self): 57 | return self.num_selected_samples 58 | 59 | def set_epoch(self, epoch): 60 | self.epoch = epoch -------------------------------------------------------------------------------- /slowfast/datasets/video_container.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | import av 5 | 6 | 7 | def get_video_container(path_to_vid, multi_thread_decode=False, backend="pyav"): 8 | """ 9 | Given the path to the video, return the pyav video container. 10 | Args: 11 | path_to_vid (str): path to the video. 12 | multi_thread_decode (bool): if True, perform multi-thread decoding. 13 | backend (str): decoder backend, options include `pyav` and 14 | `torchvision`, default is `pyav`. 15 | Returns: 16 | container (container): video container. 17 | """ 18 | if backend == "torchvision": 19 | with open(path_to_vid, "rb") as fp: 20 | container = fp.read() 21 | return container 22 | elif backend == "pyav": 23 | container = av.open(path_to_vid) 24 | if multi_thread_decode: 25 | # Enable multiple threads for decoding. 26 | container.streams.video[0].thread_type = "AUTO" 27 | return container 28 | else: 29 | raise NotImplementedError("Unknown backend {}".format(backend)) 30 | -------------------------------------------------------------------------------- /slowfast/datasets/video_record.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
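# VideoRecord is the abstract interface for a single annotated clip; concrete records
# (e.g. EpicKitchensVideoRecord in epickitchens_record.py) override the properties
# below so that the frame loader can query participant, untrimmed video name,
# start/end frame, number of frames and label in a dataset-agnostic way.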
3 | 4 | class VideoRecord(object): 5 | def __init__(self, row): 6 | self._data = row 7 | 8 | @property 9 | def segment_name(self): 10 | return NotImplementedError() 11 | 12 | @property 13 | def participant(self): 14 | return NotImplementedError() 15 | 16 | @property 17 | def untrimmed_video_name(self): 18 | return NotImplementedError() 19 | 20 | @property 21 | def start_frame(self): 22 | return NotImplementedError() 23 | 24 | @property 25 | def end_frame(self): 26 | return NotImplementedError() 27 | 28 | @property 29 | def num_frames(self): 30 | return NotImplementedError() 31 | 32 | @property 33 | def label(self): 34 | return NotImplementedError() -------------------------------------------------------------------------------- /slowfast/models/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from .build import MODEL_REGISTRY, build_model # noqa 5 | from .video_model_builder import VisionTransformer -------------------------------------------------------------------------------- /slowfast/models/adamw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | import math 5 | import torch 6 | from torch.optim.optimizer import Optimizer 7 | 8 | 9 | class AdamW(Optimizer): 10 | r"""Implements AdamW algorithm. 11 | 12 | The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. 13 | The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. 14 | 15 | Arguments: 16 | params (iterable): iterable of parameters to optimize or dicts defining 17 | parameter groups 18 | lr (float, optional): learning rate (default: 1e-3) 19 | betas (Tuple[float, float], optional): coefficients used for computing 20 | running averages of gradient and its square (default: (0.9, 0.999)) 21 | eps (float, optional): term added to the denominator to improve 22 | numerical stability (default: 1e-8) 23 | weight_decay (float, optional): weight decay coefficient (default: 1e-2) 24 | amsgrad (boolean, optional): whether to use the AMSGrad variant of this 25 | algorithm from the paper `On the Convergence of Adam and Beyond`_ 26 | (default: False) 27 | 28 | .. _Adam\: A Method for Stochastic Optimization: 29 | https://arxiv.org/abs/1412.6980 30 | .. _Decoupled Weight Decay Regularization: 31 | https://arxiv.org/abs/1711.05101 32 | .. 
_On the Convergence of Adam and Beyond: 33 | https://openreview.net/forum?id=ryQu7f-RZ 34 | """ 35 | 36 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, 37 | weight_decay=1e-2, amsgrad=False): 38 | if not 0.0 <= lr: 39 | raise ValueError("Invalid learning rate: {}".format(lr)) 40 | if not 0.0 <= eps: 41 | raise ValueError("Invalid epsilon value: {}".format(eps)) 42 | if not 0.0 <= betas[0] < 1.0: 43 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 44 | if not 0.0 <= betas[1] < 1.0: 45 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 46 | if not 0.0 <= weight_decay: 47 | raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) 48 | defaults = dict(lr=lr, betas=betas, eps=eps, 49 | weight_decay=weight_decay, amsgrad=amsgrad) 50 | super(AdamW, self).__init__(params, defaults) 51 | 52 | def __setstate__(self, state): 53 | super(AdamW, self).__setstate__(state) 54 | for group in self.param_groups: 55 | group.setdefault('amsgrad', False) 56 | 57 | @torch.no_grad() 58 | def step(self, closure=None): 59 | """Performs a single optimization step. 60 | 61 | Arguments: 62 | closure (callable, optional): A closure that reevaluates the model 63 | and returns the loss. 64 | """ 65 | loss = None 66 | if closure is not None: 67 | with torch.enable_grad(): 68 | loss = closure() 69 | 70 | for group in self.param_groups: 71 | for p in group['params']: 72 | if p.grad is None: 73 | continue 74 | 75 | # Perform stepweight decay 76 | p.mul_(1 - group['lr'] * group['weight_decay']) 77 | 78 | # Perform optimization step 79 | grad = p.grad 80 | if grad.is_sparse: 81 | raise RuntimeError('AdamW does not support sparse gradients') 82 | amsgrad = group['amsgrad'] 83 | 84 | state = self.state[p] 85 | 86 | # State initialization 87 | if len(state) == 0: 88 | state['step'] = 0 89 | # Exponential moving average of gradient values 90 | state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) 91 | # Exponential moving average of squared gradient values 92 | state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) 93 | if amsgrad: 94 | # Maintains max of all exp. moving avg. of sq. grad. values 95 | state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) 96 | 97 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 98 | if amsgrad: 99 | max_exp_avg_sq = state['max_exp_avg_sq'] 100 | beta1, beta2 = group['betas'] 101 | 102 | state['step'] += 1 103 | bias_correction1 = 1 - beta1 ** state['step'] 104 | bias_correction2 = 1 - beta2 ** state['step'] 105 | 106 | # Decay the first and second moment running average coefficient 107 | exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) 108 | exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) 109 | if amsgrad: 110 | # Maintains the maximum of all 2nd moment running avg. till now 111 | torch.maximum(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 112 | # Use the max. for normalizing running avg. 
of gradient 113 | denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) 114 | else: 115 | denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) 116 | 117 | step_size = group['lr'] / bias_correction1 118 | 119 | p.addcdiv_(exp_avg, denom, value=-step_size) 120 | 121 | return loss -------------------------------------------------------------------------------- /slowfast/models/batchnorm_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """BatchNorm (BN) utility functions and custom batch-size BN implementations""" 5 | 6 | from functools import partial 7 | import torch 8 | import torch.distributed as dist 9 | import torch.nn as nn 10 | from torch.autograd.function import Function 11 | 12 | import slowfast.utils.distributed as du 13 | 14 | 15 | def get_norm(cfg): 16 | """ 17 | Args: 18 | cfg (CfgNode): model building configs, details are in the comments of 19 | the config file. 20 | Returns: 21 | nn.Module: the normalization layer. 22 | """ 23 | if cfg.BN.NORM_TYPE == "batchnorm": 24 | return nn.BatchNorm3d 25 | elif cfg.BN.NORM_TYPE == "sub_batchnorm": 26 | return partial(SubBatchNorm3d, num_splits=cfg.BN.NUM_SPLITS) 27 | elif cfg.BN.NORM_TYPE == "sync_batchnorm": 28 | return partial( 29 | NaiveSyncBatchNorm3d, num_sync_devices=cfg.BN.NUM_SYNC_DEVICES 30 | ) 31 | else: 32 | raise NotImplementedError( 33 | "Norm type {} is not supported".format(cfg.BN.NORM_TYPE) 34 | ) 35 | 36 | 37 | class SubBatchNorm3d(nn.Module): 38 | """ 39 | The standard BN layer computes stats across all examples in a GPU. In some 40 | cases it is desirable to compute stats across only a subset of examples 41 | (e.g., in multigrid training https://arxiv.org/abs/1912.00998). 42 | SubBatchNorm3d splits the batch dimension into N splits, and run BN on 43 | each of them separately (so that the stats are computed on each subset of 44 | examples (1/N of batch) independently. During evaluation, it aggregates 45 | the stats from all splits into one BN. 46 | """ 47 | 48 | def __init__(self, num_splits, **args): 49 | """ 50 | Args: 51 | num_splits (int): number of splits. 52 | args (list): other arguments. 53 | """ 54 | super(SubBatchNorm3d, self).__init__() 55 | self.num_splits = num_splits 56 | num_features = args["num_features"] 57 | # Keep only one set of weight and bias. 58 | if args.get("affine", True): 59 | self.affine = True 60 | args["affine"] = False 61 | self.weight = torch.nn.Parameter(torch.ones(num_features)) 62 | self.bias = torch.nn.Parameter(torch.zeros(num_features)) 63 | else: 64 | self.affine = False 65 | self.bn = nn.BatchNorm3d(**args) 66 | args["num_features"] = num_features * num_splits 67 | self.split_bn = nn.BatchNorm3d(**args) 68 | 69 | def _get_aggregated_mean_std(self, means, stds, n): 70 | """ 71 | Calculate the aggregated mean and stds. 72 | Args: 73 | means (tensor): mean values. 74 | stds (tensor): standard deviations. 75 | n (int): number of sets of means and stds. 76 | """ 77 | mean = means.view(n, -1).sum(0) / n 78 | std = ( 79 | stds.view(n, -1).sum(0) / n 80 | + ((means.view(n, -1) - mean) ** 2).view(n, -1).sum(0) / n 81 | ) 82 | return mean.detach(), std.detach() 83 | 84 | def aggregate_stats(self): 85 | """ 86 | Synchronize running_mean, and running_var. Call this before eval. 
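        Aggregation follows the law of total variance: the aggregated mean is the
        average of the per-split means, and the aggregated running variance is the
        average per-split variance plus the spread of the split means around the
        aggregated mean (see _get_aggregated_mean_std above).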
87 | """ 88 | if self.split_bn.track_running_stats: 89 | ( 90 | self.bn.running_mean.data, 91 | self.bn.running_var.data, 92 | ) = self._get_aggregated_mean_std( 93 | self.split_bn.running_mean, 94 | self.split_bn.running_var, 95 | self.num_splits, 96 | ) 97 | 98 | def forward(self, x): 99 | if self.training: 100 | n, c, t, h, w = x.shape 101 | x = x.view(n // self.num_splits, c * self.num_splits, t, h, w) 102 | x = self.split_bn(x) 103 | x = x.view(n, c, t, h, w) 104 | else: 105 | x = self.bn(x) 106 | if self.affine: 107 | x = x * self.weight.view((-1, 1, 1, 1)) 108 | x = x + self.bias.view((-1, 1, 1, 1)) 109 | return x 110 | 111 | 112 | class GroupGather(Function): 113 | """ 114 | GroupGather performs all gather on each of the local process/ GPU groups. 115 | """ 116 | 117 | @staticmethod 118 | def forward(ctx, input, num_sync_devices, num_groups): 119 | """ 120 | Perform forwarding, gathering the stats across different process/ GPU 121 | group. 122 | """ 123 | ctx.num_sync_devices = num_sync_devices 124 | ctx.num_groups = num_groups 125 | 126 | input_list = [ 127 | torch.zeros_like(input) for k in range(du.get_local_size()) 128 | ] 129 | dist.all_gather( 130 | input_list, input, async_op=False, group=du._LOCAL_PROCESS_GROUP 131 | ) 132 | 133 | inputs = torch.stack(input_list, dim=0) 134 | if num_groups > 1: 135 | rank = du.get_local_rank() 136 | group_idx = rank // num_sync_devices 137 | inputs = inputs[ 138 | group_idx 139 | * num_sync_devices : (group_idx + 1) 140 | * num_sync_devices 141 | ] 142 | inputs = torch.sum(inputs, dim=0) 143 | return inputs 144 | 145 | @staticmethod 146 | def backward(ctx, grad_output): 147 | """ 148 | Perform backwarding, gathering the gradients across different process/ GPU 149 | group. 150 | """ 151 | grad_output_list = [ 152 | torch.zeros_like(grad_output) for k in range(du.get_local_size()) 153 | ] 154 | dist.all_gather( 155 | grad_output_list, 156 | grad_output, 157 | async_op=False, 158 | group=du._LOCAL_PROCESS_GROUP, 159 | ) 160 | 161 | grads = torch.stack(grad_output_list, dim=0) 162 | if ctx.num_groups > 1: 163 | rank = du.get_local_rank() 164 | group_idx = rank // ctx.num_sync_devices 165 | grads = grads[ 166 | group_idx 167 | * ctx.num_sync_devices : (group_idx + 1) 168 | * ctx.num_sync_devices 169 | ] 170 | grads = torch.sum(grads, dim=0) 171 | return grads, None, None 172 | 173 | 174 | class NaiveSyncBatchNorm3d(nn.BatchNorm3d): 175 | def __init__(self, num_sync_devices, **args): 176 | """ 177 | Naive version of Synchronized 3D BatchNorm. 178 | Args: 179 | num_sync_devices (int): number of device to sync. 180 | args (list): other arguments. 
181 | """ 182 | self.num_sync_devices = num_sync_devices 183 | if self.num_sync_devices > 0: 184 | assert du.get_local_size() % self.num_sync_devices == 0, ( 185 | du.get_local_size(), 186 | self.num_sync_devices, 187 | ) 188 | self.num_groups = du.get_local_size() // self.num_sync_devices 189 | else: 190 | self.num_sync_devices = du.get_local_size() 191 | self.num_groups = 1 192 | super(NaiveSyncBatchNorm3d, self).__init__(**args) 193 | 194 | def forward(self, input): 195 | if du.get_local_size() == 1 or not self.training: 196 | return super().forward(input) 197 | 198 | assert input.shape[0] > 0, "SyncBatchNorm does not support empty inputs" 199 | C = input.shape[1] 200 | mean = torch.mean(input, dim=[0, 2, 3, 4]) 201 | meansqr = torch.mean(input * input, dim=[0, 2, 3, 4]) 202 | 203 | vec = torch.cat([mean, meansqr], dim=0) 204 | vec = GroupGather.apply(vec, self.num_sync_devices, self.num_groups) * ( 205 | 1.0 / self.num_sync_devices 206 | ) 207 | 208 | mean, meansqr = torch.split(vec, C) 209 | var = meansqr - mean * mean 210 | self.running_mean += self.momentum * (mean.detach() - self.running_mean) 211 | self.running_var += self.momentum * (var.detach() - self.running_var) 212 | 213 | invstd = torch.rsqrt(var + self.eps) 214 | scale = self.weight * invstd 215 | bias = self.bias - mean * scale 216 | scale = scale.reshape(1, -1, 1, 1, 1) 217 | bias = bias.reshape(1, -1, 1, 1, 1) 218 | return input * scale + bias -------------------------------------------------------------------------------- /slowfast/models/build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Model construction functions.""" 5 | import math 6 | import torch 7 | import slowfast as slowfast 8 | from fvcore.common.registry import Registry 9 | 10 | from . import vit_helper 11 | 12 | MODEL_REGISTRY = Registry("MODEL") 13 | MODEL_REGISTRY.__doc__ = """ 14 | Registry for video model. 15 | 16 | The registered object will be called with `obj(cfg)`. 17 | The call should return a `torch.nn.Module` object. 18 | """ 19 | 20 | 21 | def build_model(cfg, gpu_id=None): 22 | """ 23 | Builds the video model. 24 | Args: 25 | cfg (configs): configs that contains the hyper-parameters to build the 26 | backbone. Details can be seen in slowfast/config/defaults.py. 27 | gpu_id (Optional[int]): specify the gpu index to build model. 28 | """ 29 | if torch.cuda.is_available(): 30 | assert ( 31 | cfg.NUM_GPUS <= torch.cuda.device_count() 32 | ), "Cannot use more GPU devices than available" 33 | else: 34 | assert ( 35 | cfg.NUM_GPUS == 0 36 | ), "Cuda is not available. Please set `NUM_GPUS: 0 for running on CPUs." 
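# Models are looked up by name in MODEL_REGISTRY below. As an illustrative
# sketch (assumed, not shown in this file), a model registers itself with:
#
#   @MODEL_REGISTRY.register()
#   class VisionTransformer(nn.Module):
#       def __init__(self, cfg):
#           super().__init__()
#           ...
#
# so that MODEL_REGISTRY.get(cfg.MODEL.MODEL_NAME)(cfg) can construct it by name.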
37 | 38 | # Construct the model 39 | name = cfg.MODEL.MODEL_NAME 40 | model = MODEL_REGISTRY.get(name)(cfg) 41 | 42 | if isinstance(model, slowfast.models.video_model_builder.VisionTransformer): 43 | if cfg.VIT.IM_PRETRAINED: 44 | vit_helper.load_pretrained( 45 | model, cfg=cfg, num_classes=cfg.MODEL.NUM_CLASSES, 46 | in_chans=cfg.VIT.CHANNELS, filter_fn=vit_helper._conv_filter, 47 | strict=False 48 | ) 49 | if hasattr(model, 'st_embed'): 50 | model.st_embed.data[:, 1:, :] = model.pos_embed.data[:, 1:, :].repeat( 51 | 1, cfg.VIT.TEMPORAL_RESOLUTION, 1) 52 | model.st_embed.data[:, 0, :] = model.pos_embed.data[:, 0, :] 53 | if hasattr(model, 'patch_embed_3d'): 54 | model.patch_embed_3d.proj.weight.data = torch.zeros_like( 55 | model.patch_embed_3d.proj.weight.data) 56 | n = math.floor(model.patch_embed_3d.proj.weight.shape[2] / 2) 57 | model.patch_embed_3d.proj.weight.data[:, :, n, :, :] = model.patch_embed.proj.weight.data 58 | model.patch_embed_3d.proj.bias.data = model.patch_embed.proj.bias.data 59 | 60 | if cfg.NUM_GPUS: 61 | if gpu_id is None: 62 | # Determine the GPU used by the current process 63 | cur_device = torch.cuda.current_device() 64 | else: 65 | cur_device = gpu_id 66 | # Transfer the model to the current GPU device 67 | model = model.cuda(device=cur_device) 68 | # Use multi-process data parallel model in the multi-gpu setting 69 | if cfg.NUM_GPUS > 1: 70 | # Make model replica operate on the current device 71 | model = torch.nn.parallel.DistributedDataParallel( 72 | module=model, device_ids=[cur_device], output_device=cur_device, 73 | find_unused_parameters=True 74 | ) 75 | return model 76 | -------------------------------------------------------------------------------- /slowfast/models/losses.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Loss functions.""" 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy 9 | import slowfast.utils.metrics as metrics 10 | 11 | 12 | _LOSSES = { 13 | "cross_entropy": nn.CrossEntropyLoss, 14 | "bce": nn.BCELoss, 15 | "bce_logit": nn.BCEWithLogitsLoss, 16 | "label_smoothing_cross_entropy": LabelSmoothingCrossEntropy, 17 | "mse_loss": nn.MSELoss, 18 | "soft_target_cross_entropy": SoftTargetCrossEntropy 19 | } 20 | 21 | 22 | def get_loss_func(loss_name): 23 | """ 24 | Retrieve the loss given the loss name. 25 | Args (int): 26 | loss_name: the name of the loss to use. 27 | """ 28 | if loss_name not in _LOSSES.keys(): 29 | raise NotImplementedError("Loss {} is not supported".format(loss_name)) 30 | return _LOSSES[loss_name] 31 | -------------------------------------------------------------------------------- /slowfast/models/nystrom_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from einops import rearrange, repeat 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as Fn 8 | import math 9 | 10 | 11 | def iterative_inv(mat, n_iter = 6, init_option="exact"): 12 | I = torch.eye(mat.size(-2), device = mat.device) 13 | K = mat 14 | 15 | if init_option == "original": 16 | # This original implementation is more conservative to compute coefficient of Z_0. 17 | V = 1. 
/ torch.max(torch.sum(K, dim = -2)) * K.transpose(-1, -2) 18 | elif init_option == "arbitrary_input": 19 | # sum = 1 for softmax input but not for exp 20 | a1 = torch.max(torch.sum(torch.abs(K), dim = -2, keepdim=True), dim=-1, keepdim=True).values 21 | a2 = torch.max(torch.sum(torch.abs(K), dim = -1, keepdim=True), dim=-2, keepdim=True).values 22 | V = 1. / (a1 * a2) * K.transpose(-1, -2) 23 | else: # The entries of K are positive and ||K||_{\infty} = 1 due to softmax 24 | # This is the exact coefficient computation, 25 | # 1 / ||K||_1, of initialization of Z_0, leading to faster convergence. 26 | V = 1. / torch.max( 27 | torch.sum(K, dim = -2), dim = -1).values.unsqueeze(-1).unsqueeze(-1) * K.transpose(-1, -2) 28 | 29 | for _ in range(n_iter): 30 | KV = torch.matmul(K, V) 31 | V = torch.matmul(0.25 * V, 13 * I - torch.matmul(KV, 15 * I - torch.matmul(KV, 7 * I - KV))) 32 | return V 33 | 34 | 35 | def nystrom_spatial_attn( 36 | q, k, v, landmarks=64, num_frames=None, inv_iters=6, 37 | use_full_matrix=False, use_spatial_landmarks=False, return_attn=False 38 | ): 39 | 40 | """ 41 | Compute full space-time attention but only softmax over spatial dimension 42 | """ 43 | B, N, D = k.shape 44 | F = num_frames 45 | scale = D ** -0.5 46 | q = q * scale 47 | if use_full_matrix: 48 | queries_landmarks = q.clone() 49 | keys_landmarks = k.clone() 50 | else: 51 | segs = N // landmarks 52 | with torch.no_grad(): 53 | if use_spatial_landmarks: 54 | # transpose spatial and temporal dimensions 55 | q2 = rearrange(q, 'b (f p) d -> b (p f) d', f=F) 56 | k2 = rearrange(k, 'b (f p) d -> b (p f) d', f=F) 57 | if (N % landmarks == 0): 58 | keys_landmarks = k2.reshape(B, landmarks, N // landmarks, D).mean(dim = -2) 59 | queries_landmarks = q2.reshape(B, landmarks, N // landmarks, D).mean(dim = -2) 60 | else: 61 | num_k = (segs + 1) * landmarks - N 62 | keys_landmarks_f = k2[:, :num_k * segs, :].reshape( 63 | B, num_k, segs, D).mean(dim = -2) 64 | keys_landmarks_l = k2[:, num_k * segs:, :].reshape( 65 | B, landmarks - num_k, segs + 1, D).mean(dim = -2) 66 | keys_landmarks = torch.cat((keys_landmarks_f, keys_landmarks_l), dim = -2) 67 | 68 | queries_landmarks_f = q2[:, :num_k * segs, :].reshape( 69 | B, num_k, segs, D).mean(dim = -2) 70 | queries_landmarks_l = q2[:, num_k * segs:, :].reshape( 71 | B, landmarks - num_k, segs + 1, D).mean(dim = -2) 72 | queries_landmarks = torch.cat((queries_landmarks_f, queries_landmarks_l), dim = -2) 73 | else: 74 | if (N % landmarks == 0): 75 | keys_landmarks = k.reshape( 76 | B, landmarks, N // landmarks, D).mean(dim = -2) 77 | queries_landmarks = q.reshape( 78 | B, landmarks, N // landmarks, D).mean(dim = -2) 79 | else: 80 | num_k = (segs + 1) * landmarks - N 81 | keys_landmarks_f = k[:, :num_k * segs, :].reshape( 82 | B, num_k, segs, D).mean(dim = -2) 83 | keys_landmarks_l = k[:, num_k * segs:, :].reshape( 84 | B, landmarks - num_k, segs + 1, D).mean(dim = -2) 85 | keys_landmarks = torch.cat((keys_landmarks_f, keys_landmarks_l), dim = -2) 86 | 87 | queries_landmarks_f = q[:, :num_k * segs, :].reshape( 88 | B, num_k, segs, D).mean(dim = -2) 89 | queries_landmarks_l = q[:, num_k * segs:, :].reshape( 90 | B, landmarks - num_k, segs + 1, D).mean(dim = -2) 91 | queries_landmarks = torch.cat((queries_landmarks_f, queries_landmarks_l), dim = -2) 92 | 93 | kernel_1 = Fn.softmax( 94 | torch.matmul(q, keys_landmarks.transpose(-1, -2)), dim = -1) 95 | kernel_2 = Fn.softmax( 96 | torch.matmul(queries_landmarks, keys_landmarks.transpose(-1, -2)), dim = -1) 97 | kernel_3 = Fn.softmax( 98 | 
rearrange(torch.matmul( 99 | queries_landmarks, k.transpose(-1, -2)), 'b l (f p) -> b l f p', f=F), dim = -1) 100 | attn = torch.matmul(kernel_1, iterative_inv(kernel_2, n_iter=inv_iters)) 101 | 102 | v = rearrange(v, 'b (f p) d -> b f p d', f=F) 103 | x = torch.einsum( 104 | 'b n l, b l f d -> b n f d', 105 | attn, torch.einsum('b l f p, b f p d -> b l f d', kernel_3, v) 106 | ) 107 | 108 | if return_attn: 109 | attn = torch.einsum('b m l, b l f p -> b m f p', attn, kernel_3) 110 | return x, attn 111 | 112 | return x -------------------------------------------------------------------------------- /slowfast/models/optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Optimizer.""" 5 | 6 | import torch 7 | from .adamw import AdamW 8 | 9 | import slowfast.utils.lr_policy as lr_policy 10 | 11 | 12 | def construct_optimizer(model, cfg): 13 | """ 14 | Construct a stochastic gradient descent or ADAM optimizer with momentum. 15 | Details can be found in: 16 | Herbert Robbins, and Sutton Monro. "A stochastic approximation method." 17 | and 18 | Diederik P.Kingma, and Jimmy Ba. 19 | "Adam: A Method for Stochastic Optimization." 20 | 21 | Args: 22 | model (model): model to perform stochastic gradient descent 23 | optimization or ADAM optimization. 24 | cfg (config): configs of hyper-parameters of SGD or ADAM, includes base 25 | learning rate, momentum, weight_decay, dampening, and etc. 26 | """ 27 | # Batchnorm parameters. 28 | bn_params = [] 29 | # Non-batchnorm parameters. 30 | non_bn_parameters = [] 31 | for m in model.modules(): 32 | is_bn = isinstance(m, torch.nn.modules.batchnorm._NormBase) 33 | for p in m.parameters(recurse=False): 34 | if is_bn: 35 | bn_params.append(p) 36 | else: 37 | non_bn_parameters.append(p) 38 | 39 | # Apply different weight decay to Batchnorm and non-batchnorm parameters. 40 | # In Caffe2 classification codebase the weight decay for batchnorm is 0.0. 41 | # Having a different weight decay on batchnorm might cause a performance 42 | # drop. 43 | optim_params = [ 44 | {"params": bn_params, "weight_decay": cfg.BN.WEIGHT_DECAY}, 45 | {"params": non_bn_parameters, "weight_decay": cfg.SOLVER.WEIGHT_DECAY}, 46 | ] 47 | # Check all parameters will be passed into optimizer. 
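# Illustrative check (hypothetical counts): for a model with 100 parameter
# tensors of which 20 belong to normalization layers, the assert below requires
# len(non_bn_parameters) + len(bn_params) == 80 + 20 == 100, i.e. every
# parameter lands in exactly one of the two weight-decay groups.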
48 | assert len(list(model.parameters())) == len(non_bn_parameters) + len( 49 | bn_params 50 | ), "parameter size does not match: {} + {} != {}".format( 51 | len(non_bn_parameters), len(bn_params), len(list(model.parameters())) 52 | ) 53 | 54 | if cfg.SOLVER.OPTIMIZING_METHOD == "sgd": 55 | return torch.optim.SGD( 56 | optim_params, 57 | lr=cfg.SOLVER.BASE_LR, 58 | momentum=cfg.SOLVER.MOMENTUM, 59 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 60 | dampening=cfg.SOLVER.DAMPENING, 61 | nesterov=cfg.SOLVER.NESTEROV, 62 | ) 63 | elif cfg.SOLVER.OPTIMIZING_METHOD == "adam": 64 | return torch.optim.Adam( 65 | optim_params, 66 | lr=cfg.SOLVER.BASE_LR, 67 | betas=(0.9, 0.999), 68 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 69 | ) 70 | elif cfg.SOLVER.OPTIMIZING_METHOD == "adamw": 71 | return AdamW( 72 | optim_params, 73 | lr=cfg.SOLVER.BASE_LR, 74 | betas=(0.9, 0.999), 75 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 76 | amsgrad=False 77 | ) 78 | else: 79 | raise NotImplementedError( 80 | "Does not support {} optimizer".format(cfg.SOLVER.OPTIMIZING_METHOD) 81 | ) 82 | 83 | 84 | def get_epoch_lr(cur_epoch, cfg): 85 | """ 86 | Retrieves the lr for the given epoch (as specified by the lr policy). 87 | Args: 88 | cfg (config): configs of hyper-parameters of ADAM, includes base 89 | learning rate, betas, and weight decays. 90 | cur_epoch (float): the number of epoch of the current training stage. 91 | """ 92 | return lr_policy.get_lr_at_epoch(cfg, cur_epoch) 93 | 94 | 95 | def set_lr(optimizer, new_lr): 96 | """ 97 | Sets the optimizer lr to the specified value. 98 | Args: 99 | optimizer (optim): the optimizer using to optimize the current network. 100 | new_lr (float): the new learning rate to set. 101 | """ 102 | for param_group in optimizer.param_groups: 103 | param_group["lr"] = new_lr 104 | -------------------------------------------------------------------------------- /slowfast/models/orthoformer_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from einops import rearrange, repeat 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as Fn 8 | import math 9 | 10 | 11 | def orthogonal_landmarks(q, k, num_landmarks=64, subsample_fraction=1.0): 12 | """ 13 | Construct set of landmarks by recursively selecting new landmarks 14 | that are maximally orthogonal to the existing set. 15 | Returns near orthogonal landmarks with shape (B, M, D). 
16 | """ 17 | if subsample_fraction < 1.0: 18 | # Need at least M/2 samples of queries and keys 19 | num_samples = max(int(subsample_fraction * q.size(-2)), num_landmarks) 20 | q_unnormalised = q[:, torch.randint(q.size(-2), (num_samples,), device=q.device), :] # (B, N, D) 21 | else: 22 | # (B, N, D) 23 | q_unnormalised = q 24 | 25 | # may need to change default eps to eps=1e-8 for mixed precision compatibility 26 | qk = Fn.normalize(q_unnormalised, p=2, dim=-1) 27 | B, N, D = qk.shape 28 | 29 | selected_mask = torch.zeros((B, N, 1), device=qk.device) 30 | landmark_mask = torch.ones((B, 1, 1), dtype=selected_mask.dtype, device=qk.device) 31 | 32 | # Get initial random landmark 33 | random_idx = torch.randint(qk.size(-2), (B, 1, 1), device=qk.device) 34 | selected_landmark = qk[torch.arange(qk.size(0)), random_idx.view(-1), :].view(B, D) 35 | selected_mask.scatter_(-2, random_idx, landmark_mask) 36 | 37 | # Selected landmarks 38 | selected_landmarks = torch.empty((B, num_landmarks, D), device=qk.device, dtype=qk.dtype) 39 | selected_landmarks[:, 0, :] = selected_landmark 40 | 41 | # Store computed cosine similarities 42 | cos_sims = torch.empty((B, N, num_landmarks), device=qk.device, dtype=qk.dtype) 43 | 44 | for M in range(1, num_landmarks): 45 | # Calculate absolute cosine similarity between selected and unselected landmarks 46 | # (B, N, D) * (B, D) -> (B, N) 47 | cos_sim = torch.einsum('b n d, b d -> b n', qk, selected_landmark).abs() 48 | cos_sims[:, :, M - 1] = cos_sim 49 | # (B, N, M) cosine similarities of current set of landmarks wrt all queries and keys 50 | cos_sim_set = cos_sims[:, :, :M] 51 | 52 | # Get orthogonal landmark: landmark with smallest absolute cosine similarity: 53 | # set cosine similarity for already selected landmarks to > 1 54 | cos_sim_set.view(-1, M)[selected_mask.flatten().bool(), :] = 10 55 | # (B,) - want max for non 56 | selected_landmark_idx = cos_sim_set.amax(-1).argmin(-1) 57 | selected_landmark = qk[torch.arange(qk.size(0)), selected_landmark_idx, :].view(B, D) 58 | 59 | # Add most orthogonal landmark to selected landmarks: 60 | selected_landmarks[:, M, :] = selected_landmark 61 | 62 | # Removed selected indices from non-selected mask: 63 | selected_mask.scatter_(-2, selected_landmark_idx.unsqueeze(-1).unsqueeze(-1), landmark_mask) 64 | landmarks = torch.masked_select( 65 | q_unnormalised, selected_mask.bool()).reshape(B, -1, D) # (B, M, D) 66 | return landmarks # (B, M, D) 67 | 68 | 69 | def orthoformer( 70 | q, k, v, num_landmarks=64, subsample_fraction=1.0, 71 | num_frames=None, shared_landmarks=True, return_attn=False 72 | ): 73 | """ 74 | Computes spatial attention for all pairs of frames. 75 | The attention matrix is approximated using 76 | intermediate landmarks taken from the queries and keys. 77 | The landmarks can be unique (to each frame) or 78 | shared (a common set of landmarks across frames). 
79 | """ 80 | B, N, D = k.shape 81 | F = num_frames 82 | L = num_landmarks 83 | P = N // F 84 | 85 | scale = D ** -0.25 86 | q = q * scale 87 | k = k * scale 88 | 89 | if shared_landmarks: 90 | with torch.no_grad(): 91 | landmarks = orthogonal_landmarks(q, k, num_landmarks, subsample_fraction) 92 | kernel_1 = Fn.softmax(torch.matmul(q, landmarks.transpose(-1, -2)), dim=-1) 93 | kernel_2 = Fn.softmax( 94 | rearrange(torch.matmul( 95 | landmarks, k.transpose(-1, -2)), 'b l (f p) -> b l f p', f=F), dim=-1) 96 | v = rearrange(v, 'b (f p) d -> b f p d', f=F) 97 | x = torch.einsum('b l f p, b f p d -> b l f d', kernel_2, v) 98 | x = torch.einsum('b n l, b l f d -> b n f d', kernel_1, x) 99 | if return_attn: 100 | attn = torch.einsum('b m l, b l f p -> b m f p', kernel_1, kernel_2) 101 | return x, attn 102 | else: 103 | q = rearrange(q, 'b (f p) d -> (b f) p d', f=F) 104 | k = rearrange(k, 'b (g q) d -> (b g) q d', g=F) 105 | with torch.no_grad(): 106 | landmarks = orthogonal_landmarks(q, k, num_landmarks, subsample_fraction) 107 | landmarks = rearrange(landmarks, '(b f) l d -> b f l d', f=F) 108 | q = rearrange(q, '(b f) p d -> b f 1 p d', f=F) 109 | k = rearrange(k, '(b g) q d -> b 1 g q d', g=F) 110 | v = rearrange(v, 'b (g q) d -> b 1 g q d', g=F) 111 | kernel_1 = Fn.softmax( 112 | torch.matmul(q, landmarks.unsqueeze(-4).transpose(-1, -2)), dim=-1) 113 | kernel_2 = Fn.softmax( 114 | torch.matmul(landmarks.unsqueeze(-3), k.transpose(-1, -2)), dim=-1) 115 | x = torch.matmul(kernel_1, torch.matmul(kernel_2, v)) 116 | x = rearrange(x, 'b f g p d -> b (f p) g d') 117 | if return_attn: 118 | attn = torch.matmul(kernel_1, kernel_2) 119 | attn = rearrange(attn, 'b f g p q -> b (f p) g q') 120 | return x, attn 121 | 122 | return x -------------------------------------------------------------------------------- /slowfast/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /slowfast/utils/benchmark.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Functions for benchmarks. 4 | """ 5 | 6 | import numpy as np 7 | import pprint 8 | import torch 9 | import tqdm 10 | from fvcore.common.timer import Timer 11 | 12 | import slowfast.utils.logging as logging 13 | import slowfast.utils.misc as misc 14 | from slowfast.datasets import loader 15 | from slowfast.utils.env import setup_environment 16 | 17 | logger = logging.get_logger(__name__) 18 | 19 | 20 | def benchmark_data_loading(cfg): 21 | """ 22 | Benchmark the speed of data loading in PySlowFast. 23 | Args: 24 | 25 | cfg (CfgNode): configs. Details can be found in 26 | slowfast/config/defaults.py 27 | """ 28 | # Set up environment. 29 | setup_environment() 30 | # Set random seed from configs. 31 | np.random.seed(cfg.RNG_SEED) 32 | torch.manual_seed(cfg.RNG_SEED) 33 | 34 | # Setup logging format. 35 | logging.setup_logging(cfg.OUTPUT_DIR) 36 | 37 | # Print config. 38 | logger.info("Benchmark data loading with config:") 39 | logger.info(pprint.pformat(cfg)) 40 | 41 | timer = Timer() 42 | dataloader = loader.construct_loader(cfg, "train") 43 | logger.info( 44 | "Initialize loader using {:.2f} seconds.".format(timer.seconds()) 45 | ) 46 | # Total batch size across different machines. 
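# Example with assumed values: TRAIN.BATCH_SIZE = 64 per machine and
# NUM_SHARDS = 4 machines give batch_size = 256 videos per iteration, which is
# the "videos" figure reported in the log messages below.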
47 | batch_size = cfg.TRAIN.BATCH_SIZE * cfg.NUM_SHARDS
48 | log_period = cfg.BENCHMARK.LOG_PERIOD
49 | epoch_times = []
50 | # Test for a few epochs.
51 | for cur_epoch in range(cfg.BENCHMARK.NUM_EPOCHS):
52 | timer = Timer()
53 | timer_epoch = Timer()
54 | iter_times = []
55 | if cfg.BENCHMARK.SHUFFLE:
56 | loader.shuffle_dataset(dataloader, cur_epoch)
57 | for cur_iter, _ in enumerate(tqdm.tqdm(dataloader)):
58 | if cur_iter > 0 and cur_iter % log_period == 0:
59 | iter_times.append(timer.seconds())
60 | ram_usage, ram_total = misc.cpu_mem_usage()
61 | logger.info(
62 | "Epoch {}: {} iters ({} videos) in {:.2f} seconds. "
63 | "RAM Usage: {:.2f}/{:.2f} GB.".format(
64 | cur_epoch,
65 | log_period,
66 | log_period * batch_size,
67 | iter_times[-1],
68 | ram_usage,
69 | ram_total,
70 | )
71 | )
72 | timer.reset()
73 | epoch_times.append(timer_epoch.seconds())
74 | ram_usage, ram_total = misc.cpu_mem_usage()
75 | logger.info(
76 | "Epoch {}: in total {} iters ({} videos) in {:.2f} seconds. "
77 | "RAM Usage: {:.2f}/{:.2f} GB.".format(
78 | cur_epoch,
79 | len(dataloader),
80 | len(dataloader) * batch_size,
81 | epoch_times[-1],
82 | ram_usage,
83 | ram_total,
84 | )
85 | )
86 | logger.info(
87 | "Epoch {}: on average every {} iters ({} videos) take {:.2f}/{:.2f} "
88 | "(avg/std) seconds.".format(
89 | cur_epoch,
90 | log_period,
91 | log_period * batch_size,
92 | np.mean(iter_times),
93 | np.std(iter_times),
94 | )
95 | )
96 | logger.info(
97 | "On average every epoch ({} videos) takes {:.2f}/{:.2f} "
98 | "(avg/std) seconds.".format(
99 | len(dataloader) * batch_size,
100 | np.mean(epoch_times),
101 | np.std(epoch_times),
102 | )
103 | )
104 | -------------------------------------------------------------------------------- /slowfast/utils/bn_helper.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 |
4 | """bn helper."""
5 |
6 | import itertools
7 | import torch
8 |
9 |
10 | @torch.no_grad()
11 | def compute_and_update_bn_stats(model, data_loader, num_batches=200):
12 | """
13 | Compute and update the batch norm stats to make them more precise. During
14 | training both the bn stats and the weights change after every iteration,
15 | so the bn stats cannot precisely reflect the latest state of the model.
16 | Here the bn stats are recomputed without changing the weights, to make the
17 | running mean and running var more precise.
18 | Args:
19 | model (model): the model used to compute and update the bn stats.
20 | data_loader (dataloader): dataloader used to provide inputs.
21 | num_batches (int): number of iterations used to compute the stats.
22 | """
23 |
24 | # Prepares all the bn layers.
25 | bn_layers = [
26 | m
27 | for m in model.modules()
28 | if any(
29 | (
30 | isinstance(m, bn_type)
31 | for bn_type in (
32 | torch.nn.BatchNorm1d,
33 | torch.nn.BatchNorm2d,
34 | torch.nn.BatchNorm3d,
35 | )
36 | )
37 | )
38 | ]
39 |
40 | # In order to make the running stats only reflect the current batch, the
41 | # momentum is disabled.
42 | # bn.running_mean = (1 - momentum) * bn.running_mean + momentum * batch_mean
43 | # Setting the momentum to 1.0 to compute the stats without momentum.
44 | momentum_actual = [bn.momentum for bn in bn_layers]
45 | for bn in bn_layers:
46 | bn.momentum = 1.0
47 |
48 | # Calculates the running iterations for precise stats computation.
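# The accumulators below keep a cumulative average over the batches seen so
# far, updated incrementally as m <- m + (x - m) / (ind + 1); after the loop,
# running_mean[i] holds the average per-batch mean and running_square_mean[i]
# the average per-batch E(x^2) for the i-th bn layer.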
49 | running_mean = [torch.zeros_like(bn.running_mean) for bn in bn_layers] 50 | running_square_mean = [torch.zeros_like(bn.running_var) for bn in bn_layers] 51 | 52 | for ind, (inputs, _, _) in enumerate( 53 | itertools.islice(data_loader, num_batches) 54 | ): 55 | # Forwards the model to update the bn stats. 56 | if isinstance(inputs, (list,)): 57 | for i in range(len(inputs)): 58 | inputs[i] = inputs[i].float().cuda(non_blocking=True) 59 | else: 60 | inputs = inputs.cuda(non_blocking=True) 61 | model(inputs) 62 | 63 | for i, bn in enumerate(bn_layers): 64 | # Accumulates the bn stats. 65 | running_mean[i] += (bn.running_mean - running_mean[i]) / (ind + 1) 66 | # $E(x^2) = Var(x) + E(x)^2$. 67 | cur_square_mean = bn.running_var + bn.running_mean ** 2 68 | running_square_mean[i] += ( 69 | cur_square_mean - running_square_mean[i] 70 | ) / (ind + 1) 71 | 72 | for i, bn in enumerate(bn_layers): 73 | bn.running_mean = running_mean[i] 74 | # Var(x) = $E(x^2) - E(x)^2$. 75 | bn.running_var = running_square_mean[i] - bn.running_mean ** 2 76 | # Sets the precise bn stats. 77 | bn.momentum = momentum_actual[i] 78 | -------------------------------------------------------------------------------- /slowfast/utils/c2_model_loading.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Caffe2 to PyTorch checkpoint name converting utility.""" 5 | 6 | import re 7 | 8 | 9 | def get_name_convert_func(): 10 | """ 11 | Get the function to convert Caffe2 layer names to PyTorch layer names. 12 | Returns: 13 | (func): function to convert parameter name from Caffe2 format to PyTorch 14 | format. 15 | """ 16 | pairs = [ 17 | # ------------------------------------------------------------ 18 | # 'nonlocal_conv3_1_theta_w' -> 's3.pathway0_nonlocal3.conv_g.weight' 19 | [ 20 | r"^nonlocal_conv([0-9]+)_([0-9]+)_(.*)", 21 | r"s\1.pathway0_nonlocal\2_\3", 22 | ], 23 | # 'theta' -> 'conv_theta' 24 | [r"^(.*)_nonlocal([0-9]+)_(theta)(.*)", r"\1_nonlocal\2.conv_\3\4"], 25 | # 'g' -> 'conv_g' 26 | [r"^(.*)_nonlocal([0-9]+)_(g)(.*)", r"\1_nonlocal\2.conv_\3\4"], 27 | # 'phi' -> 'conv_phi' 28 | [r"^(.*)_nonlocal([0-9]+)_(phi)(.*)", r"\1_nonlocal\2.conv_\3\4"], 29 | # 'out' -> 'conv_out' 30 | [r"^(.*)_nonlocal([0-9]+)_(out)(.*)", r"\1_nonlocal\2.conv_\3\4"], 31 | # 'nonlocal_conv4_5_bn_s' -> 's4.pathway0_nonlocal3.bn.weight' 32 | [r"^(.*)_nonlocal([0-9]+)_(bn)_(.*)", r"\1_nonlocal\2.\3.\4"], 33 | # ------------------------------------------------------------ 34 | # 't_pool1_subsample_bn' -> 's1_fuse.conv_f2s.bn.running_mean' 35 | [r"^t_pool1_subsample_bn_(.*)", r"s1_fuse.bn.\1"], 36 | # 't_pool1_subsample' -> 's1_fuse.conv_f2s' 37 | [r"^t_pool1_subsample_(.*)", r"s1_fuse.conv_f2s.\1"], 38 | # 't_res4_5_branch2c_bn_subsample_bn_rm' -> 's4_fuse.conv_f2s.bias' 39 | [ 40 | r"^t_res([0-9]+)_([0-9]+)_branch2c_bn_subsample_bn_(.*)", 41 | r"s\1_fuse.bn.\3", 42 | ], 43 | # 't_pool1_subsample' -> 's1_fuse.conv_f2s' 44 | [ 45 | r"^t_res([0-9]+)_([0-9]+)_branch2c_bn_subsample_(.*)", 46 | r"s\1_fuse.conv_f2s.\3", 47 | ], 48 | # ------------------------------------------------------------ 49 | # 'res4_4_branch_2c_bn_b' -> 's4.pathway0_res4.branch2.c_bn_b' 50 | [ 51 | r"^res([0-9]+)_([0-9]+)_branch([0-9]+)([a-z])_(.*)", 52 | r"s\1.pathway0_res\2.branch\3.\4_\5", 53 | ], 54 | # 'res_conv1_bn_' -> 's1.pathway0_stem.bn.' 
55 | [r"^res_conv1_bn_(.*)", r"s1.pathway0_stem.bn.\1"], 56 | # 'conv1_xy_w_momentum' -> 's1.pathway0_stem.conv_xy.' 57 | [r"^conv1_xy(.*)", r"s1.pathway0_stem.conv_xy\1"], 58 | # 'conv1_w_momentum' -> 's1.pathway0_stem.conv.' 59 | [r"^conv1_(.*)", r"s1.pathway0_stem.conv.\1"], 60 | # 'res4_0_branch1_w' -> 'S4.pathway0_res0.branch1.weight' 61 | [ 62 | r"^res([0-9]+)_([0-9]+)_branch([0-9]+)_(.*)", 63 | r"s\1.pathway0_res\2.branch\3_\4", 64 | ], 65 | # 'res_conv1_' -> 's1.pathway0_stem.conv.' 66 | [r"^res_conv1_(.*)", r"s1.pathway0_stem.conv.\1"], 67 | # ------------------------------------------------------------ 68 | # 'res4_4_branch_2c_bn_b' -> 's4.pathway0_res4.branch2.c_bn_b' 69 | [ 70 | r"^t_res([0-9]+)_([0-9]+)_branch([0-9]+)([a-z])_(.*)", 71 | r"s\1.pathway1_res\2.branch\3.\4_\5", 72 | ], 73 | # 'res_conv1_bn_' -> 's1.pathway0_stem.bn.' 74 | [r"^t_res_conv1_bn_(.*)", r"s1.pathway1_stem.bn.\1"], 75 | # 'conv1_w_momentum' -> 's1.pathway0_stem.conv.' 76 | [r"^t_conv1_(.*)", r"s1.pathway1_stem.conv.\1"], 77 | # 'res4_0_branch1_w' -> 'S4.pathway0_res0.branch1.weight' 78 | [ 79 | r"^t_res([0-9]+)_([0-9]+)_branch([0-9]+)_(.*)", 80 | r"s\1.pathway1_res\2.branch\3_\4", 81 | ], 82 | # 'res_conv1_' -> 's1.pathway0_stem.conv.' 83 | [r"^t_res_conv1_(.*)", r"s1.pathway1_stem.conv.\1"], 84 | # ------------------------------------------------------------ 85 | # pred_ -> head.projection. 86 | [r"pred_(.*)", r"head.projection.\1"], 87 | # '.b_bn_fc' -> '.se.fc' 88 | [r"(.*)b_bn_fc(.*)", r"\1se.fc\2"], 89 | # conv_5 -> head.conv_5. 90 | [r"conv_5(.*)", r"head.conv_5\1"], 91 | # conv_5 -> head.conv_5. 92 | [r"lin_5(.*)", r"head.lin_5\1"], 93 | # '.bn_b' -> '.weight' 94 | [r"(.*)bn.b\Z", r"\1bn.bias"], 95 | # '.bn_s' -> '.weight' 96 | [r"(.*)bn.s\Z", r"\1bn.weight"], 97 | # '_bn_rm' -> '.running_mean' 98 | [r"(.*)bn.rm\Z", r"\1bn.running_mean"], 99 | # '_bn_riv' -> '.running_var' 100 | [r"(.*)bn.riv\Z", r"\1bn.running_var"], 101 | # '_b' -> '.bias' 102 | [r"(.*)[\._]b\Z", r"\1.bias"], 103 | # '_w' -> '.weight' 104 | [r"(.*)[\._]w\Z", r"\1.weight"], 105 | ] 106 | 107 | def convert_caffe2_name_to_pytorch(caffe2_layer_name): 108 | """ 109 | Convert the caffe2_layer_name to pytorch format by apply the list of 110 | regular expressions. 111 | Args: 112 | caffe2_layer_name (str): caffe2 layer name. 113 | Returns: 114 | (str): pytorch layer name. 115 | """ 116 | for source, dest in pairs: 117 | caffe2_layer_name = re.sub(source, dest, caffe2_layer_name) 118 | return caffe2_layer_name 119 | 120 | return convert_caffe2_name_to_pytorch 121 | -------------------------------------------------------------------------------- /slowfast/utils/distributed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Distributed helpers.""" 5 | 6 | import functools 7 | import logging 8 | import pickle 9 | import torch 10 | import torch.distributed as dist 11 | 12 | _LOCAL_PROCESS_GROUP = None 13 | 14 | 15 | def all_gather(tensors): 16 | """ 17 | All gathers the provided tensors from all processes across machines. 18 | Args: 19 | tensors (list): tensors to perform all gather across all processes in 20 | all machines. 
21 | """ 22 | 23 | gather_list = [] 24 | output_tensor = [] 25 | world_size = dist.get_world_size() 26 | for tensor in tensors: 27 | tensor_placeholder = [ 28 | torch.ones_like(tensor) for _ in range(world_size) 29 | ] 30 | dist.all_gather(tensor_placeholder, tensor, async_op=False) 31 | gather_list.append(tensor_placeholder) 32 | for gathered_tensor in gather_list: 33 | output_tensor.append(torch.cat(gathered_tensor, dim=0)) 34 | return output_tensor 35 | 36 | 37 | def all_reduce(tensors, average=True): 38 | """ 39 | All reduce the provided tensors from all processes across machines. 40 | Args: 41 | tensors (list): tensors to perform all reduce across all processes in 42 | all machines. 43 | average (bool): scales the reduced tensor by the number of overall 44 | processes across all machines. 45 | """ 46 | 47 | for tensor in tensors: 48 | dist.all_reduce(tensor, async_op=False) 49 | if average: 50 | world_size = dist.get_world_size() 51 | for tensor in tensors: 52 | tensor.mul_(1.0 / world_size) 53 | return tensors 54 | 55 | 56 | def init_process_group( 57 | local_rank, 58 | local_world_size, 59 | shard_id, 60 | num_shards, 61 | init_method, 62 | dist_backend="nccl", 63 | ): 64 | """ 65 | Initializes the default process group. 66 | Args: 67 | local_rank (int): the rank on the current local machine. 68 | local_world_size (int): the world size (number of processes running) on 69 | the current local machine. 70 | shard_id (int): the shard index (machine rank) of the current machine. 71 | num_shards (int): number of shards for distributed training. 72 | init_method (string): supporting three different methods for 73 | initializing process groups: 74 | "file": use shared file system to initialize the groups across 75 | different processes. 76 | "tcp": use tcp address to initialize the groups across different 77 | dist_backend (string): backend to use for distributed training. Options 78 | includes gloo, mpi and nccl, the details can be found here: 79 | https://pytorch.org/docs/stable/distributed.html 80 | """ 81 | # Sets the GPU to use. 82 | torch.cuda.set_device(local_rank) 83 | # Initialize the process group. 84 | proc_rank = local_rank + shard_id * local_world_size 85 | world_size = local_world_size * num_shards 86 | dist.init_process_group( 87 | backend=dist_backend, 88 | init_method=init_method, 89 | world_size=world_size, 90 | rank=proc_rank, 91 | ) 92 | 93 | 94 | def is_master_proc(num_gpus=8): 95 | """ 96 | Determines if the current process is the master process. 97 | """ 98 | if torch.distributed.is_initialized(): 99 | return dist.get_rank() % num_gpus == 0 100 | else: 101 | return True 102 | 103 | 104 | def is_root_proc(): 105 | """ 106 | Determines if the current process is the root process. 107 | """ 108 | if torch.distributed.is_initialized(): 109 | return dist.get_rank() == 0 110 | else: 111 | return True 112 | 113 | 114 | def get_world_size(): 115 | """ 116 | Get the size of the world. 117 | """ 118 | if not dist.is_available(): 119 | return 1 120 | if not dist.is_initialized(): 121 | return 1 122 | return dist.get_world_size() 123 | 124 | 125 | def get_rank(): 126 | """ 127 | Get the rank of the current process. 
128 | """
129 | if not dist.is_available():
130 | return 0
131 | if not dist.is_initialized():
132 | return 0
133 | return dist.get_rank()
134 |
135 |
136 | def synchronize():
137 | """
138 | Helper function to synchronize (barrier) among all processes when
139 | using distributed training.
140 | """
141 | if not dist.is_available():
142 | return
143 | if not dist.is_initialized():
144 | return
145 | world_size = dist.get_world_size()
146 | if world_size == 1:
147 | return
148 | dist.barrier()
149 |
150 |
151 | @functools.lru_cache()
152 | def _get_global_gloo_group():
153 | """
154 | Return a process group based on the gloo backend, containing all the ranks.
155 | The result is cached.
156 | Returns:
157 | (group): pytorch dist group.
158 | """
159 | if dist.get_backend() == "nccl":
160 | return dist.new_group(backend="gloo")
161 | else:
162 | return dist.group.WORLD
163 |
164 |
165 | def _serialize_to_tensor(data, group):
166 | """
167 | Serialize the data to a ByteTensor. Note that only the `gloo` and `nccl`
168 | backends are supported.
169 | Args:
170 | data (data): data to be serialized.
171 | group (group): pytorch dist group.
172 | Returns:
173 | tensor (ByteTensor): the serialized tensor.
174 | """
175 |
176 | backend = dist.get_backend(group)
177 | assert backend in ["gloo", "nccl"]
178 | device = torch.device("cpu" if backend == "gloo" else "cuda")
179 |
180 | buffer = pickle.dumps(data)
181 | if len(buffer) > 1024 ** 3:
182 | logger = logging.getLogger(__name__)
183 | logger.warning(
184 | "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
185 | get_rank(), len(buffer) / (1024 ** 3), device
186 | )
187 | )
188 | storage = torch.ByteStorage.from_buffer(buffer)
189 | tensor = torch.ByteTensor(storage).to(device=device)
190 | return tensor
191 |
192 |
193 | def _pad_to_largest_tensor(tensor, group):
194 | """
195 | Pad the tensors from different GPUs to the size of the largest one.
196 | Args:
197 | tensor (tensor): tensor to pad.
198 | group (group): pytorch dist group.
199 | Returns:
200 | list[int]: size of the tensor, on each rank
201 | Tensor: padded tensor that has the max size
202 | """
203 | world_size = dist.get_world_size(group=group)
204 | assert (
205 | world_size >= 1
206 | ), "comm.gather/all_gather must be called from ranks within the given group!"
207 | local_size = torch.tensor(
208 | [tensor.numel()], dtype=torch.int64, device=tensor.device
209 | )
210 | size_list = [
211 | torch.zeros([1], dtype=torch.int64, device=tensor.device)
212 | for _ in range(world_size)
213 | ]
214 | dist.all_gather(size_list, local_size, group=group)
215 | size_list = [int(size.item()) for size in size_list]
216 |
217 | max_size = max(size_list)
218 |
219 | # we pad the tensor because torch all_gather does not support
220 | # gathering tensors of different shapes
221 | if local_size != max_size:
222 | padding = torch.zeros(
223 | (max_size - local_size,), dtype=torch.uint8, device=tensor.device
224 | )
225 | tensor = torch.cat((tensor, padding), dim=0)
226 | return size_list, tensor
227 |
228 |
229 | def all_gather_unaligned(data, group=None):
230 | """
231 | Run all_gather on arbitrary picklable data (not necessarily tensors).
232 |
233 | Args:
234 | data: any picklable object
235 | group: a torch process group. By default, will use a group which
236 | contains all ranks on gloo backend.
237 | 238 | Returns: 239 | list[data]: list of data gathered from each rank 240 | """ 241 | if get_world_size() == 1: 242 | return [data] 243 | if group is None: 244 | group = _get_global_gloo_group() 245 | if dist.get_world_size(group) == 1: 246 | return [data] 247 | 248 | tensor = _serialize_to_tensor(data, group) 249 | 250 | size_list, tensor = _pad_to_largest_tensor(tensor, group) 251 | max_size = max(size_list) 252 | 253 | # receiving Tensor from all ranks 254 | tensor_list = [ 255 | torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) 256 | for _ in size_list 257 | ] 258 | dist.all_gather(tensor_list, tensor, group=group) 259 | 260 | data_list = [] 261 | for size, tensor in zip(size_list, tensor_list): 262 | buffer = tensor.cpu().numpy().tobytes()[:size] 263 | data_list.append(pickle.loads(buffer)) 264 | 265 | return data_list 266 | 267 | 268 | def init_distributed_training(cfg): 269 | """ 270 | Initialize variables needed for distributed training. 271 | """ 272 | if cfg.NUM_GPUS <= 1: 273 | return 274 | num_gpus_per_machine = cfg.NUM_GPUS 275 | num_machines = dist.get_world_size() // num_gpus_per_machine 276 | for i in range(num_machines): 277 | ranks_on_i = list( 278 | range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine) 279 | ) 280 | pg = dist.new_group(ranks_on_i) 281 | if i == cfg.SHARD_ID: 282 | global _LOCAL_PROCESS_GROUP 283 | _LOCAL_PROCESS_GROUP = pg 284 | 285 | 286 | def get_local_size() -> int: 287 | """ 288 | Returns: 289 | The size of the per-machine process group, 290 | i.e. the number of processes per machine. 291 | """ 292 | if not dist.is_available(): 293 | return 1 294 | if not dist.is_initialized(): 295 | return 1 296 | return dist.get_world_size(group=_LOCAL_PROCESS_GROUP) 297 | 298 | 299 | def get_local_rank() -> int: 300 | """ 301 | Returns: 302 | The rank of the current process within the local (per-machine) process group. 303 | """ 304 | if not dist.is_available(): 305 | return 0 306 | if not dist.is_initialized(): 307 | return 0 308 | assert _LOCAL_PROCESS_GROUP is not None 309 | return dist.get_rank(group=_LOCAL_PROCESS_GROUP) 310 | -------------------------------------------------------------------------------- /slowfast/utils/env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Set up Environment.""" 5 | 6 | import slowfast.utils.logging as logging 7 | 8 | _ENV_SETUP_DONE = False 9 | 10 | 11 | def setup_environment(): 12 | global _ENV_SETUP_DONE 13 | if _ENV_SETUP_DONE: 14 | return 15 | _ENV_SETUP_DONE = True 16 | -------------------------------------------------------------------------------- /slowfast/utils/logging.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Logging.""" 5 | 6 | import atexit 7 | import builtins 8 | import decimal 9 | import functools 10 | import logging 11 | import os 12 | import sys 13 | import simplejson 14 | from iopath.common.file_io import g_pathmgr 15 | 16 | import slowfast.utils.distributed as du 17 | 18 | 19 | def _suppress_print(): 20 | """ 21 | Suppresses printing from the current process. 
22 | """ 23 | 24 | def print_pass(*objects, sep=" ", end="\n", file=sys.stdout, flush=False): 25 | pass 26 | 27 | builtins.print = print_pass 28 | 29 | 30 | @functools.lru_cache(maxsize=None) 31 | def _cached_log_stream(filename): 32 | io = g_pathmgr.open(filename, "a", buffering=1024) 33 | atexit.register(io.close) 34 | return io 35 | 36 | 37 | def setup_logging(output_dir=None): 38 | """ 39 | Sets up the logging for multiple processes. Only enable the logging for the 40 | master process, and suppress logging for the non-master processes. 41 | """ 42 | # Set up logging format. 43 | _FORMAT = "[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s" 44 | 45 | if du.is_master_proc(): 46 | # Enable logging for the master process. 47 | logging.root.handlers = [] 48 | else: 49 | # Suppress logging for non-master processes. 50 | _suppress_print() 51 | 52 | logger = logging.getLogger() 53 | logger.setLevel(logging.DEBUG) 54 | logger.propagate = False 55 | plain_formatter = logging.Formatter( 56 | "[%(asctime)s][%(levelname)s] %(filename)s: %(lineno)3d: %(message)s", 57 | datefmt="%m/%d %H:%M:%S", 58 | ) 59 | 60 | if du.is_master_proc(): 61 | ch = logging.StreamHandler(stream=sys.stdout) 62 | ch.setLevel(logging.DEBUG) 63 | ch.setFormatter(plain_formatter) 64 | logger.addHandler(ch) 65 | 66 | if output_dir is not None and du.is_master_proc(du.get_world_size()): 67 | filename = os.path.join(output_dir, "stdout.log") 68 | fh = logging.StreamHandler(_cached_log_stream(filename)) 69 | fh.setLevel(logging.DEBUG) 70 | fh.setFormatter(plain_formatter) 71 | logger.addHandler(fh) 72 | 73 | 74 | def get_logger(name): 75 | """ 76 | Retrieve the logger with the specified name or, if name is None, return a 77 | logger which is the root logger of the hierarchy. 78 | Args: 79 | name (string): name of the logger. 80 | """ 81 | return logging.getLogger(name) 82 | 83 | 84 | def log_json_stats(stats): 85 | """ 86 | Logs json stats. 87 | Args: 88 | stats (dict): a dictionary of statistical information to log. 89 | """ 90 | stats = { 91 | k: decimal.Decimal("{:.5f}".format(v)) if isinstance(v, float) else v 92 | for k, v in stats.items() 93 | } 94 | json_stats = simplejson.dumps(stats, sort_keys=True, use_decimal=True) 95 | logger = get_logger(__name__) 96 | logger.info("json_stats: {:s}".format(json_stats)) 97 | -------------------------------------------------------------------------------- /slowfast/utils/lr_policy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Learning rate policy.""" 5 | 6 | import math 7 | 8 | 9 | def get_lr_at_epoch(cfg, cur_epoch): 10 | """ 11 | Retrieve the learning rate of the current epoch with the option to perform 12 | warm up in the beginning of the training stage. 13 | Args: 14 | cfg (CfgNode): configs. Details can be found in 15 | slowfast/config/defaults.py 16 | cur_epoch (float): the number of epoch of the current training stage. 17 | """ 18 | lr = get_lr_func(cfg.SOLVER.LR_POLICY)(cfg, cur_epoch) 19 | # Perform warm up. 
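# During warm-up the lr is interpolated linearly from WARMUP_START_LR to the
# value the chosen policy yields at WARMUP_EPOCHS:
#   lr(e) = WARMUP_START_LR + e * (lr_end - WARMUP_START_LR) / WARMUP_EPOCHS
# for e < WARMUP_EPOCHS, which is exactly what the branch below computes.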
20 | if cur_epoch < cfg.SOLVER.WARMUP_EPOCHS:
21 | lr_start = cfg.SOLVER.WARMUP_START_LR
22 | lr_end = get_lr_func(cfg.SOLVER.LR_POLICY)(
23 | cfg, cfg.SOLVER.WARMUP_EPOCHS
24 | )
25 | alpha = (lr_end - lr_start) / cfg.SOLVER.WARMUP_EPOCHS
26 | lr = cur_epoch * alpha + lr_start
27 | return lr
28 |
29 |
30 | def lr_func_cosine(cfg, cur_epoch):
31 | """
32 | Retrieve the learning rate at the specified epoch with the
33 | cosine learning rate schedule. Details can be found in:
34 | Ilya Loshchilov, and Frank Hutter
35 | SGDR: Stochastic Gradient Descent With Warm Restarts.
36 | Args:
37 | cfg (CfgNode): configs. Details can be found in
38 | slowfast/config/defaults.py
39 | cur_epoch (float): the number of epoch of the current training stage.
40 | """
41 | assert cfg.SOLVER.COSINE_END_LR < cfg.SOLVER.BASE_LR
42 | return (
43 | cfg.SOLVER.COSINE_END_LR
44 | + (cfg.SOLVER.BASE_LR - cfg.SOLVER.COSINE_END_LR)
45 | * (math.cos(math.pi * cur_epoch / cfg.SOLVER.MAX_EPOCH) + 1.0)
46 | * 0.5
47 | )
48 |
49 |
50 | def lr_func_steps_with_relative_lrs(cfg, cur_epoch):
51 | """
52 | Retrieve the learning rate at the specified epoch with the
53 | step schedule with relative learning rates.
54 | Args:
55 | cfg (CfgNode): configs. Details can be found in
56 | slowfast/config/defaults.py
57 | cur_epoch (float): the number of epoch of the current training stage.
58 | """
59 | ind = get_step_index(cfg, cur_epoch)
60 | return cfg.SOLVER.LRS[ind] * cfg.SOLVER.BASE_LR
61 |
62 |
63 | def get_step_index(cfg, cur_epoch):
64 | """
65 | Retrieves the lr step index for the given epoch.
66 | Args:
67 | cfg (CfgNode): configs. Details can be found in
68 | slowfast/config/defaults.py
69 | cur_epoch (float): the number of epoch of the current training stage.
70 | """
71 | steps = cfg.SOLVER.STEPS + [cfg.SOLVER.MAX_EPOCH]
72 | for ind, step in enumerate(steps): # NoQA
73 | if cur_epoch < step:
74 | break
75 | return ind - 1
76 |
77 |
78 | def get_lr_func(lr_policy):
79 | """
80 | Given the configs, retrieve the specified lr policy function.
81 | Args:
82 | lr_policy (string): the learning rate policy to use for the job.
83 | """
84 | policy = "lr_func_" + lr_policy
85 | if policy not in globals():
86 | raise NotImplementedError("Unknown LR policy: {}".format(lr_policy))
87 | else:
88 | return globals()[policy]
89 | -------------------------------------------------------------------------------- /slowfast/utils/metrics.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 |
4 | """Functions for computing metrics."""
5 |
6 | import numpy as np
7 | import torch
8 |
9 |
10 | def topks_correct(preds, labels, ks):
11 | """
12 | Given the predictions, labels, and a list of top-k values, compute the
13 | number of correct predictions for each top-k value.
14 |
15 | Args:
16 | preds (array): array of predictions. Dimension is batchsize
17 | N x ClassNum.
18 | labels (array): array of labels. Dimension is batchsize N.
19 | ks (list): list of top-k values. For example, ks = [1, 5] corresponds
20 | to top-1 and top-5.
21 |
22 | Returns:
23 | topks_correct (list): list of numbers, where the `i`-th entry
24 | corresponds to the number of top-`ks[i]` correct predictions.
25 | """ 26 | assert preds.size(0) == labels.size( 27 | 0 28 | ), "Batch dim of predictions and labels must match" 29 | # Find the top max_k predictions for each sample 30 | _top_max_k_vals, top_max_k_inds = torch.topk( 31 | preds, max(ks), dim=1, largest=True, sorted=True 32 | ) 33 | # (batch_size, max_k) -> (max_k, batch_size). 34 | top_max_k_inds = top_max_k_inds.t() 35 | # (batch_size, ) -> (max_k, batch_size). 36 | rep_max_k_labels = labels.view(1, -1).expand_as(top_max_k_inds) 37 | # (i, j) = 1 if top i-th prediction for the j-th sample is correct. 38 | top_max_k_correct = top_max_k_inds.eq(rep_max_k_labels) 39 | # Compute the number of topk correct predictions for each k. 40 | topks_correct = [top_max_k_correct[:k, :].float().sum() for k in ks] 41 | return topks_correct 42 | 43 | 44 | def topk_errors(preds, labels, ks): 45 | """ 46 | Computes the top-k error for each k. 47 | Args: 48 | preds (array): array of predictions. Dimension is N. 49 | labels (array): array of labels. Dimension is N. 50 | ks (list): list of ks to calculate the top accuracies. 51 | """ 52 | num_topks_correct = topks_correct(preds, labels, ks) 53 | return [(1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct] 54 | 55 | 56 | def topk_accuracies(preds, labels, ks): 57 | """ 58 | Computes the top-k accuracy for each k. 59 | Args: 60 | preds (array): array of predictions. Dimension is N. 61 | labels (array): array of labels. Dimension is N. 62 | ks (list): list of ks to calculate the top accuracies. 63 | """ 64 | num_topks_correct = topks_correct(preds, labels, ks) 65 | return [(x / preds.size(0)) * 100.0 for x in num_topks_correct] 66 | 67 | 68 | def multitask_topks_correct(preds, labels, ks=(1,)): 69 | """ 70 | Args: 71 | preds: tuple(torch.FloatTensor), each tensor should be of shape 72 | [batch_size, class_count], class_count can vary on a per task basis, i.e. 73 | outputs[i].shape[1] can be different to outputs[j].shape[j]. 74 | labels: tuple(torch.LongTensor), each tensor should be of shape [batch_size] 75 | ks: tuple(int), compute accuracy at top-k for the values of k specified 76 | in this parameter. 77 | Returns: 78 | tuple(float), same length at topk with the corresponding accuracy@k in. 79 | """ 80 | max_k = int(np.max(ks)) 81 | task_count = len(preds) 82 | batch_size = labels[0].size(0) 83 | all_correct = torch.zeros(max_k, batch_size).type(torch.ByteTensor) 84 | all_correct = all_correct.to(preds[0].device) 85 | for output, label in zip(preds, labels): 86 | _, max_k_idx = output.topk(max_k, dim=1, largest=True, sorted=True) 87 | # Flip batch_size, class_count as .view doesn't work on non-contiguous 88 | max_k_idx = max_k_idx.t() 89 | correct_for_task = max_k_idx.eq(label.view(1, -1).expand_as(max_k_idx)) 90 | all_correct.add_(correct_for_task) 91 | 92 | multitask_topks_correct = [ 93 | torch.ge(all_correct[:k].float().sum(0), task_count).float().sum(0) for k in ks 94 | ] 95 | 96 | return multitask_topks_correct 97 | 98 | 99 | def multitask_topk_accuracies(preds, labels, ks): 100 | """ 101 | Computes the top-k accuracy for each k. 102 | Args: 103 | preds (array): array of predictions. Dimension is N. 104 | labels (array): array of labels. Dimension is N. 105 | ks (list): list of ks to calculate the top accuracies. 
106 | """ 107 | num_multitask_topks_correct = multitask_topks_correct(preds, labels, ks) 108 | return [(x / preds[0].size(0)) * 100.0 for x in num_multitask_topks_correct] -------------------------------------------------------------------------------- /slowfast/utils/multigrid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Helper functions for multigrid training.""" 5 | 6 | import numpy as np 7 | 8 | import slowfast.utils.logging as logging 9 | 10 | logger = logging.get_logger(__name__) 11 | 12 | 13 | class MultigridSchedule(object): 14 | """ 15 | This class defines multigrid training schedule and update cfg accordingly. 16 | """ 17 | 18 | def init_multigrid(self, cfg): 19 | """ 20 | Update cfg based on multigrid settings. 21 | Args: 22 | cfg (configs): configs that contains training and multigrid specific 23 | hyperparameters. Details can be seen in 24 | slowfast/config/defaults.py. 25 | Returns: 26 | cfg (configs): the updated cfg. 27 | """ 28 | self.schedule = None 29 | # We may modify cfg.TRAIN.BATCH_SIZE, cfg.DATA.NUM_FRAMES, and 30 | # cfg.DATA.TRAIN_CROP_SIZE during training, so we store their original 31 | # value in cfg and use them as global variables. 32 | cfg.MULTIGRID.DEFAULT_B = cfg.TRAIN.BATCH_SIZE 33 | cfg.MULTIGRID.DEFAULT_T = cfg.DATA.NUM_FRAMES 34 | cfg.MULTIGRID.DEFAULT_S = cfg.DATA.TRAIN_CROP_SIZE 35 | 36 | if cfg.MULTIGRID.LONG_CYCLE: 37 | self.schedule = self.get_long_cycle_schedule(cfg) 38 | cfg.SOLVER.STEPS = [0] + [s[-1] for s in self.schedule] 39 | # Fine-tuning phase. 40 | cfg.SOLVER.STEPS[-1] = ( 41 | cfg.SOLVER.STEPS[-2] + cfg.SOLVER.STEPS[-1] 42 | ) // 2 43 | cfg.SOLVER.LRS = [ 44 | cfg.SOLVER.GAMMA ** s[0] * s[1][0] for s in self.schedule 45 | ] 46 | # Fine-tuning phase. 47 | cfg.SOLVER.LRS = cfg.SOLVER.LRS[:-1] + [ 48 | cfg.SOLVER.LRS[-2], 49 | cfg.SOLVER.LRS[-1], 50 | ] 51 | 52 | cfg.SOLVER.MAX_EPOCH = self.schedule[-1][-1] 53 | 54 | elif cfg.MULTIGRID.SHORT_CYCLE: 55 | cfg.SOLVER.STEPS = [ 56 | int(s * cfg.MULTIGRID.EPOCH_FACTOR) for s in cfg.SOLVER.STEPS 57 | ] 58 | cfg.SOLVER.MAX_EPOCH = int( 59 | cfg.SOLVER.MAX_EPOCH * cfg.MULTIGRID.EPOCH_FACTOR 60 | ) 61 | return cfg 62 | 63 | def update_long_cycle(self, cfg, cur_epoch): 64 | """ 65 | Before every epoch, check if long cycle shape should change. If it 66 | should, update cfg accordingly. 67 | Args: 68 | cfg (configs): configs that contains training and multigrid specific 69 | hyperparameters. Details can be seen in 70 | slowfast/config/defaults.py. 71 | cur_epoch (int): current epoch index. 72 | Returns: 73 | cfg (configs): the updated cfg. 74 | changed (bool): do we change long cycle shape at this epoch? 
75 | """ 76 | base_b, base_t, base_s = get_current_long_cycle_shape( 77 | self.schedule, cur_epoch 78 | ) 79 | if base_s != cfg.DATA.TRAIN_CROP_SIZE or base_t != cfg.DATA.NUM_FRAMES: 80 | 81 | cfg.DATA.NUM_FRAMES = base_t 82 | cfg.DATA.TRAIN_CROP_SIZE = base_s 83 | cfg.TRAIN.BATCH_SIZE = base_b * cfg.MULTIGRID.DEFAULT_B 84 | 85 | bs_factor = ( 86 | float(cfg.TRAIN.BATCH_SIZE / cfg.NUM_GPUS) 87 | / cfg.MULTIGRID.BN_BASE_SIZE 88 | ) 89 | 90 | if bs_factor < 1: 91 | cfg.BN.NORM_TYPE = "sync_batchnorm" 92 | cfg.BN.NUM_SYNC_DEVICES = int(1.0 / bs_factor) 93 | elif bs_factor > 1: 94 | cfg.BN.NORM_TYPE = "sub_batchnorm" 95 | cfg.BN.NUM_SPLITS = int(bs_factor) 96 | else: 97 | cfg.BN.NORM_TYPE = "batchnorm" 98 | 99 | cfg.MULTIGRID.LONG_CYCLE_SAMPLING_RATE = cfg.DATA.SAMPLING_RATE * ( 100 | cfg.MULTIGRID.DEFAULT_T // cfg.DATA.NUM_FRAMES 101 | ) 102 | logger.info("Long cycle updates:") 103 | logger.info("\tBN.NORM_TYPE: {}".format(cfg.BN.NORM_TYPE)) 104 | if cfg.BN.NORM_TYPE == "sync_batchnorm": 105 | logger.info( 106 | "\tBN.NUM_SYNC_DEVICES: {}".format(cfg.BN.NUM_SYNC_DEVICES) 107 | ) 108 | elif cfg.BN.NORM_TYPE == "sub_batchnorm": 109 | logger.info("\tBN.NUM_SPLITS: {}".format(cfg.BN.NUM_SPLITS)) 110 | logger.info("\tTRAIN.BATCH_SIZE: {}".format(cfg.TRAIN.BATCH_SIZE)) 111 | logger.info( 112 | "\tDATA.NUM_FRAMES x LONG_CYCLE_SAMPLING_RATE: {}x{}".format( 113 | cfg.DATA.NUM_FRAMES, cfg.MULTIGRID.LONG_CYCLE_SAMPLING_RATE 114 | ) 115 | ) 116 | logger.info( 117 | "\tDATA.TRAIN_CROP_SIZE: {}".format(cfg.DATA.TRAIN_CROP_SIZE) 118 | ) 119 | return cfg, True 120 | else: 121 | return cfg, False 122 | 123 | def get_long_cycle_schedule(self, cfg): 124 | """ 125 | Based on multigrid hyperparameters, define the schedule of a long cycle. 126 | Args: 127 | cfg (configs): configs that contains training and multigrid specific 128 | hyperparameters. Details can be seen in 129 | slowfast/config/defaults.py. 130 | Returns: 131 | schedule (list): Specifies a list long cycle base shapes and their 132 | corresponding training epochs. 133 | """ 134 | 135 | steps = cfg.SOLVER.STEPS 136 | 137 | default_size = float( 138 | cfg.DATA.NUM_FRAMES * cfg.DATA.TRAIN_CROP_SIZE ** 2 139 | ) 140 | default_iters = steps[-1] 141 | 142 | # Get shapes and average batch size for each long cycle shape. 143 | avg_bs = [] 144 | all_shapes = [] 145 | for t_factor, s_factor in cfg.MULTIGRID.LONG_CYCLE_FACTORS: 146 | base_t = int(round(cfg.DATA.NUM_FRAMES * t_factor)) 147 | base_s = int(round(cfg.DATA.TRAIN_CROP_SIZE * s_factor)) 148 | if cfg.MULTIGRID.SHORT_CYCLE: 149 | shapes = [ 150 | [ 151 | base_t, 152 | cfg.MULTIGRID.DEFAULT_S 153 | * cfg.MULTIGRID.SHORT_CYCLE_FACTORS[0], 154 | ], 155 | [ 156 | base_t, 157 | cfg.MULTIGRID.DEFAULT_S 158 | * cfg.MULTIGRID.SHORT_CYCLE_FACTORS[1], 159 | ], 160 | [base_t, base_s], 161 | ] 162 | else: 163 | shapes = [[base_t, base_s]] 164 | 165 | # (T, S) -> (B, T, S) 166 | shapes = [ 167 | [int(round(default_size / (s[0] * s[1] * s[1]))), s[0], s[1]] 168 | for s in shapes 169 | ] 170 | avg_bs.append(np.mean([s[0] for s in shapes])) 171 | all_shapes.append(shapes) 172 | 173 | # Get schedule regardless of cfg.MULTIGRID.EPOCH_FACTOR. 
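# Worked example with assumed defaults: DATA.NUM_FRAMES = 16 and
# TRAIN_CROP_SIZE = 224 give default_size = 16 * 224^2; a long-cycle factor of
# (0.5, 0.707) yields a base shape of roughly (8, 158), whose relative batch
# size is round(16 * 224^2 / (8 * 158^2)) = 4. The loop below then distributes
# the epochs of each step across shapes in proportion to these relative batch sizes.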
174 | total_iters = 0 175 | schedule = [] 176 | for step_index in range(len(steps) - 1): 177 | step_epochs = steps[step_index + 1] - steps[step_index] 178 | 179 | for long_cycle_index, shapes in enumerate(all_shapes): 180 | cur_epochs = ( 181 | step_epochs * avg_bs[long_cycle_index] / sum(avg_bs) 182 | ) 183 | 184 | cur_iters = cur_epochs / avg_bs[long_cycle_index] 185 | total_iters += cur_iters 186 | schedule.append((step_index, shapes[-1], cur_epochs)) 187 | 188 | iter_saving = default_iters / total_iters 189 | 190 | final_step_epochs = cfg.SOLVER.MAX_EPOCH - steps[-1] 191 | 192 | # We define the fine-tuning phase to have the same amount of iteration 193 | # saving as the rest of the training. 194 | ft_epochs = final_step_epochs / iter_saving * avg_bs[-1] 195 | 196 | schedule.append((step_index + 1, all_shapes[-1][2], ft_epochs)) 197 | 198 | # Obtain final schedule given desired cfg.MULTIGRID.EPOCH_FACTOR. 199 | x = ( 200 | cfg.SOLVER.MAX_EPOCH 201 | * cfg.MULTIGRID.EPOCH_FACTOR 202 | / sum(s[-1] for s in schedule) 203 | ) 204 | 205 | final_schedule = [] 206 | total_epochs = 0 207 | for s in schedule: 208 | epochs = s[2] * x 209 | total_epochs += epochs 210 | final_schedule.append((s[0], s[1], int(round(total_epochs)))) 211 | print_schedule(final_schedule) 212 | return final_schedule 213 | 214 | 215 | def print_schedule(schedule): 216 | """ 217 | Log schedule. 218 | """ 219 | logger.info("Long cycle index\tBase shape\tEpochs") 220 | for s in schedule: 221 | logger.info("{}\t{}\t{}".format(s[0], s[1], s[2])) 222 | 223 | 224 | def get_current_long_cycle_shape(schedule, epoch): 225 | """ 226 | Given a schedule and epoch index, return the long cycle base shape. 227 | Args: 228 | schedule (list): the long cycle schedule as returned by 229 | get_long_cycle_schedule(), i.e. a list of 230 | (long cycle index, base shape, end epoch) tuples. 231 | epoch (int): current epoch index. 232 | Returns: 233 | shapes (list): A list describing the base shape in a long cycle: 234 | [batch size relative to default, 235 | number of frames, spatial dimension]. 236 | """ 237 | for s in schedule: 238 | if epoch < s[-1]: 239 | return s[1] 240 | return schedule[-1][1] 241 | -------------------------------------------------------------------------------- /slowfast/utils/multiprocessing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Multiprocessing helpers.""" 5 | 6 | import torch 7 | 8 | 9 | def run( 10 | local_rank, 11 | num_proc, 12 | func, 13 | init_method, 14 | shard_id, 15 | num_shards, 16 | backend, 17 | cfg, 18 | output_queue=None, 19 | ): 20 | """ 21 | Runs a function from a child process. 22 | Args: 23 | local_rank (int): rank of the current process on the current machine. 24 | num_proc (int): number of processes per machine. 25 | func (function): function to execute on each of the processes. 26 | init_method (string): method to initialize the distributed training. 27 | TCP initialization: requiring a network address reachable from all 28 | processes followed by the port. 29 | Shared file-system initialization: makes use of a file system that 30 | is shared and visible from all machines. The URL should start with 31 | file:// and contain a path to a non-existent file on a shared file 32 | system. 33 | shard_id (int): the rank of the current machine. 34 | num_shards (int): number of overall machines for the distributed 35 | training job.
36 | backend (string): three distributed backends ('nccl', 'gloo', 'mpi') are 37 | supported, each with different capabilities. Details can be found 38 | here: 39 | https://pytorch.org/docs/stable/distributed.html 40 | cfg (CfgNode): configs. Details can be found in 41 | slowfast/config/defaults.py 42 | output_queue (queue): can optionally be used to return values from the 43 | master process. 44 | """ 45 | # Initialize the process group. 46 | world_size = num_proc * num_shards 47 | rank = shard_id * num_proc + local_rank 48 | 49 | try: 50 | torch.distributed.init_process_group( 51 | backend=backend, 52 | init_method=init_method, 53 | world_size=world_size, 54 | rank=rank, 55 | ) 56 | except Exception as e: 57 | raise e 58 | 59 | torch.cuda.set_device(local_rank) 60 | ret = func(cfg) 61 | if output_queue is not None and local_rank == 0: 62 | output_queue.put(ret) 63 | -------------------------------------------------------------------------------- /slowfast/utils/parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Argument parser functions.""" 5 | 6 | import argparse 7 | import sys 8 | 9 | import slowfast.utils.checkpoint as cu 10 | from slowfast.config.defaults import get_cfg 11 | 12 | 13 | def parse_args(): 14 | """ 15 | Parse the following arguments for a default parser for PySlowFast users. 16 | Args: 17 | shard_id (int): shard id for the current machine. Starts from 0 to 18 | num_shards - 1. If a single machine is used, set shard_id to 0. 19 | num_shards (int): number of shards used by the job. 20 | init_method (str): initialization method to launch the job with multiple 21 | devices. Options include TCP or shared file-system for 22 | initialization. Details can be found in 23 | https://pytorch.org/docs/stable/distributed.html#tcp-initialization 24 | cfg (str): path to the config file. 25 | opts (argument): provide additional options from the command line; they 26 | overwrite the config loaded from file. 27 | """ 28 | parser = argparse.ArgumentParser( 29 | description="Provide SlowFast video training and testing pipeline." 30 | ) 31 | parser.add_argument( 32 | "--shard_id", 33 | help="The shard id of the current node; starts from 0 to num_shards - 1", 34 | default=0, 35 | type=int, 36 | ) 37 | parser.add_argument( 38 | "--num_shards", 39 | help="Number of shards used by the job", 40 | default=1, 41 | type=int, 42 | ) 43 | parser.add_argument( 44 | "--init_method", 45 | help="Initialization method, includes TCP or shared file-system", 46 | default="tcp://localhost:9999", 47 | type=str, 48 | ) 49 | parser.add_argument( 50 | "--cfg", 51 | dest="cfg_file", 52 | help="Path to the config file", 53 | default="configs/Kinetics/SLOWFAST_4x16_R50.yaml", 54 | type=str, 55 | ) 56 | parser.add_argument( 57 | "opts", 58 | help="See slowfast/config/defaults.py for all options", 59 | default=None, 60 | nargs=argparse.REMAINDER, 61 | ) 62 | if len(sys.argv) == 1: 63 | parser.print_help() 64 | return parser.parse_args() 65 | 66 | 67 | def load_config(args): 68 | """ 69 | Given the arguments, load and initialize the configs. 70 | Args: 71 | args (argument): arguments include `shard_id`, `num_shards`, 72 | `init_method`, `cfg_file`, and `opts`. 73 | """ 74 | # Setup cfg. 75 | cfg = get_cfg() 76 | # Load config from cfg.
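# Usage sketch (annotation, not part of the original source; config path is an example):
#     args = parse_args()      # e.g. --cfg configs/K400/joint_224_16x4.yaml NUM_GPUS 8
#     cfg = load_config(args)  # the YAML is merged first, then args.opts override it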
77 | if args.cfg_file is not None: 78 | cfg.merge_from_file(args.cfg_file) 79 | # Load config from command line, overwrite config from opts. 80 | if args.opts is not None: 81 | cfg.merge_from_list(args.opts) 82 | 83 | # Inherit parameters from args. 84 | if hasattr(args, "num_shards") and hasattr(args, "shard_id"): 85 | cfg.NUM_SHARDS = args.num_shards 86 | cfg.SHARD_ID = args.shard_id 87 | if hasattr(args, "rng_seed"): 88 | cfg.RNG_SEED = args.rng_seed 89 | if hasattr(args, "output_dir"): 90 | cfg.OUTPUT_DIR = args.output_dir 91 | 92 | # Create the checkpoint dir. 93 | cu.make_checkpoint_dir(cfg.OUTPUT_DIR) 94 | return cfg 95 | -------------------------------------------------------------------------------- /slowfast/utils/weight_init_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Utility function for weight initialization""" 5 | 6 | import torch.nn as nn 7 | from fvcore.nn.weight_init import c2_msra_fill 8 | 9 | 10 | def init_weights(model, fc_init_std=0.01, zero_init_final_bn=True): 11 | """ 12 | Performs ResNet style weight initialization. 13 | Args: 14 | fc_init_std (float): the expected standard deviation for fc layer. 15 | zero_init_final_bn (bool): if True, zero initialize the final bn for 16 | every bottleneck. 17 | """ 18 | for m in model.modules(): 19 | if isinstance(m, nn.Conv3d): 20 | """ 21 | Follow the initialization method proposed in: 22 | {He, Kaiming, et al. 23 | "Delving deep into rectifiers: Surpassing human-level 24 | performance on imagenet classification." 25 | arXiv preprint arXiv:1502.01852 (2015)} 26 | """ 27 | c2_msra_fill(m) 28 | elif isinstance(m, nn.BatchNorm3d): 29 | if ( 30 | hasattr(m, "transform_final_bn") 31 | and m.transform_final_bn 32 | and zero_init_final_bn 33 | ): 34 | batchnorm_weight = 0.0 35 | else: 36 | batchnorm_weight = 1.0 37 | if m.weight is not None: 38 | m.weight.data.fill_(batchnorm_weight) 39 | if m.bias is not None: 40 | m.bias.data.zero_() 41 | if isinstance(m, nn.Linear): 42 | m.weight.data.normal_(mean=0.0, std=fc_init_std) 43 | if m.bias is not None: 44 | m.bias.data.zero_() 45 | -------------------------------------------------------------------------------- /slowfast/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /slowfast/visualization/gradcam_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | import matplotlib.pyplot as plt 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | import slowfast.datasets.utils as data_utils 9 | from slowfast.visualization.utils import get_layer 10 | 11 | 12 | class GradCAM: 13 | """ 14 | GradCAM class helps create localization maps using the Grad-CAM method for input videos 15 | and overlap the maps over the input videos as heatmaps. 16 | https://arxiv.org/pdf/1610.02391.pdf 17 | """ 18 | 19 | def __init__( 20 | self, model, target_layers, data_mean, data_std, colormap="viridis" 21 | ): 22 | """ 23 | Args: 24 | model (model): the model to be used. 
25 | target_layers (list of str(s)): name of convolutional layer to be used to get 26 | gradients and feature maps from for creating localization maps. 27 | data_mean (tensor or list): mean value to add to input videos. 28 | data_std (tensor or list): std to multiply for input videos. 29 | colormap (Optional[str]): matplotlib colormap used to create heatmap. 30 | See https://matplotlib.org/3.3.0/tutorials/colors/colormaps.html 31 | """ 32 | 33 | self.model = model 34 | # Run in eval mode. 35 | self.model.eval() 36 | self.target_layers = target_layers 37 | 38 | self.gradients = {} 39 | self.activations = {} 40 | self.colormap = plt.get_cmap(colormap) 41 | self.data_mean = data_mean 42 | self.data_std = data_std 43 | self._register_hooks() 44 | 45 | def _register_single_hook(self, layer_name): 46 | """ 47 | Register forward and backward hook to a layer, given layer_name, 48 | to obtain gradients and activations. 49 | Args: 50 | layer_name (str): name of the layer. 51 | """ 52 | 53 | def get_gradients(module, grad_input, grad_output): 54 | self.gradients[layer_name] = grad_output[0].detach() 55 | 56 | def get_activations(module, input, output): 57 | self.activations[layer_name] = output.clone().detach() 58 | 59 | target_layer = get_layer(self.model, layer_name=layer_name) 60 | target_layer.register_forward_hook(get_activations) 61 | target_layer.register_backward_hook(get_gradients) 62 | 63 | def _register_hooks(self): 64 | """ 65 | Register hooks to layers in `self.target_layers`. 66 | """ 67 | for layer_name in self.target_layers: 68 | self._register_single_hook(layer_name=layer_name) 69 | 70 | def _calculate_localization_map(self, inputs, labels=None): 71 | """ 72 | Calculate localization map for all inputs with Grad-CAM. 73 | Args: 74 | inputs (list of tensor(s)): the input clips. 75 | labels (Optional[tensor]): labels of the current input clips. 76 | Returns: 77 | localization_maps (list of ndarray(s)): the localization map for 78 | each corresponding input. 79 | preds (tensor): shape (n_instances, n_class). Model predictions for `inputs`. 80 | """ 81 | assert len(inputs) == len( 82 | self.target_layers 83 | ), "Must register the same number of target layers as the number of input pathways." 
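# Grad-CAM recap (annotation, not part of the original source): for each
# registered layer the weights are the spatial mean of the class-score
# gradients per channel and frame, w[c, t] = mean_{h,w}(d score / d A[c, t]),
# and the raw map is ReLU(sum_c w[c, t] * A[c, t]); the code below then
# upsamples it to the input clip size and min-max normalizes it per instance.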
84 | input_clone = [inp.clone() for inp in inputs] 85 | preds = self.model(input_clone) 86 | 87 | if labels is None: 88 | score = torch.max(preds, dim=-1)[0] 89 | else: 90 | if labels.ndim == 1: 91 | labels = labels.unsqueeze(-1) 92 | score = torch.gather(preds, dim=1, index=labels) 93 | 94 | self.model.zero_grad() 95 | score = torch.sum(score) 96 | score.backward() 97 | localization_maps = [] 98 | for i, inp in enumerate(inputs): 99 | _, _, T, H, W = inp.size() 100 | 101 | gradients = self.gradients[self.target_layers[i]] 102 | activations = self.activations[self.target_layers[i]] 103 | B, C, Tg, _, _ = gradients.size() 104 | 105 | weights = torch.mean(gradients.view(B, C, Tg, -1), dim=3) 106 | 107 | weights = weights.view(B, C, Tg, 1, 1) 108 | localization_map = torch.sum( 109 | weights * activations, dim=1, keepdim=True 110 | ) 111 | localization_map = F.relu(localization_map) 112 | localization_map = F.interpolate( 113 | localization_map, 114 | size=(T, H, W), 115 | mode="trilinear", 116 | align_corners=False, 117 | ) 118 | localization_map_min, localization_map_max = ( 119 | torch.min(localization_map.view(B, -1), dim=-1, keepdim=True)[ 120 | 0 121 | ], 122 | torch.max(localization_map.view(B, -1), dim=-1, keepdim=True)[ 123 | 0 124 | ], 125 | ) 126 | localization_map_min = torch.reshape( 127 | localization_map_min, shape=(B, 1, 1, 1, 1) 128 | ) 129 | localization_map_max = torch.reshape( 130 | localization_map_max, shape=(B, 1, 1, 1, 1) 131 | ) 132 | # Normalize the localization map. 133 | localization_map = (localization_map - localization_map_min) / ( 134 | localization_map_max - localization_map_min + 1e-6 135 | ) 136 | localization_map = localization_map.data 137 | 138 | localization_maps.append(localization_map) 139 | 140 | return localization_maps, preds 141 | 142 | def __call__(self, inputs, labels=None, alpha=0.5): 143 | """ 144 | Visualize the localization maps on their corresponding inputs as heatmap, 145 | using Grad-CAM. 146 | Args: 147 | inputs (list of tensor(s)): the input clips. 148 | labels (Optional[tensor]): labels of the current input clips. 149 | alpha (float): transparency level of the heatmap, in the range [0, 1]. 150 | Returns: 151 | result_ls (list of tensor(s)): the visualized inputs. 152 | preds (tensor): shape (n_instances, n_class). Model predictions for `inputs`. 
153 | """ 154 | result_ls = [] 155 | localization_maps, preds = self._calculate_localization_map( 156 | inputs, labels=labels 157 | ) 158 | for i, localization_map in enumerate(localization_maps): 159 | # Convert (B, 1, T, H, W) to (B, T, H, W) 160 | localization_map = localization_map.squeeze(dim=1) 161 | if localization_map.device != torch.device("cpu"): 162 | localization_map = localization_map.cpu() 163 | heatmap = self.colormap(localization_map) 164 | heatmap = heatmap[:, :, :, :, :3] 165 | # Permute input from (B, C, T, H, W) to (B, T, H, W, C) 166 | curr_inp = inputs[i].permute(0, 2, 3, 4, 1) 167 | if curr_inp.device != torch.device("cpu"): 168 | curr_inp = curr_inp.cpu() 169 | curr_inp = data_utils.revert_tensor_normalize( 170 | curr_inp, self.data_mean, self.data_std 171 | ) 172 | heatmap = torch.from_numpy(heatmap) 173 | curr_inp = alpha * heatmap + (1 - alpha) * curr_inp 174 | # Permute inp to (B, T, C, H, W) 175 | curr_inp = curr_inp.permute(0, 1, 4, 2, 3) 176 | result_ls.append(curr_inp) 177 | 178 | return result_ls, preds 179 | -------------------------------------------------------------------------------- /slowfast/visualization/prediction_vis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | import numpy as np 5 | import torch 6 | 7 | import slowfast.datasets.utils as data_utils 8 | import slowfast.utils.logging as logging 9 | import slowfast.visualization.tensorboard_vis as tb 10 | from slowfast.utils.misc import get_class_names 11 | from slowfast.visualization.video_visualizer import VideoVisualizer 12 | 13 | logger = logging.get_logger(__name__) 14 | 15 | 16 | class WrongPredictionVis: 17 | """ 18 | WrongPredictionVis class for visualizing video inputs to Tensorboard 19 | for instances on which the model makes wrong predictions. 20 | """ 21 | 22 | def __init__(self, cfg): 23 | """ 24 | Args: 25 | cfg (CfgNode): configs. Details can be found in 26 | slowfast/config/defaults.py 27 | """ 28 | self.cfg = cfg 29 | self.class_names, _, self.subset = get_class_names( 30 | cfg.TENSORBOARD.CLASS_NAMES_PATH, 31 | subset_path=cfg.TENSORBOARD.WRONG_PRED_VIS.SUBSET_PATH, 32 | ) 33 | if self.subset is not None: 34 | self.subset = set(self.subset) 35 | self.num_class = cfg.MODEL.NUM_CLASSES 36 | self.video_vis = VideoVisualizer( 37 | cfg.MODEL.NUM_CLASSES, 38 | cfg.TENSORBOARD.CLASS_NAMES_PATH, 39 | 1, 40 | cfg.TENSORBOARD.MODEL_VIS.COLORMAP, 41 | ) 42 | self.tag = cfg.TENSORBOARD.WRONG_PRED_VIS.TAG 43 | self.writer = tb.TensorboardWriter(cfg) 44 | self.model_incorrect_classes = set() 45 | 46 | def _pick_wrong_preds(self, labels, preds): 47 | """ 48 | Returns a boolean mask over the instances that have 49 | wrong predictions and whose true labels are in the specified subset. 50 | Args: 51 | labels (tensor): tensor of shape (n_instances,) containing class ids. 52 | preds (tensor): class scores from model, shape (n_instances, n_classes) 53 | Returns: 54 | mask (tensor): boolean tensor. `mask[i]` is True if `model` makes a wrong prediction.
55 | """ 56 | subset_mask = torch.ones(size=(len(labels),), dtype=torch.bool) 57 | if self.subset is not None: 58 | for i, label in enumerate(labels): 59 | if label not in self.subset: 60 | subset_mask[i] = False 61 | 62 | preds_ids = torch.argmax(preds, dim=-1) 63 | 64 | mask = preds_ids != labels 65 | mask &= subset_mask 66 | for i, wrong_pred in enumerate(mask): 67 | if wrong_pred: 68 | self.model_incorrect_classes.add(labels[i]) 69 | 70 | return mask 71 | 72 | def visualize_vid(self, video_input, labels, preds, batch_idx): 73 | """ 74 | Draw predicted labels on video inputs and visualize all incorrectly classified 75 | videos in the current batch. 76 | Args: 77 | video_input (list of list of tensor(s)): list of videos for all pathways. 78 | labels (array-like): shape (n_instances,) of true label for each instance. 79 | preds (tensor): shape (n_instances, n_classes). The predicted scores for all instances. 80 | tag (Optional[str]): all visualized videos will be added under this tag. This is for organization 81 | purposes in Tensorboard. 82 | batch_idx (int): batch index of the current videos. 83 | """ 84 | 85 | def add_video(vid, preds, tag, true_class_name): 86 | """ 87 | Draw predicted label on video and add it to Tensorboard. 88 | Args: 89 | vid (array-like): shape (C, T, H, W). Each image in `vid` is an RGB image. 90 | preds (tensor): shape (n_classes,) or (1, n_classes). The predicted scores 91 | for the current `vid`. 92 | tag (str): tag for `vid` in Tensorboard. 93 | true_class_name (str): the ground-truth class name of the current `vid` instance. 94 | """ 95 | # Permute to (T, H, W, C). 96 | vid = vid.permute(1, 2, 3, 0) 97 | vid = data_utils.revert_tensor_normalize( 98 | vid.cpu(), self.cfg.DATA.MEAN, self.cfg.DATA.STD 99 | ) 100 | vid = self.video_vis.draw_clip(vid, preds) 101 | vid = torch.from_numpy(np.array(vid)).permute(0, 3, 1, 2) 102 | vid = torch.unsqueeze(vid, dim=0) 103 | self.writer.add_video( 104 | vid, tag="{}: {}".format(tag, true_class_name) 105 | ) 106 | 107 | mask = self._pick_wrong_preds(labels, preds) 108 | video_indices = torch.squeeze(mask.nonzero(), dim=-1) 109 | # Visualize each wrongly classified video. 110 | for vid_idx in video_indices: 111 | cur_vid_idx = batch_idx * len(video_input[0]) + vid_idx 112 | for pathway in range(len(video_input)): 113 | add_video( 114 | video_input[pathway][vid_idx], 115 | preds=preds[vid_idx], 116 | tag=self.tag 117 | + "/Video {}, Pathway {}".format(cur_vid_idx, pathway), 118 | true_class_name=self.class_names[labels[vid_idx]], 119 | ) 120 | 121 | @property 122 | def wrong_class_prediction(self): 123 | """ 124 | Return class names that the model predicted incorrectly. 125 | """ 126 | incorrect_class_names = [ 127 | self.class_names[i] for i in self.model_incorrect_classes 128 | ] 129 | return list(set(incorrect_class_names)) 130 | 131 | def clean(self): 132 | """ 133 | Close Tensorboard writer. 134 | """ 135 | self.writer.close() 136 | -------------------------------------------------------------------------------- /slowfast/visualization/predictor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
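# Usage sketch (annotation, not part of the original source; `async_vis` and
# `task` are hypothetical variables):
#     predictor = ActionPredictor(cfg, async_vis=async_vis)  # async_vis: an AsyncVis instance
#     predictor.put(task)     # run the model on a TaskInfo object (frames, boxes)
#     task = predictor.get()  # fetch the visualized clip once it is ready
# Detectron2Predictor supplies the person boxes when cfg.DETECTION.ENABLE is set.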
3 | 4 | import queue 5 | import cv2 6 | import torch 7 | #from detectron2 import model_zoo 8 | #from detectron2.config import get_cfg 9 | #from detectron2.engine import DefaultPredictor 10 | 11 | import slowfast.utils.checkpoint as cu 12 | from slowfast.datasets import cv2_transform 13 | from slowfast.models import build_model 14 | from slowfast.utils import logging 15 | from slowfast.visualization.utils import process_cv2_inputs 16 | 17 | logger = logging.get_logger(__name__) 18 | 19 | 20 | class Predictor: 21 | """ 22 | Action Predictor for action recognition. 23 | """ 24 | 25 | def __init__(self, cfg, gpu_id=None): 26 | """ 27 | Args: 28 | cfg (CfgNode): configs. Details can be found in 29 | slowfast/config/defaults.py 30 | gpu_id (Optional[int]): GPU id. 31 | """ 32 | if cfg.NUM_GPUS: 33 | self.gpu_id = ( 34 | torch.cuda.current_device() if gpu_id is None else gpu_id 35 | ) 36 | 37 | # Build the video model and print model statistics. 38 | self.model = build_model(cfg, gpu_id=gpu_id) 39 | self.model.eval() 40 | self.cfg = cfg 41 | 42 | if cfg.DETECTION.ENABLE: 43 | self.object_detector = Detectron2Predictor(cfg, gpu_id=self.gpu_id) 44 | 45 | logger.info("Start loading model weights.") 46 | cu.load_test_checkpoint(cfg, self.model) 47 | logger.info("Finish loading model weights") 48 | 49 | def __call__(self, task): 50 | """ 51 | Returns the prediction results for the current task. 52 | Args: 53 | task (TaskInfo object): task object that contain 54 | the necessary information for action prediction. (e.g. frames, boxes) 55 | Returns: 56 | task (TaskInfo object): the same task info object but filled with 57 | prediction values (a tensor) and the corresponding boxes for 58 | action detection task. 59 | """ 60 | if self.cfg.DETECTION.ENABLE: 61 | task = self.object_detector(task) 62 | 63 | frames, bboxes = task.frames, task.bboxes 64 | if bboxes is not None: 65 | bboxes = cv2_transform.scale_boxes( 66 | self.cfg.DATA.TEST_CROP_SIZE, 67 | bboxes, 68 | task.img_height, 69 | task.img_width, 70 | ) 71 | if self.cfg.DEMO.INPUT_FORMAT == "BGR": 72 | frames = [ 73 | cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in frames 74 | ] 75 | 76 | frames = [ 77 | cv2_transform.scale(self.cfg.DATA.TEST_CROP_SIZE, frame) 78 | for frame in frames 79 | ] 80 | inputs = process_cv2_inputs(frames, self.cfg) 81 | if bboxes is not None: 82 | index_pad = torch.full( 83 | size=(bboxes.shape[0], 1), 84 | fill_value=float(0), 85 | device=bboxes.device, 86 | ) 87 | 88 | # Pad frame index for each box. 89 | bboxes = torch.cat([index_pad, bboxes], axis=1) 90 | if self.cfg.NUM_GPUS > 0: 91 | # Transfer the data to the current GPU device. 92 | if isinstance(inputs, (list,)): 93 | for i in range(len(inputs)): 94 | inputs[i] = inputs[i].cuda( 95 | device=torch.device(self.gpu_id), non_blocking=True 96 | ) 97 | else: 98 | inputs = inputs.cuda( 99 | device=torch.device(self.gpu_id), non_blocking=True 100 | ) 101 | if self.cfg.DETECTION.ENABLE and not bboxes.shape[0]: 102 | preds = torch.tensor([]) 103 | else: 104 | preds = self.model(inputs, bboxes) 105 | 106 | if self.cfg.NUM_GPUS: 107 | preds = preds.cpu() 108 | if bboxes is not None: 109 | bboxes = bboxes.detach().cpu() 110 | 111 | preds = preds.detach() 112 | task.add_action_preds(preds) 113 | if bboxes is not None: 114 | task.add_bboxes(bboxes[:, 1:]) 115 | 116 | return task 117 | 118 | 119 | class ActionPredictor: 120 | """ 121 | Synchronous Action Prediction and Visualization pipeline with AsyncVis. 
122 | """ 123 | 124 | def __init__(self, cfg, async_vis=None, gpu_id=None): 125 | """ 126 | Args: 127 | cfg (CfgNode): configs. Details can be found in 128 | slowfast/config/defaults.py 129 | async_vis (AsyncVis object): asynchronous visualizer. 130 | gpu_id (Optional[int]): GPU id. 131 | """ 132 | self.predictor = Predictor(cfg=cfg, gpu_id=gpu_id) 133 | self.async_vis = async_vis 134 | 135 | def put(self, task): 136 | """ 137 | Make prediction and put the results in `async_vis` task queue. 138 | Args: 139 | task (TaskInfo object): task object that contain 140 | the necessary information for action prediction. (e.g. frames, boxes) 141 | """ 142 | task = self.predictor(task) 143 | self.async_vis.get_indices_ls.append(task.id) 144 | self.async_vis.put(task) 145 | 146 | def get(self): 147 | """ 148 | Get the visualized clips if any. 149 | """ 150 | try: 151 | task = self.async_vis.get() 152 | except (queue.Empty, IndexError): 153 | raise IndexError("Results are not available yet.") 154 | 155 | return task 156 | 157 | 158 | class Detectron2Predictor: 159 | """ 160 | Wrapper around Detectron2 to return the required predicted bounding boxes 161 | as a ndarray. 162 | """ 163 | 164 | def __init__(self, cfg, gpu_id=None): 165 | """ 166 | Args: 167 | cfg (CfgNode): configs. Details can be found in 168 | slowfast/config/defaults.py 169 | gpu_id (Optional[int]): GPU id. 170 | """ 171 | 172 | self.cfg = get_cfg() 173 | self.cfg.merge_from_file( 174 | model_zoo.get_config_file(cfg.DEMO.DETECTRON2_CFG) 175 | ) 176 | self.cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = cfg.DEMO.DETECTRON2_THRESH 177 | self.cfg.MODEL.WEIGHTS = cfg.DEMO.DETECTRON2_WEIGHTS 178 | self.cfg.INPUT.FORMAT = cfg.DEMO.INPUT_FORMAT 179 | if cfg.NUM_GPUS and gpu_id is None: 180 | gpu_id = torch.cuda.current_device() 181 | self.cfg.MODEL.DEVICE = ( 182 | "cuda:{}".format(gpu_id) if cfg.NUM_GPUS > 0 else "cpu" 183 | ) 184 | 185 | logger.info("Initialized Detectron2 Object Detection Model.") 186 | 187 | self.predictor = DefaultPredictor(self.cfg) 188 | 189 | def __call__(self, task): 190 | """ 191 | Return bounding boxes predictions as a tensor. 192 | Args: 193 | task (TaskInfo object): task object that contain 194 | the necessary information for action prediction. (e.g. frames) 195 | Returns: 196 | task (TaskInfo object): the same task info object but filled with 197 | prediction values (a tensor) and the corresponding boxes for 198 | action detection task. 
199 | """ 200 | middle_frame = task.frames[len(task.frames) // 2] 201 | outputs = self.predictor(middle_frame) 202 | # Get only human instances 203 | mask = outputs["instances"].pred_classes == 0 204 | pred_boxes = outputs["instances"].pred_boxes.tensor[mask] 205 | task.add_bboxes(pred_boxes) 206 | 207 | return task 208 | -------------------------------------------------------------------------------- /slurm_scripts/run_multi_node_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --constraint=volta32gb 5 | #SBATCH --cpus-per-task=10 6 | #SBATCH --error=/checkpoint/%u/jobs/%j.err 7 | #SBATCH --gres=gpu:8 8 | #SBATCH --job-name=vtf 9 | #SBATCH --mem=450GB 10 | #SBATCH --nodes=8 11 | #SBATCH --ntasks-per-node=8 12 | #SBATCH --open-mode=append 13 | #SBATCH --output=/checkpoint/%u/jobs/%j.out 14 | #SBATCH --partition=learnfair 15 | #SBATCH --signal=USR1@600 16 | #SBATCH --time=72:00:00 17 | # #SBATCH --mail-type=END,FAIL,REQUEUE 18 | 19 | module load anaconda3 20 | source activate motionformer 21 | 22 | export MASTER_ADDR=${SLURM_NODELIST:0:9}${SLURM_NODELIST:10:4} 23 | export MASTER_PORT=19500 24 | 25 | # debugging flags (optional) 26 | export NCCL_DEBUG=INFO 27 | export PYTHONFAULTHANDLER=1 28 | 29 | # set the network interface 30 | export NCCL_SOCKET_IFNAME=^docker0,lo 31 | echo $SLURMD_NODENAME $SLURM_JOB_ID $CUDA_VISIBLE_DEVICES 32 | master_node=${SLURM_NODELIST:0:9}${SLURM_NODELIST:10:4} 33 | dist_url="tcp://" 34 | dist_url+=$master_node 35 | dist_url+=:40000 36 | echo $dist_url 37 | 38 | 39 | if [ -z "$1" ] 40 | then 41 | CFG='configs/K400/joint_224_16x4.yaml' 42 | else 43 | CFG=$1 44 | fi 45 | 46 | if [ -z "$2" ] 47 | then 48 | ROOT_FOLDER="/checkpoint/motionformer" 49 | else 50 | ROOT_FOLDER=$2 51 | fi 52 | 53 | SAV_FOLDER="${ROOT_FOLDER}/${SLURM_JOB_ID}" 54 | mkdir -p ${SAV_FOLDER} 55 | 56 | # command 57 | srun --label python tools/run_net.py --init_method $dist_url --num_shards 8 --cfg $CFG \ 58 | NUM_GPUS 8 \ 59 | OUTPUT_DIR ${SAV_FOLDER} \ -------------------------------------------------------------------------------- /slurm_scripts/run_single_node_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --constraint=volta32gb 5 | #SBATCH --cpus-per-task=10 6 | #SBATCH --error=/checkpoint/%u/jobs/%j.err 7 | #SBATCH --gres=gpu:8 8 | #SBATCH --job-name=vtf 9 | #SBATCH --mem=450GB 10 | #SBATCH --nodes=1 11 | #SBATCH --ntasks-per-node=8 12 | #SBATCH --open-mode=append 13 | #SBATCH --output=/checkpoint/%u/jobs/%j.out 14 | #SBATCH --partition=learnfair 15 | #SBATCH --signal=USR1@600 16 | #SBATCH --time=72:00:00 17 | # #SBATCH --mail-type=END,FAIL,REQUEUE 18 | 19 | module load anaconda3 20 | source activate motionformer 21 | 22 | export MASTER_ADDR=${SLURM_NODELIST:0:9}${SLURM_NODELIST:10:4} 23 | export MASTER_PORT=19500 24 | 25 | # debugging flags (optional) 26 | export NCCL_DEBUG=INFO 27 | export PYTHONFAULTHANDLER=1 28 | 29 | # set the network interface 30 | export NCCL_SOCKET_IFNAME=^docker0,lo 31 | echo $SLURMD_NODENAME $SLURM_JOB_ID $CUDA_VISIBLE_DEVICES 32 | master_node=${SLURM_NODELIST:0:9}${SLURM_NODELIST:10:4} 33 | dist_url="tcp://" 34 | dist_url+=$master_node 35 | dist_url+=:40000 36 | echo $dist_url 37 | 38 | 39 | if [ -z "$1" ] 40 | then 41 | CFG='configs/K400/joint_224_16x4.yaml' 42 | else 43 | CFG=$1 44 | fi 45 | 46 | if [ -z "$2" ] 47 | then 48 | ROOT_FOLDER="/checkpoint/motionformer" 49 | else 50 | 
ROOT_FOLDER=$2 51 | fi 52 | 53 | SAV_FOLDER="${ROOT_FOLDER}/${SLURM_JOB_ID}" 54 | mkdir -p ${SAV_FOLDER} 55 | 56 | # command 57 | srun --label python tools/run_net.py --init_method $dist_url --num_shards 1 --cfg $CFG \ 58 | NUM_GPUS 8 \ 59 | OUTPUT_DIR ${SAV_FOLDER} \ -------------------------------------------------------------------------------- /slurm_scripts/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --constraint=volta32gb 5 | #SBATCH --cpus-per-task=10 6 | #SBATCH --error=/checkpoint/%u/jobs/%j.err 7 | #SBATCH --gres=gpu:8 8 | #SBATCH --job-name=vtf_test 9 | #SBATCH --mem=450GB 10 | #SBATCH --nodes=1 11 | #SBATCH --ntasks-per-node=8 12 | #SBATCH --open-mode=append 13 | #SBATCH --output=/checkpoint/%u/jobs/%j.out 14 | #SBATCH --partition=learnfair 15 | #SBATCH --signal=USR1@600 16 | #SBATCH --comment=icml21-deadline 17 | #SBATCH --time=12:00:00 18 | # #SBATCH --mail-user=mandelapatrick@fb.com 19 | # #SBATCH --mail-type=END,FAIL,REQUEUE 20 | 21 | module load anaconda3 22 | source activate pysf23_18 23 | 24 | export MASTER_ADDR=${SLURM_NODELIST:0:9}${SLURM_NODELIST:10:4} 25 | export MASTER_PORT=19500 26 | 27 | # debugging flags (optional) 28 | export NCCL_DEBUG=INFO 29 | export PYTHONFAULTHANDLER=1 30 | 31 | # set the network interface 32 | export NCCL_SOCKET_IFNAME=^docker0,lo 33 | echo $SLURMD_NODENAME $SLURM_JOB_ID $CUDA_VISIBLE_DEVICES 34 | master_node=${SLURM_NODELIST:0:9}${SLURM_NODELIST:10:4} 35 | dist_url="tcp://" 36 | dist_url+=$master_node 37 | dist_url+=:40000 38 | echo $dist_url 39 | 40 | if [ -z "$1" ] 41 | then 42 | CFG='configs/Kinetics/ViT_base_ST_8x16.yaml' 43 | else 44 | CFG=$1 45 | fi 46 | if [ -z "$2" ] 47 | then 48 | CKPT_PATH='/checkpoint/mandelapatrick/slowfast/36328386/checkpoints/checkpoint_epoch_00030.pyth' 49 | else 50 | CKPT_PATH=$2 51 | fi 52 | 53 | 54 | SAV_FOLDER="/checkpoint/${USER}/slowfast/${SLURM_JOB_ID}_test" 55 | mkdir -p ${SAV_FOLDER} 56 | 57 | # command 58 | python tools/run_net.py --cfg $CFG \ 59 | NUM_GPUS 8 \ 60 | TRAIN.ENABLE False \ 61 | TEST.CHECKPOINT_FILE_PATH $CKPT_PATH \ -------------------------------------------------------------------------------- /tools/benchmark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | """ 4 | A script to benchmark data loading. 5 | """ 6 | 7 | import slowfast.utils.logging as logging 8 | from slowfast.utils.benchmark import benchmark_data_loading 9 | from slowfast.utils.misc import launch_job 10 | from slowfast.utils.parser import load_config, parse_args 11 | 12 | logger = logging.get_logger(__name__) 13 | 14 | 15 | def main(): 16 | args = parse_args() 17 | cfg = load_config(args) 18 | 19 | launch_job( 20 | cfg=cfg, init_method=args.init_method, func=benchmark_data_loading 21 | ) 22 | 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /tools/run_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
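# Launch sketch (annotation, not part of the original source): main() runs
# training and/or multi-clip testing depending on TRAIN.ENABLE and TEST.ENABLE.
# An evaluation-only run, mirroring slurm_scripts/test.sh, would look like:
#     python tools/run_net.py --cfg $CFG NUM_GPUS 8 \
#         TRAIN.ENABLE False TEST.CHECKPOINT_FILE_PATH $CKPT_PATH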
3 | 4 | """Wrapper to train and test a video classification model.""" 5 | from slowfast.utils.misc import launch_job 6 | from slowfast.utils.parser import load_config, parse_args 7 | 8 | from test_net import test 9 | from train_net import train 10 | 11 | 12 | def main(): 13 | """ 14 | Main function to spawn the train and test process. 15 | """ 16 | args = parse_args() 17 | cfg = load_config(args) 18 | 19 | # Perform training. 20 | if cfg.TRAIN.ENABLE: 21 | launch_job(cfg=cfg, init_method=args.init_method, func=train) 22 | 23 | # Perform multi-clip testing. 24 | if cfg.TEST.ENABLE: 25 | launch_job(cfg=cfg, init_method=args.init_method, func=test) 26 | 27 | 28 | if __name__ == "__main__": 29 | main() 30 | -------------------------------------------------------------------------------- /tools/test_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Multi-view test a video classification model.""" 5 | 6 | import numpy as np 7 | import os 8 | import pickle 9 | import torch 10 | from iopath.common.file_io import g_pathmgr 11 | 12 | import slowfast.utils.checkpoint as cu 13 | import slowfast.utils.distributed as du 14 | import slowfast.utils.logging as logging 15 | import slowfast.utils.misc as misc 16 | import slowfast.visualization.tensorboard_vis as tb 17 | from slowfast.datasets import loader 18 | from slowfast.models import build_model 19 | from slowfast.utils.meters import AVAMeter, TestMeter, EPICTestMeter 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | 24 | @torch.no_grad() 25 | def perform_test(test_loader, model, test_meter, cfg, writer=None): 26 | """ 27 | For classification: 28 | Perform multi-view testing that uniformly samples N clips from a video along 29 | its temporal axis. For each clip, it takes 3 crops to cover the spatial 30 | dimension, followed by averaging the softmax scores across all Nx3 views to 31 | form a video-level prediction. All video predictions are compared to 32 | ground-truth labels and the final testing performance is logged. 33 | For detection: 34 | Perform fully-convolutional testing on the full frames without cropping. 35 | Args: 36 | test_loader (loader): video testing loader. 37 | model (model): the pretrained video model to test. 38 | test_meter (TestMeter): testing meters to log and ensemble the testing 39 | results. 40 | cfg (CfgNode): configs. Details can be found in 41 | slowfast/config/defaults.py 42 | writer (TensorboardWriter object, optional): TensorboardWriter object 43 | to write Tensorboard logs. 44 | """ 45 | # Enable eval mode. 46 | model.eval() 47 | test_meter.iter_tic() 48 | 49 | for cur_iter, (inputs, labels, video_idx, meta) in enumerate(test_loader): 50 | if cfg.NUM_GPUS: 51 | # Transfer the data to the current GPU device. 52 | if isinstance(inputs, (list,)): 53 | for i in range(len(inputs)): 54 | inputs[i] = inputs[i].cuda(non_blocking=True) 55 | else: 56 | inputs = inputs.cuda(non_blocking=True) 57 | 58 | # Transfer the labels and metadata to the current GPU device.
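# Annotation (not part of the original source): EPIC-Kitchens loaders return
# `labels` as a dict with 'verb' and 'noun' tensors, so both heads are moved
# to the GPU below; for single-label datasets `labels` is a plain tensor.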
59 | if isinstance(labels, (dict,)): 60 | labels = {k: v.cuda() for k, v in labels.items()} 61 | else: 62 | labels = labels.cuda() 63 | video_idx = video_idx.cuda() 64 | for key, val in meta.items(): 65 | if isinstance(val, (list,)): 66 | for i in range(len(val)): 67 | if not isinstance(val[i], (str,)): 68 | val[i] = val[i].cuda(non_blocking=True) 69 | else: 70 | meta[key] = val.cuda(non_blocking=True) 71 | test_meter.data_toc() 72 | 73 | with torch.cuda.amp.autocast(enabled=cfg.SOLVER.USE_MIXED_PRECISION): 74 | # Perform the forward pass. 75 | shuffle_frames = cfg.TEST.SHUFFLE_FRAMES 76 | if shuffle_frames: 77 | N = len(inputs) 78 | B, C, T, H, W = inputs[0].shape 79 | shuffled_indices = np.random.permutation(T) 80 | inputs = [inputs[0][:, :, shuffled_indices, :, :]] 81 | preds = model(inputs) 82 | 83 | # Gather all the predictions across all the devices to perform ensemble. 84 | if isinstance(labels, (dict,)): 85 | # Gather all the predictions across all the devices to perform ensemble. 86 | if cfg.NUM_GPUS > 1: 87 | verb_preds, verb_labels, video_idx = du.all_gather( 88 | [preds[0], labels['verb'], video_idx] 89 | ) 90 | 91 | noun_preds, noun_labels, video_idx = du.all_gather( 92 | [preds[1], labels['noun'], video_idx] 93 | ) 94 | meta = du.all_gather_unaligned(meta) 95 | metadata = {'narration_id': []} 96 | for i in range(len(meta)): 97 | metadata['narration_id'].extend(meta[i]['narration_id']) 98 | else: 99 | metadata = meta 100 | verb_preds, verb_labels, video_idx = preds[0], labels['verb'], video_idx 101 | noun_preds, noun_labels, video_idx = preds[1], labels['noun'], video_idx 102 | test_meter.iter_toc() 103 | # Update and log stats. 104 | test_meter.update_stats( 105 | (verb_preds.detach().cpu(), noun_preds.detach().cpu()), 106 | (verb_labels.detach().cpu(), noun_labels.detach().cpu()), 107 | metadata, 108 | video_idx.detach().cpu(), 109 | ) 110 | test_meter.log_iter_stats(cur_iter) 111 | else: 112 | if cfg.NUM_GPUS > 1: 113 | preds, labels, video_idx = du.all_gather( 114 | [preds, labels, video_idx] 115 | ) 116 | if cfg.NUM_GPUS: 117 | preds = preds.cpu() 118 | labels = labels.cpu() 119 | video_idx = video_idx.cpu() 120 | 121 | test_meter.iter_toc() 122 | # Update and log stats. 123 | test_meter.update_stats( 124 | preds.detach(), labels.detach(), video_idx.detach() 125 | ) 126 | test_meter.log_iter_stats(cur_iter) 127 | 128 | test_meter.iter_tic() 129 | 130 | # Log epoch stats and print the final testing results. 
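# Annotation (not part of the original source): for EPIC-Kitchens the verb/noun
# scores are pickled to OUTPUT_DIR/scores/validation.pkl, while for other
# datasets the ensembled video-level predictions live in the test meter and can
# optionally be dumped via TEST.SAVE_RESULTS_PATH.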
131 | if not cfg.DETECTION.ENABLE: 132 | if cfg.TEST.DATASET == 'Epickitchens': 133 | if du.is_master_proc(): 134 | results = {'verb_output': preds[0], 135 | 'noun_output': preds[1], 136 | 'narration_id': metadata} 137 | scores_path = os.path.join(cfg.OUTPUT_DIR, 'scores') 138 | if not os.path.exists(scores_path): 139 | os.makedirs(scores_path) 140 | TEST_SPLIT = "validation" 141 | file_path = os.path.join(scores_path, TEST_SPLIT + '.pkl') 142 | pickle.dump(results, open(file_path, 'wb')) 143 | else: 144 | all_preds = test_meter.video_preds.clone().detach() 145 | all_labels = test_meter.video_labels 146 | if cfg.NUM_GPUS: 147 | all_preds = all_preds.cpu() 148 | all_labels = all_labels.cpu() 149 | if writer is not None: 150 | writer.plot_eval(preds=all_preds, labels=all_labels) 151 | 152 | if cfg.TEST.SAVE_RESULTS_PATH != "": 153 | save_path = os.path.join(cfg.OUTPUT_DIR, cfg.TEST.SAVE_RESULTS_PATH) 154 | 155 | if du.is_root_proc(): 156 | with g_pathmgr.open(save_path, "wb") as f: 157 | pickle.dump([all_preds, all_labels], f) 158 | 159 | logger.info( 160 | "Successfully saved prediction results to {}".format(save_path) 161 | ) 162 | 163 | test_meter.finalize_metrics() 164 | return test_meter 165 | 166 | 167 | def test(cfg): 168 | """ 169 | Perform multi-view testing on the pretrained video model. 170 | Args: 171 | cfg (CfgNode): configs. Details can be found in 172 | slowfast/config/defaults.py 173 | """ 174 | # Set up environment. 175 | du.init_distributed_training(cfg) 176 | # Set random seed from configs. 177 | np.random.seed(cfg.RNG_SEED) 178 | torch.manual_seed(cfg.RNG_SEED) 179 | 180 | # Setup logging format. 181 | logging.setup_logging(cfg.OUTPUT_DIR) 182 | 183 | # Print config. 184 | logger.info("Test with config:") 185 | logger.info(cfg) 186 | 187 | # Build the video model and print model statistics. 188 | model = build_model(cfg) 189 | if du.is_master_proc() and cfg.LOG_MODEL_INFO: 190 | misc.log_model_info(model, cfg, use_train_input=False) 191 | 192 | cu.load_test_checkpoint(cfg, model) 193 | 194 | # Create video testing loaders. 195 | test_loader = loader.construct_loader(cfg, "test") 196 | logger.info("Testing model for {} iterations".format(len(test_loader))) 197 | 198 | if cfg.DETECTION.ENABLE: 199 | assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE or cfg.NUM_GPUS == 0 200 | test_meter = AVAMeter(len(test_loader), cfg, mode="test") 201 | else: 202 | assert ( 203 | test_loader.dataset.num_videos 204 | % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS) 205 | == 0 206 | ) 207 | # Create meters for multi-view testing. 208 | if cfg.TEST.DATASET == 'Epickitchens': 209 | test_meter = EPICTestMeter( 210 | len(test_loader.dataset) 211 | // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS), 212 | cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS, 213 | [97, 300], 214 | len(test_loader), 215 | ) 216 | else: 217 | test_meter = TestMeter( 218 | len(test_loader.dataset) 219 | // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS), 220 | cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS, 221 | cfg.MODEL.NUM_CLASSES, 222 | len(test_loader), 223 | ) 224 | 225 | # Set up writer for logging to Tensorboard format. 226 | if cfg.TENSORBOARD.ENABLE and du.is_master_proc( 227 | cfg.NUM_GPUS * cfg.NUM_SHARDS 228 | ): 229 | writer = tb.TensorboardWriter(cfg) 230 | else: 231 | writer = None 232 | 233 | # # Perform multi-view test on the entire dataset. 
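# Sizing note (annotation, not part of the original source): the meters above
# reserve one slot per video, i.e.
#     len(test_loader.dataset) // (TEST.NUM_ENSEMBLE_VIEWS * TEST.NUM_SPATIAL_CROPS)
# slots, and ensemble the scores of the NUM_ENSEMBLE_VIEWS x NUM_SPATIAL_CROPS
# clips per video (e.g. 10 temporal views x 3 spatial crops = 30 clips).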
234 | test_meter = perform_test(test_loader, model, test_meter, cfg, writer) 235 | if writer is not None: 236 | writer.close() 237 | --------------------------------------------------------------------------------