├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── configs ├── EK │ ├── divided_224_16x4.yaml │ ├── joint_224_16x4.yaml │ ├── motionformer_224_16x4.yaml │ ├── motionformer_224_32x3.yaml │ └── motionformer_336_16x4.yaml ├── K400 │ ├── divided_224_16x4.yaml │ ├── joint_224_16x4.yaml │ ├── motionformer_224_16x4.yaml │ ├── motionformer_224_32x3.yaml │ └── motionformer_336_16x8.yaml ├── K600 │ ├── divided_224_16x4.yaml │ ├── joint_224_16x4.yaml │ ├── motionformer_224_16x4.yaml │ ├── motionformer_224_32x3.yaml │ └── motionformer_336_16x4.yaml └── SSV2 │ ├── divided_224_16x4.yaml │ ├── joint_224_16x4.yaml │ ├── motionformer_224_16x4.yaml │ ├── motionformer_224_32x3.yaml │ └── motionformer_336_16x4.yaml ├── data ├── kinetics_400 │ └── preprocess.py └── kinetics_600 │ └── preprocess.py ├── environment.yml ├── figs ├── firstpage.png ├── qual_results.png ├── splash.png └── traj_attn_fig.png ├── index.html ├── run_with_submitit.py ├── setup.cfg ├── setup.py ├── slowfast ├── __init__.py ├── config │ ├── __init__.py │ ├── custom_config.py │ └── defaults.py ├── datasets │ ├── DATASET.md │ ├── __init__.py │ ├── autoaugment.py │ ├── build.py │ ├── cv2_transform.py │ ├── decoder.py │ ├── epickitchens.py │ ├── epickitchens_record.py │ ├── frame_loader.py │ ├── kinetics.py │ ├── loader.py │ ├── multigrid_helper.py │ ├── random_erasing.py │ ├── samplers.py │ ├── ssv2.py │ ├── transform.py │ ├── utils.py │ ├── video_container.py │ └── video_record.py ├── models │ ├── __init__.py │ ├── adamw.py │ ├── batchnorm_helper.py │ ├── build.py │ ├── losses.py │ ├── nystrom_helper.py │ ├── optimizer.py │ ├── orthoformer_helper.py │ ├── performer_helper.py │ ├── video_model_builder.py │ └── vit_helper.py ├── utils │ ├── __init__.py │ ├── benchmark.py │ ├── bn_helper.py │ ├── c2_model_loading.py │ ├── checkpoint.py │ ├── distributed.py │ ├── env.py │ ├── logging.py │ ├── lr_policy.py │ ├── meters.py │ ├── metrics.py │ ├── misc.py │ ├── multigrid.py │ ├── multiprocessing.py │ ├── parser.py │ └── weight_init_helper.py └── visualization │ ├── __init__.py │ ├── async_predictor.py │ ├── ava_demo_precomputed_boxes.py │ ├── demo_loader.py │ ├── gradcam_utils.py │ ├── prediction_vis.py │ ├── predictor.py │ ├── tensorboard_vis.py │ ├── utils.py │ └── video_visualizer.py ├── slurm_scripts ├── run_multi_node_job.sh ├── run_single_node_job.sh └── test.sh └── tools ├── benchmark.py ├── run_net.py ├── test_net.py └── train_net.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | bin/ 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | *.pkl 23 | *.json 24 | *.npy 25 | *.csv 26 | 27 | # Installer logs 28 | pip-log.txt 29 | pip-delete-this-directory.txt 30 | 31 | # Unit test / coverage reports 32 | .tox/ 33 | .coverage 34 | .cache 35 | nosetests.xml 36 | coverage.xml 37 | 38 | # Translations 39 | *.mo 40 | 41 | # Mr Developer 42 | .mr.developer.cfg 43 | .project 44 | .pydevproject 45 | 46 | # Rope 47 | .ropeproject 48 | 49 | # Django stuff: 50 | *.log 51 | *.pot 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code 
of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Motionformer 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `master`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to Motionformer, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2019, Facebook, Inc 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /configs/EK/divided_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Epickitchens 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: data/epic_kitchens/ 20 | USE_RAND_AUGMENT: True 21 | RE_PROB: 0.0 22 | USE_REPEATED_AUG: False 23 | USE_RANDOM_RESIZE_CROPS: False 24 | COLORJITTER: False 25 | GRAYSCALE: False 26 | GAUSSIAN: False 27 | SOLVER: 28 | BASE_LR: 1e-4 29 | LR_POLICY: steps_with_relative_lrs 30 | LRS: [1, 0.1, 0.01] 31 | STEPS: [0, 30, 40] 32 | MAX_EPOCH: 50 33 | MOMENTUM: 0.9 34 | WEIGHT_DECAY: 5e-2 35 | WARMUP_EPOCHS: 0.0 36 | OPTIMIZING_METHOD: adamw 37 | USE_MIXED_PRECISION: True 38 | SMOOTHING: 0.2 39 | SLOWFAST: 40 | ALPHA: 8 41 | VIT: 42 | PATCH_SIZE: 16 43 | PATCH_SIZE_TEMP: 2 44 | CHANNELS: 3 45 | EMBED_DIM: 768 46 | DEPTH: 12 47 | NUM_HEADS: 12 48 | MLP_RATIO: 4 49 | QKV_BIAS: True 50 | VIDEO_INPUT: True 51 | TEMPORAL_RESOLUTION: 8 52 | USE_MLP: True 53 | DROP: 0.0 54 | POS_DROPOUT: 0.0 55 | DROP_PATH: 0.2 56 | IM_PRETRAINED: True 57 | HEAD_DROPOUT: 0.0 58 | HEAD_ACT: tanh 59 | PRETRAINED_WEIGHTS: vit_1k 60 | ATTN_LAYER: divided 61 | MODEL: 62 | NUM_CLASSES: 97 63 | ARCH: slow 64 | MODEL_NAME: VisionTransformer 65 | LOSS_FUNC: cross_entropy 66 | TEST: 67 | ENABLE: True 68 | DATASET: Epickitchens 69 | BATCH_SIZE: 64 70 | NUM_ENSEMBLE_VIEWS: 10 71 | NUM_SPATIAL_CROPS: 3 72 | DATA_LOADER: 73 | NUM_WORKERS: 8 74 | PIN_MEMORY: True 75 | NUM_GPUS: 8 76 | NUM_SHARDS: 4 77 | RNG_SEED: 0 78 | OUTPUT_DIR: . 
79 | TENSORBOARD: 80 | ENABLE: True 81 | -------------------------------------------------------------------------------- /configs/EK/joint_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Epickitchens 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: data/epic_kitchens/ 20 | USE_RAND_AUGMENT: True 21 | RE_PROB: 0.0 22 | USE_REPEATED_AUG: False 23 | USE_RANDOM_RESIZE_CROPS: False 24 | COLORJITTER: False 25 | GRAYSCALE: False 26 | GAUSSIAN: False 27 | SOLVER: 28 | BASE_LR: 1e-4 29 | LR_POLICY: steps_with_relative_lrs 30 | LRS: [1, 0.1, 0.01] 31 | STEPS: [0, 30, 40] 32 | MAX_EPOCH: 50 33 | MOMENTUM: 0.9 34 | WEIGHT_DECAY: 5e-2 35 | WARMUP_EPOCHS: 0.0 36 | OPTIMIZING_METHOD: adamw 37 | USE_MIXED_PRECISION: True 38 | SMOOTHING: 0.2 39 | SLOWFAST: 40 | ALPHA: 8 41 | VIT: 42 | PATCH_SIZE: 16 43 | PATCH_SIZE_TEMP: 2 44 | CHANNELS: 3 45 | EMBED_DIM: 768 46 | DEPTH: 12 47 | NUM_HEADS: 12 48 | MLP_RATIO: 4 49 | QKV_BIAS: True 50 | VIDEO_INPUT: True 51 | TEMPORAL_RESOLUTION: 8 52 | USE_MLP: True 53 | DROP: 0.0 54 | POS_DROPOUT: 0.0 55 | DROP_PATH: 0.2 56 | IM_PRETRAINED: True 57 | HEAD_DROPOUT: 0.0 58 | HEAD_ACT: tanh 59 | PRETRAINED_WEIGHTS: vit_1k 60 | ATTN_LAYER: joint 61 | MODEL: 62 | NUM_CLASSES: 97 63 | ARCH: slow 64 | MODEL_NAME: VisionTransformer 65 | LOSS_FUNC: cross_entropy 66 | TEST: 67 | ENABLE: True 68 | DATASET: Epickitchens 69 | BATCH_SIZE: 64 70 | NUM_ENSEMBLE_VIEWS: 10 71 | NUM_SPATIAL_CROPS: 3 72 | DATA_LOADER: 73 | NUM_WORKERS: 8 74 | PIN_MEMORY: True 75 | NUM_GPUS: 8 76 | NUM_SHARDS: 4 77 | RNG_SEED: 0 78 | OUTPUT_DIR: . 
79 | TENSORBOARD: 80 | ENABLE: True 81 | -------------------------------------------------------------------------------- /configs/EK/motionformer_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Epickitchens 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: data/epic_kitchens/ 20 | USE_RAND_AUGMENT: True 21 | RE_PROB: 0.0 22 | USE_REPEATED_AUG: False 23 | USE_RANDOM_RESIZE_CROPS: False 24 | COLORJITTER: False 25 | GRAYSCALE: False 26 | GAUSSIAN: False 27 | SOLVER: 28 | BASE_LR: 1e-4 29 | LR_POLICY: steps_with_relative_lrs 30 | LRS: [1, 0.1, 0.01] 31 | STEPS: [0, 30, 40] 32 | MAX_EPOCH: 50 33 | MOMENTUM: 0.9 34 | WEIGHT_DECAY: 5e-2 35 | WARMUP_EPOCHS: 0.0 36 | OPTIMIZING_METHOD: adamw 37 | USE_MIXED_PRECISION: True 38 | SMOOTHING: 0.2 39 | SLOWFAST: 40 | ALPHA: 8 41 | VIT: 42 | PATCH_SIZE: 16 43 | PATCH_SIZE_TEMP: 2 44 | CHANNELS: 3 45 | EMBED_DIM: 768 46 | DEPTH: 12 47 | NUM_HEADS: 12 48 | MLP_RATIO: 4 49 | QKV_BIAS: True 50 | VIDEO_INPUT: True 51 | TEMPORAL_RESOLUTION: 8 52 | USE_MLP: True 53 | DROP: 0.0 54 | POS_DROPOUT: 0.0 55 | DROP_PATH: 0.2 56 | IM_PRETRAINED: True 57 | HEAD_DROPOUT: 0.0 58 | HEAD_ACT: tanh 59 | PRETRAINED_WEIGHTS: vit_1k 60 | ATTN_LAYER: trajectory 61 | MODEL: 62 | NUM_CLASSES: 97 63 | ARCH: slow 64 | MODEL_NAME: VisionTransformer 65 | LOSS_FUNC: cross_entropy 66 | TEST: 67 | ENABLE: True 68 | DATASET: Epickitchens 69 | BATCH_SIZE: 64 70 | NUM_ENSEMBLE_VIEWS: 10 71 | NUM_SPATIAL_CROPS: 3 72 | DATA_LOADER: 73 | NUM_WORKERS: 8 74 | PIN_MEMORY: True 75 | NUM_GPUS: 8 76 | NUM_SHARDS: 4 77 | RNG_SEED: 0 78 | OUTPUT_DIR: . 
79 | TENSORBOARD: 80 | ENABLE: True 81 | -------------------------------------------------------------------------------- /configs/EK/motionformer_224_32x3.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Epickitchens 4 | BATCH_SIZE: 12 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 32 12 | SAMPLING_RATE: 3 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: data/epic_kitchens/ 20 | USE_RAND_AUGMENT: True 21 | RE_PROB: 0.0 22 | USE_REPEATED_AUG: False 23 | USE_RANDOM_RESIZE_CROPS: False 24 | COLORJITTER: False 25 | GRAYSCALE: False 26 | GAUSSIAN: False 27 | SOLVER: 28 | BASE_LR: 0.375e-4 29 | LR_POLICY: steps_with_relative_lrs 30 | LRS: [1, 0.1, 0.01] 31 | STEPS: [0, 30, 40] 32 | MAX_EPOCH: 50 33 | MOMENTUM: 0.9 34 | WEIGHT_DECAY: 5e-2 35 | WARMUP_EPOCHS: 0.0 36 | OPTIMIZING_METHOD: adamw 37 | USE_MIXED_PRECISION: True 38 | SMOOTHING: 0.2 39 | SLOWFAST: 40 | ALPHA: 8 41 | VIT: 42 | PATCH_SIZE: 16 43 | PATCH_SIZE_TEMP: 2 44 | CHANNELS: 3 45 | EMBED_DIM: 768 46 | DEPTH: 12 47 | NUM_HEADS: 12 48 | MLP_RATIO: 4 49 | QKV_BIAS: True 50 | VIDEO_INPUT: True 51 | TEMPORAL_RESOLUTION: 16 52 | USE_MLP: True 53 | DROP: 0.0 54 | POS_DROPOUT: 0.0 55 | DROP_PATH: 0.2 56 | IM_PRETRAINED: True 57 | HEAD_DROPOUT: 0.0 58 | HEAD_ACT: tanh 59 | PRETRAINED_WEIGHTS: vit_1k 60 | ATTN_LAYER: trajectory 61 | MODEL: 62 | NUM_CLASSES: 97 63 | ARCH: slow 64 | MODEL_NAME: VisionTransformer 65 | LOSS_FUNC: cross_entropy 66 | TEST: 67 | ENABLE: True 68 | DATASET: Epickitchens 69 | BATCH_SIZE: 64 70 | NUM_ENSEMBLE_VIEWS: 10 71 | NUM_SPATIAL_CROPS: 3 72 | DATA_LOADER: 73 | NUM_WORKERS: 8 74 | PIN_MEMORY: True 75 | NUM_GPUS: 8 76 | NUM_SHARDS: 4 77 | RNG_SEED: 0 78 | OUTPUT_DIR: . 
79 | TENSORBOARD: 80 | ENABLE: True 81 | -------------------------------------------------------------------------------- /configs/EK/motionformer_336_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Epickitchens 4 | BATCH_SIZE: 12 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [384, 480] 14 | TRAIN_CROP_SIZE: 336 15 | TEST_CROP_SIZE: 336 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: data/epic_kitchens/ 20 | USE_RAND_AUGMENT: True 21 | RE_PROB: 0.0 22 | USE_REPEATED_AUG: False 23 | USE_RANDOM_RESIZE_CROPS: False 24 | COLORJITTER: False 25 | GRAYSCALE: False 26 | GAUSSIAN: False 27 | SOLVER: 28 | BASE_LR: 0.375e-4 29 | LR_POLICY: steps_with_relative_lrs 30 | LRS: [1, 0.1, 0.01] 31 | STEPS: [0, 30, 40] 32 | MAX_EPOCH: 50 33 | MOMENTUM: 0.9 34 | WEIGHT_DECAY: 5e-2 35 | WARMUP_EPOCHS: 0.0 36 | OPTIMIZING_METHOD: adamw 37 | USE_MIXED_PRECISION: True 38 | SMOOTHING: 0.2 39 | SLOWFAST: 40 | ALPHA: 8 41 | VIT: 42 | PATCH_SIZE: 16 43 | PATCH_SIZE_TEMP: 2 44 | CHANNELS: 3 45 | EMBED_DIM: 768 46 | DEPTH: 12 47 | NUM_HEADS: 12 48 | MLP_RATIO: 4 49 | QKV_BIAS: True 50 | VIDEO_INPUT: True 51 | TEMPORAL_RESOLUTION: 8 52 | USE_MLP: True 53 | DROP: 0.0 54 | POS_DROPOUT: 0.0 55 | DROP_PATH: 0.2 56 | IM_PRETRAINED: True 57 | HEAD_DROPOUT: 0.0 58 | HEAD_ACT: tanh 59 | PRETRAINED_WEIGHTS: vit_1k 60 | ATTN_LAYER: trajectory 61 | MODEL: 62 | NUM_CLASSES: 97 63 | ARCH: slow 64 | MODEL_NAME: VisionTransformer 65 | LOSS_FUNC: cross_entropy 66 | TEST: 67 | ENABLE: True 68 | DATASET: Epickitchens 69 | BATCH_SIZE: 64 70 | NUM_ENSEMBLE_VIEWS: 10 71 | NUM_SPATIAL_CROPS: 3 72 | DATA_LOADER: 73 | NUM_WORKERS: 8 74 | PIN_MEMORY: True 75 | NUM_GPUS: 8 76 | NUM_SHARDS: 4 77 | RNG_SEED: 0 78 | OUTPUT_DIR: . 
79 | TENSORBOARD: 80 | ENABLE: True 81 | -------------------------------------------------------------------------------- /configs/K400/divided_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 16 10 | SAMPLING_RATE: 4 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 224 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_400/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 1e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 8 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: divided 59 | MODEL: 60 | NUM_CLASSES: 400 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K400/joint_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 16 10 | SAMPLING_RATE: 4 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 224 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_400/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 1e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 8 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: joint 59 | MODEL: 60 | NUM_CLASSES: 400 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K400/motionformer_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 16 10 | SAMPLING_RATE: 4 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 224 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_400/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 1e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 8 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: trajectory 59 | MODEL: 60 | NUM_CLASSES: 400 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K400/motionformer_224_32x3.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 12 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 32 10 | SAMPLING_RATE: 3 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 224 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_400/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: True 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 0.375e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 16 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: trajectory 59 | MODEL: 60 | NUM_CLASSES: 400 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K400/motionformer_336_16x8.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 12 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 16 10 | SAMPLING_RATE: 8 11 | TRAIN_JITTER_SCALES: [384, 480] 12 | TRAIN_CROP_SIZE: 336 13 | TEST_CROP_SIZE: 336 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_400/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 0.375e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 8 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: trajectory 59 | MODEL: 60 | NUM_CLASSES: 400 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K600/divided_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 16 10 | SAMPLING_RATE: 4 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 224 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_600/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 1e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 8 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: divided 59 | MODEL: 60 | NUM_CLASSES: 600 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K600/joint_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 16 10 | SAMPLING_RATE: 4 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 224 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_600/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 1e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 8 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: joint 59 | MODEL: 60 | NUM_CLASSES: 600 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K600/motionformer_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 16 10 | SAMPLING_RATE: 4 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 224 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_600/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 1e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 8 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: trajectory 59 | MODEL: 60 | NUM_CLASSES: 600 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K600/motionformer_224_32x3.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 12 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 32 10 | SAMPLING_RATE: 3 11 | TRAIN_JITTER_SCALES: [256, 320] 12 | TRAIN_CROP_SIZE: 224 13 | TEST_CROP_SIZE: 224 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_600/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 0.375e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 16 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: trajectory 59 | MODEL: 60 | NUM_CLASSES: 600 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/K600/motionformer_336_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Kinetics 4 | BATCH_SIZE: 12 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | NUM_FRAMES: 16 10 | SAMPLING_RATE: 4 11 | TRAIN_JITTER_SCALES: [384, 480] 12 | TRAIN_CROP_SIZE: 336 13 | TEST_CROP_SIZE: 336 14 | INPUT_CHANNEL_NUM: [3] 15 | MEAN: [0.5, 0.5, 0.5] 16 | STD: [0.5, 0.5, 0.5] 17 | PATH_TO_DATA_DIR: data/kinetics_600/ 18 | USE_RAND_AUGMENT: False 19 | RE_PROB: 0.0 20 | USE_REPEATED_AUG: False 21 | USE_RANDOM_RESIZE_CROPS: False 22 | COLORJITTER: True 23 | GRAYSCALE: False 24 | GAUSSIAN: False 25 | SOLVER: 26 | BASE_LR: 0.375e-4 27 | LR_POLICY: steps_with_relative_lrs 28 | LRS: [1, 0.1, 0.01] 29 | STEPS: [0, 20, 30] 30 | MAX_EPOCH: 35 31 | MOMENTUM: 0.9 32 | WEIGHT_DECAY: 5e-2 33 | WARMUP_EPOCHS: 0.0 34 | OPTIMIZING_METHOD: adamw 35 | USE_MIXED_PRECISION: True 36 | SMOOTHING: 0.2 37 | SLOWFAST: 38 | ALPHA: 8 39 | VIT: 40 | PATCH_SIZE: 16 41 | PATCH_SIZE_TEMP: 2 42 | CHANNELS: 3 43 | EMBED_DIM: 768 44 | DEPTH: 12 45 | NUM_HEADS: 12 46 | MLP_RATIO: 4 47 | QKV_BIAS: True 48 | VIDEO_INPUT: True 49 | TEMPORAL_RESOLUTION: 8 50 | USE_MLP: True 51 | DROP: 0.0 52 | POS_DROPOUT: 0.0 53 | DROP_PATH: 0.2 54 | IM_PRETRAINED: True 55 | HEAD_DROPOUT: 0.0 56 | HEAD_ACT: tanh 57 | PRETRAINED_WEIGHTS: vit_1k 58 | ATTN_LAYER: trajectory 59 | MODEL: 60 | NUM_CLASSES: 600 61 | ARCH: slow 62 | MODEL_NAME: VisionTransformer 63 | LOSS_FUNC: cross_entropy 64 | TEST: 65 | ENABLE: True 66 | DATASET: Kinetics 67 | BATCH_SIZE: 64 68 | NUM_ENSEMBLE_VIEWS: 10 69 | NUM_SPATIAL_CROPS: 3 70 | DATA_LOADER: 71 | NUM_WORKERS: 8 72 | PIN_MEMORY: True 73 | NUM_GPUS: 8 74 | NUM_SHARDS: 4 75 | RNG_SEED: 0 76 | OUTPUT_DIR: . 
77 | TENSORBOARD: 78 | ENABLE: True 79 | -------------------------------------------------------------------------------- /configs/SSV2/divided_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Ssv2 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2 20 | PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames 21 | INV_UNIFORM_SAMPLE: True 22 | RANDOM_FLIP: False 23 | REVERSE_INPUT_CHANNEL: True 24 | USE_RAND_AUGMENT: True 25 | RE_PROB: 0.0 26 | USE_REPEATED_AUG: False 27 | USE_RANDOM_RESIZE_CROPS: False 28 | COLORJITTER: False 29 | GRAYSCALE: False 30 | GAUSSIAN: False 31 | SOLVER: 32 | BASE_LR: 1e-4 33 | LR_POLICY: steps_with_relative_lrs 34 | LRS: [1, 0.1, 0.01] 35 | STEPS: [0, 20, 30] 36 | MAX_EPOCH: 35 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 5e-2 39 | WARMUP_EPOCHS: 0.0 40 | OPTIMIZING_METHOD: adamw 41 | USE_MIXED_PRECISION: True 42 | SMOOTHING: 0.2 43 | SLOWFAST: 44 | ALPHA: 8 45 | VIT: 46 | PATCH_SIZE: 16 47 | PATCH_SIZE_TEMP: 2 48 | CHANNELS: 3 49 | EMBED_DIM: 768 50 | DEPTH: 12 51 | NUM_HEADS: 12 52 | MLP_RATIO: 4 53 | QKV_BIAS: True 54 | VIDEO_INPUT: True 55 | TEMPORAL_RESOLUTION: 8 56 | USE_MLP: True 57 | DROP: 0.0 58 | POS_DROPOUT: 0.0 59 | DROP_PATH: 0.2 60 | IM_PRETRAINED: True 61 | HEAD_DROPOUT: 0.0 62 | HEAD_ACT: tanh 63 | PRETRAINED_WEIGHTS: vit_1k 64 | ATTN_LAYER: divided 65 | MODEL: 66 | NUM_CLASSES: 174 67 | ARCH: slow 68 | MODEL_NAME: VisionTransformer 69 | LOSS_FUNC: cross_entropy 70 | TEST: 71 | ENABLE: True 72 | DATASET: Ssv2 73 | BATCH_SIZE: 64 74 | NUM_ENSEMBLE_VIEWS: 1 75 | NUM_SPATIAL_CROPS: 3 76 | DATA_LOADER: 77 | NUM_WORKERS: 4 78 | PIN_MEMORY: True 79 | NUM_GPUS: 8 80 | NUM_SHARDS: 4 81 | RNG_SEED: 0 82 | OUTPUT_DIR: . 
83 | TENSORBOARD: 84 | ENABLE: True 85 | -------------------------------------------------------------------------------- /configs/SSV2/joint_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Ssv2 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2 20 | PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames 21 | INV_UNIFORM_SAMPLE: True 22 | RANDOM_FLIP: False 23 | REVERSE_INPUT_CHANNEL: True 24 | USE_RAND_AUGMENT: False 25 | RE_PROB: 0.0 26 | USE_REPEATED_AUG: False 27 | USE_RANDOM_RESIZE_CROPS: False 28 | COLORJITTER: False 29 | GRAYSCALE: False 30 | GAUSSIAN: False 31 | SOLVER: 32 | BASE_LR: 1e-4 33 | LR_POLICY: steps_with_relative_lrs 34 | LRS: [1, 0.1, 0.01] 35 | STEPS: [0, 20, 30] 36 | MAX_EPOCH: 35 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 5e-2 39 | WARMUP_EPOCHS: 0.0 40 | OPTIMIZING_METHOD: adamw 41 | USE_MIXED_PRECISION: True 42 | SMOOTHING: 0.2 43 | SLOWFAST: 44 | ALPHA: 8 45 | VIT: 46 | PATCH_SIZE: 16 47 | PATCH_SIZE_TEMP: 2 48 | CHANNELS: 3 49 | EMBED_DIM: 768 50 | DEPTH: 12 51 | NUM_HEADS: 12 52 | MLP_RATIO: 4 53 | QKV_BIAS: True 54 | VIDEO_INPUT: True 55 | TEMPORAL_RESOLUTION: 8 56 | USE_MLP: True 57 | DROP: 0.0 58 | POS_DROPOUT: 0.0 59 | DROP_PATH: 0.2 60 | IM_PRETRAINED: True 61 | HEAD_DROPOUT: 0.0 62 | HEAD_ACT: tanh 63 | PRETRAINED_WEIGHTS: vit_1k 64 | ATTN_LAYER: joint 65 | MODEL: 66 | NUM_CLASSES: 174 67 | ARCH: slow 68 | MODEL_NAME: VisionTransformer 69 | LOSS_FUNC: cross_entropy 70 | TEST: 71 | ENABLE: True 72 | DATASET: Ssv2 73 | BATCH_SIZE: 64 74 | NUM_ENSEMBLE_VIEWS: 1 75 | NUM_SPATIAL_CROPS: 3 76 | DATA_LOADER: 77 | NUM_WORKERS: 4 78 | PIN_MEMORY: True 79 | NUM_GPUS: 8 80 | NUM_SHARDS: 4 81 | RNG_SEED: 0 82 | OUTPUT_DIR: . 
83 | TENSORBOARD: 84 | ENABLE: True 85 | -------------------------------------------------------------------------------- /configs/SSV2/motionformer_224_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Ssv2 4 | BATCH_SIZE: 32 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2 20 | PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames 21 | INV_UNIFORM_SAMPLE: True 22 | RANDOM_FLIP: False 23 | REVERSE_INPUT_CHANNEL: True 24 | USE_RAND_AUGMENT: True 25 | RE_PROB: 0.0 26 | USE_REPEATED_AUG: False 27 | USE_RANDOM_RESIZE_CROPS: False 28 | COLORJITTER: False 29 | GRAYSCALE: False 30 | GAUSSIAN: False 31 | SOLVER: 32 | BASE_LR: 1e-4 33 | LR_POLICY: steps_with_relative_lrs 34 | LRS: [1, 0.1, 0.01] 35 | STEPS: [0, 20, 30] 36 | MAX_EPOCH: 35 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 5e-2 39 | WARMUP_EPOCHS: 0.0 40 | OPTIMIZING_METHOD: adamw 41 | USE_MIXED_PRECISION: True 42 | SMOOTHING: 0.2 43 | SLOWFAST: 44 | ALPHA: 8 45 | VIT: 46 | PATCH_SIZE: 16 47 | PATCH_SIZE_TEMP: 2 48 | CHANNELS: 3 49 | EMBED_DIM: 768 50 | DEPTH: 12 51 | NUM_HEADS: 12 52 | MLP_RATIO: 4 53 | QKV_BIAS: True 54 | VIDEO_INPUT: True 55 | TEMPORAL_RESOLUTION: 8 56 | USE_MLP: True 57 | DROP: 0.0 58 | POS_DROPOUT: 0.0 59 | DROP_PATH: 0.2 60 | IM_PRETRAINED: True 61 | HEAD_DROPOUT: 0.0 62 | HEAD_ACT: tanh 63 | PRETRAINED_WEIGHTS: vit_1k 64 | ATTN_LAYER: trajectory 65 | MODEL: 66 | NUM_CLASSES: 174 67 | ARCH: slow 68 | MODEL_NAME: VisionTransformer 69 | LOSS_FUNC: cross_entropy 70 | TEST: 71 | ENABLE: True 72 | DATASET: Ssv2 73 | BATCH_SIZE: 64 74 | NUM_ENSEMBLE_VIEWS: 1 75 | NUM_SPATIAL_CROPS: 3 76 | DATA_LOADER: 77 | NUM_WORKERS: 4 78 | PIN_MEMORY: True 79 | NUM_GPUS: 8 80 | NUM_SHARDS: 4 81 | RNG_SEED: 0 82 | OUTPUT_DIR: . 
83 | TENSORBOARD: 84 | ENABLE: True 85 | -------------------------------------------------------------------------------- /configs/SSV2/motionformer_224_32x3.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Ssv2 4 | BATCH_SIZE: 12 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 32 12 | SAMPLING_RATE: 3 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 224 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2 20 | PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames 21 | INV_UNIFORM_SAMPLE: True 22 | RANDOM_FLIP: False 23 | REVERSE_INPUT_CHANNEL: True 24 | USE_RAND_AUGMENT: True 25 | RE_PROB: 0.0 26 | USE_REPEATED_AUG: False 27 | USE_RANDOM_RESIZE_CROPS: False 28 | COLORJITTER: False 29 | GRAYSCALE: False 30 | GAUSSIAN: False 31 | SOLVER: 32 | BASE_LR: 0.375e-4 33 | LR_POLICY: steps_with_relative_lrs 34 | LRS: [1, 0.1, 0.01] 35 | STEPS: [0, 20, 30] 36 | MAX_EPOCH: 35 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 5e-2 39 | WARMUP_EPOCHS: 0.0 40 | OPTIMIZING_METHOD: adamw 41 | USE_MIXED_PRECISION: True 42 | SMOOTHING: 0.2 43 | SLOWFAST: 44 | ALPHA: 8 45 | VIT: 46 | PATCH_SIZE: 16 47 | PATCH_SIZE_TEMP: 2 48 | CHANNELS: 3 49 | EMBED_DIM: 768 50 | DEPTH: 12 51 | NUM_HEADS: 12 52 | MLP_RATIO: 4 53 | QKV_BIAS: True 54 | VIDEO_INPUT: True 55 | TEMPORAL_RESOLUTION: 16 56 | USE_MLP: True 57 | DROP: 0.0 58 | POS_DROPOUT: 0.0 59 | DROP_PATH: 0.2 60 | IM_PRETRAINED: True 61 | HEAD_DROPOUT: 0.0 62 | HEAD_ACT: tanh 63 | PRETRAINED_WEIGHTS: vit_1k 64 | ATTN_LAYER: trajectory 65 | MODEL: 66 | NUM_CLASSES: 174 67 | ARCH: slow 68 | MODEL_NAME: VisionTransformer 69 | LOSS_FUNC: cross_entropy 70 | TEST: 71 | ENABLE: True 72 | DATASET: Ssv2 73 | BATCH_SIZE: 64 74 | NUM_ENSEMBLE_VIEWS: 1 75 | NUM_SPATIAL_CROPS: 3 76 | DATA_LOADER: 77 | NUM_WORKERS: 4 78 | PIN_MEMORY: True 79 | NUM_GPUS: 8 80 | NUM_SHARDS: 4 81 | RNG_SEED: 0 82 | OUTPUT_DIR: . 
83 | TENSORBOARD: 84 | ENABLE: True 85 | -------------------------------------------------------------------------------- /configs/SSV2/motionformer_336_16x4.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: Ssv2 4 | BATCH_SIZE: 12 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | CHECKPOINT_EPOCH_RESET: True 9 | CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth 10 | DATA: 11 | NUM_FRAMES: 16 12 | SAMPLING_RATE: 4 13 | TRAIN_JITTER_SCALES: [384, 480] 14 | TRAIN_CROP_SIZE: 336 15 | TEST_CROP_SIZE: 336 16 | INPUT_CHANNEL_NUM: [3] 17 | MEAN: [0.5, 0.5, 0.5] 18 | STD: [0.5, 0.5, 0.5] 19 | PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2 20 | PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames 21 | INV_UNIFORM_SAMPLE: True 22 | RANDOM_FLIP: False 23 | REVERSE_INPUT_CHANNEL: True 24 | USE_RAND_AUGMENT: True 25 | RE_PROB: 0.0 26 | USE_REPEATED_AUG: False 27 | USE_RANDOM_RESIZE_CROPS: False 28 | COLORJITTER: False 29 | GRAYSCALE: False 30 | GAUSSIAN: False 31 | SOLVER: 32 | BASE_LR: 0.375e-4 33 | LR_POLICY: steps_with_relative_lrs 34 | LRS: [1, 0.1, 0.01] 35 | STEPS: [0, 20, 30] 36 | MAX_EPOCH: 35 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 5e-2 39 | WARMUP_EPOCHS: 0.0 40 | OPTIMIZING_METHOD: adamw 41 | USE_MIXED_PRECISION: True 42 | SMOOTHING: 0.2 43 | SLOWFAST: 44 | ALPHA: 8 45 | VIT: 46 | PATCH_SIZE: 16 47 | PATCH_SIZE_TEMP: 2 48 | CHANNELS: 3 49 | EMBED_DIM: 768 50 | DEPTH: 12 51 | NUM_HEADS: 12 52 | MLP_RATIO: 4 53 | QKV_BIAS: True 54 | VIDEO_INPUT: True 55 | TEMPORAL_RESOLUTION: 8 56 | USE_MLP: True 57 | DROP: 0.0 58 | POS_DROPOUT: 0.0 59 | DROP_PATH: 0.2 60 | IM_PRETRAINED: True 61 | HEAD_DROPOUT: 0.0 62 | HEAD_ACT: tanh 63 | PRETRAINED_WEIGHTS: vit_1k 64 | ATTN_LAYER: trajectory 65 | MODEL: 66 | NUM_CLASSES: 174 67 | ARCH: slow 68 | MODEL_NAME: VisionTransformer 69 | LOSS_FUNC: cross_entropy 70 | TEST: 71 | ENABLE: True 72 | DATASET: Ssv2 73 | BATCH_SIZE: 64 74 | NUM_ENSEMBLE_VIEWS: 1 75 | NUM_SPATIAL_CROPS: 3 76 | DATA_LOADER: 77 | NUM_WORKERS: 4 78 | PIN_MEMORY: True 79 | NUM_GPUS: 8 80 | NUM_SHARDS: 4 81 | RNG_SEED: 0 82 | OUTPUT_DIR: . 83 | TENSORBOARD: 84 | ENABLE: True 85 | -------------------------------------------------------------------------------- /data/kinetics_400/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
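# This script generates the {train,val,test}.csv lists that DATASET.md describes for
# Kinetics: it walks <root_dir>/<split_dir>/<class_name>/<video>, assigns every class
# directory an integer id in sorted order, and writes one space-separated
# "path_to_video label" row per file, e.g. (illustrative path only):
#
#   /datasets01/kinetics/070618/400/train_avi-288p/abseiling/clip_0001.avi 0
#
# Run it once per split (train/val/test) as shown in slowfast/datasets/DATASET.md.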
3 | import csv 4 | import glob 5 | import os 6 | 7 | 8 | def k400_preprocess( 9 | root_dir='/datasets01/kinetics/070618/400/', split_dir='train_avi-288p', mode='train' 10 | ): 11 | data_prefix = os.path.join(root_dir, split_dir) 12 | files = list(sorted(glob.glob(os.path.join(data_prefix, '*', '*')))) 13 | classes = list(sorted(glob.glob(os.path.join(data_prefix, '*')))) 14 | classes = [os.path.basename(i) for i in classes] 15 | class_to_idx = {classes[i]: i for i in range(len(classes))} 16 | 17 | with open(f'{mode}.csv', mode='w') as csv_file: 18 | csv_writer = csv.writer(csv_file, delimiter=' ', quotechar='"', quoting=csv.QUOTE_MINIMAL) 19 | for path in files: 20 | class_name = path.split('/')[-2] 21 | class_idx = class_to_idx[class_name] 22 | csv_writer.writerow([path, class_idx]) 23 | 24 | 25 | if __name__ == '__main__': 26 | import argparse 27 | parser = argparse.ArgumentParser(description='K-400 preprocessing') 28 | 29 | parser.add_argument( 30 | '--root_dir', 31 | default='/datasets01/kinetics/070618/400/', 32 | type=str, 33 | help='root dir of K-400 folder' 34 | ) 35 | parser.add_argument( 36 | '--split_dir', 37 | default='train_avi-288p', 38 | type=str, 39 | help='name of the split directory' 40 | ) 41 | parser.add_argument( 42 | '--mode', 43 | default='train', 44 | type=str, 45 | help='dataset split to generate the csv for (train, val or test)' 46 | ) 47 | args = parser.parse_args() 48 | k400_preprocess( 49 | root_dir=args.root_dir, 50 | split_dir=args.split_dir, 51 | mode=args.mode 52 | ) -------------------------------------------------------------------------------- /data/kinetics_600/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | import csv 4 | import glob 5 | import os 6 | 7 | 8 | def k600_preprocess( 9 | root_dir='/datasets01/kinetics/070618/600/', split_dir='train_avi-288p', mode='train' 10 | ): 11 | data_prefix = os.path.join(root_dir, split_dir) 12 | files = list(sorted(glob.glob(os.path.join(data_prefix, '*', '*')))) 13 | classes = list(sorted(glob.glob(os.path.join(data_prefix, '*')))) 14 | classes = [os.path.basename(i) for i in classes] 15 | class_to_idx = {classes[i]: i for i in range(len(classes))} 16 | 17 | with open(f'{mode}.csv', mode='w') as csv_file: 18 | csv_writer = csv.writer(csv_file, delimiter=' ', quotechar='"', quoting=csv.QUOTE_MINIMAL) 19 | for path in files: 20 | class_name = path.split('/')[-2] 21 | class_idx = class_to_idx[class_name] 22 | csv_writer.writerow([path, class_idx]) 23 | 24 | 25 | if __name__ == '__main__': 26 | import argparse 27 | parser = argparse.ArgumentParser(description='K-600 preprocessing') 28 | 29 | parser.add_argument( 30 | '--root_dir', 31 | default='/datasets01/kinetics/070618/600/', 32 | type=str, 33 | help='root dir of K-600 folder' 34 | ) 35 | parser.add_argument( 36 | '--split_dir', 37 | default='train_avi-288p', 38 | type=str, 39 | help='name of the split directory' 40 | ) 41 | parser.add_argument( 42 | '--mode', 43 | default='train', 44 | type=str, 45 | help='dataset split to generate the csv for (train, val or test)' 46 | ) 47 | args = parser.parse_args() 48 | k600_preprocess( 49 | root_dir=args.root_dir, 50 | split_dir=args.split_dir, 51 | mode=args.mode 52 | ) -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: motionformer 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - 
_libgcc_mutex=0.1=conda_forge 8 | - _openmp_mutex=4.5=1_gnu 9 | - av=8.0.3=py38hcaf3a0b_0 10 | - blas=1.0=mkl 11 | - bzip2=1.0.8=h7b6447c_0 12 | - ca-certificates=2021.5.30=ha878542_0 13 | - certifi=2021.5.30=py38h578d9bd_0 14 | - cudatoolkit=10.2.89=hfd86e86_1 15 | - ffmpeg=4.3.1=h167e202_0 16 | - freetype=2.10.4=h5ab3b9f_0 17 | - gmp=6.2.1=h2531618_2 18 | - gnutls=3.6.15=he1e5248_0 19 | - intel-openmp=2021.2.0=h06a4308_610 20 | - jpeg=9b=h024ee3a_2 21 | - lame=3.100=h7b6447c_0 22 | - lcms2=2.12=h3be6417_0 23 | - ld_impl_linux-64=2.33.1=h53a641e_7 24 | - libffi=3.3=he6710b0_2 25 | - libgcc-ng=9.3.0=h2828fa1_19 26 | - libgomp=9.3.0=h2828fa1_19 27 | - libiconv=1.15=h63c8f33_5 28 | - libidn2=2.3.1=h27cfd23_0 29 | - libpng=1.6.37=hbc83047_0 30 | - libstdcxx-ng=9.1.0=hdf63c60_0 31 | - libtasn1=4.16.0=h27cfd23_0 32 | - libtiff=4.2.0=h85742a9_0 33 | - libunistring=0.9.10=h27cfd23_0 34 | - libuv=1.40.0=h7b6447c_0 35 | - libwebp-base=1.2.0=h27cfd23_0 36 | - lz4-c=1.9.3=h2531618_0 37 | - mkl=2021.2.0=h06a4308_296 38 | - mkl-service=2.3.0=py38h27cfd23_1 39 | - mkl_fft=1.3.0=py38h42c9631_2 40 | - mkl_random=1.2.1=py38ha9443f7_2 41 | - ncurses=6.2=he6710b0_1 42 | - nettle=3.7.2=hbbd107a_1 43 | - ninja=1.10.2=hff7bd54_1 44 | - numpy=1.20.2=py38h2d18471_0 45 | - numpy-base=1.20.2=py38hfae3a4d_0 46 | - olefile=0.46=py_0 47 | - openh264=2.1.1=h8b12597_0 48 | - openssl=1.1.1k=h7f98852_0 49 | - pillow=8.2.0=py38he98fc37_0 50 | - pip=21.1.1=py38h06a4308_0 51 | - python=3.8.5=h7579374_1 52 | - python_abi=3.8=1_cp38 53 | - pytorch=1.8.1=py3.8_cuda10.2_cudnn7.6.5_0 54 | - readline=8.1=h27cfd23_0 55 | - setuptools=52.0.0=py38h06a4308_0 56 | - six=1.15.0=py38h06a4308_0 57 | - sqlite=3.35.4=hdfb4753_0 58 | - tk=8.6.10=hbc83047_0 59 | - torchvision=0.9.1=py38_cu102 60 | - typing_extensions=3.7.4.3=pyha847dfd_0 61 | - wheel=0.36.2=pyhd3eb1b0_0 62 | - x264=1!152.20180806=h14c3975_0 63 | - xz=5.2.5=h7b6447c_0 64 | - zlib=1.2.11=h7b6447c_3 65 | - zstd=1.4.9=haebb681_0 66 | - pip: 67 | - chardet==4.0.0 68 | - cloudpickle==1.6.0 69 | - cycler==0.10.0 70 | - ffmpeg-python==0.2.0 71 | - future==0.18.2 72 | - fvcore==0.1.5 73 | - idna==2.10 74 | - iopath==0.1.8 75 | - joblib==1.0.1 76 | - kiwisolver==1.3.1 77 | - matplotlib==3.4.2 78 | - pandas==1.2.4 79 | - psutil==5.8.0 80 | - pyparsing==2.4.7 81 | - python-dateutil==2.8.1 82 | - pytz==2021.1 83 | - pyyaml==5.4.1 84 | - requests==2.25.1 85 | - scikit-learn==0.24.2 86 | - scipy==1.6.3 87 | - sklearn==0.0 88 | - threadpoolctl==2.1.0 89 | - timm==0.4.9 90 | - tqdm==4.61.0 91 | - urllib3==1.26.5 92 | - werkzeug==2.0.1 93 | prefix: /private/home/mandelapatrick/.conda/envs/motionformer 94 | -------------------------------------------------------------------------------- /figs/firstpage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/Motionformer/bf43d50cc0e13ba1240a4acf2a6800ccdbe1bb49/figs/firstpage.png -------------------------------------------------------------------------------- /figs/qual_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/Motionformer/bf43d50cc0e13ba1240a4acf2a6800ccdbe1bb49/figs/qual_results.png -------------------------------------------------------------------------------- /figs/splash.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/facebookresearch/Motionformer/bf43d50cc0e13ba1240a4acf2a6800ccdbe1bb49/figs/splash.png -------------------------------------------------------------------------------- /figs/traj_attn_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/Motionformer/bf43d50cc0e13ba1240a4acf2a6800ccdbe1bb49/figs/traj_attn_fig.png -------------------------------------------------------------------------------- /run_with_submitit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | import argparse 5 | import os 6 | from pathlib import Path 7 | import shutil 8 | import submitit 9 | import multiprocessing 10 | import sys 11 | import uuid 12 | 13 | import torch 14 | import slowfast.utils.checkpoint as cu 15 | import slowfast.utils.multiprocessing as mpu 16 | from slowfast.utils.misc import launch_job 17 | from slowfast.utils.parser import load_config 18 | 19 | from tools.test_net import test 20 | from tools.train_net import train 21 | 22 | def parse_args(): 23 | parser = argparse.ArgumentParser( 24 | "Submitit for onestage training", add_help=False 25 | ) 26 | parser.add_argument( 27 | "--num_gpus", 28 | help="Number of GPUs", 29 | default=8, 30 | type=int, 31 | ) 32 | parser.add_argument( 33 | "--num_shards", 34 | help="Number of Nodes", 35 | default=1, 36 | type=int, 37 | ) 38 | parser.add_argument( 39 | "--partition", 40 | default="learnfair", 41 | type=str, 42 | help="Partition where to submit" 43 | ) 44 | parser.add_argument( 45 | "--timeout", 46 | default=60 * 72, 47 | type=int, 48 | help="Duration of the job" 49 | ) 50 | parser.add_argument( 51 | "--cfg", 52 | dest="cfg_file", 53 | help="Path to the config file", 54 | default="configs/test_R50_8GPU.yaml", type=str 55 | ) 56 | parser.add_argument( 57 | "--job_dir", 58 | default="/checkpoint/mandelapatrick/slowfast_ssv2", 59 | type=str, 60 | help="Job dir. Leave empty for automatic." 61 | ) 62 | parser.add_argument( 63 | "--name", 64 | default="", 65 | type=str, 66 | help="Job dir. Leave empty for automatic." 67 | ) 68 | parser.add_argument( 69 | "--resume-from", 70 | default="", 71 | type=str, 72 | help=( 73 | "Weights to resume from (.*pth file) or a file (last_checkpoint) that contains " 74 | + "weight file name from the same directory" 75 | ), 76 | ) 77 | parser.add_argument( 78 | "--resume-job", 79 | default="", 80 | type=str, 81 | help="resume training from the job") 82 | parser.add_argument( 83 | "--use_volta32", 84 | action='store_true', 85 | help="Big models? Use this") 86 | parser.add_argument( 87 | "--postfix", 88 | default="experiment", 89 | type=str, 90 | help="Postfix of the jobs" 91 | ) 92 | parser.add_argument( 93 | "--mail", 94 | default="", 95 | type=str, 96 | help="Email this user when the job finishes if specified" 97 | ) 98 | parser.add_argument( 99 | '--comment', 100 | default="", 101 | type=str, 102 | help='Comment to pass to scheduler, e.g. 
priority message' 103 | ) 104 | parser.add_argument( 105 | "opts", 106 | help="See slowfast/config/defaults.py for all options", 107 | default=None, 108 | nargs=argparse.REMAINDER, 109 | ) 110 | return parser.parse_args() 111 | 112 | 113 | def get_shared_folder() -> Path: 114 | user = os.getenv("USER") 115 | if Path("/checkpoint/").is_dir(): 116 | p = Path(f"/checkpoint/{user}/slowfast") 117 | p.mkdir(exist_ok=True) 118 | return p 119 | raise RuntimeError("No shared folder available") 120 | 121 | 122 | def get_init_file(): 123 | # Init file must not exist, but it's parent dir must exist. 124 | os.makedirs(str(get_shared_folder()), exist_ok=True) 125 | init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" 126 | if init_file.exists(): 127 | os.remove(str(init_file)) 128 | return init_file 129 | 130 | 131 | def launch(shard_id, num_shards, cfg, init_method): 132 | os.environ["NCCL_MIN_NRINGS"] = "8" 133 | 134 | print ("Pytorch version: ", torch.__version__) 135 | cfg.SHARD_ID = shard_id 136 | cfg.NUM_SHARDS = num_shards 137 | cfg.USE_SBATCH = False 138 | 139 | print([ 140 | shard_id, num_shards, cfg 141 | ]) 142 | 143 | # train, test = get_func(cfg) 144 | # Launch job. 145 | if cfg.TRAIN.ENABLE: 146 | launch_job(cfg=cfg, init_method=init_method, func=train) 147 | 148 | if cfg.TEST.ENABLE: 149 | launch_job(cfg=cfg, init_method=init_method, func=test) 150 | 151 | 152 | class Trainer(object): 153 | def __init__(self, args): 154 | self.args = args 155 | 156 | def __call__(self): 157 | 158 | socket_name = os.popen("ip r | grep default | awk '{print $5}'").read().strip('\n') 159 | print("Setting GLOO and NCCL sockets IFNAME to: {}".format(socket_name)) 160 | os.environ["GLOO_SOCKET_IFNAME"] = socket_name 161 | os.environ["NCCL_SOCKET_IFNAME"] = socket_name 162 | 163 | hostname_first_node = os.popen( 164 | "scontrol show hostnames $SLURM_JOB_NODELIST" 165 | ).read().split("\n")[0] 166 | dist_url = "tcp://{}:12399".format(hostname_first_node) 167 | print("We will use the following dist url: {}".format(dist_url)) 168 | 169 | self._setup_gpu_args() 170 | results = launch( 171 | shard_id=self.args.machine_rank, 172 | num_shards=self.args.num_shards, 173 | cfg=load_config(self.args), 174 | init_method=dist_url, 175 | ) 176 | return results 177 | 178 | def checkpoint(self): 179 | import submitit 180 | 181 | job_env = submitit.JobEnvironment() 182 | slurm_job_id = job_env.job_id 183 | if self.args.resume_job == "": 184 | self.args.resume_job = slurm_job_id 185 | print("Requeuing ", self.args) 186 | empty_trainer = type(self)(self.args) 187 | return submitit.helpers.DelayedSubmission(empty_trainer) 188 | 189 | def _setup_gpu_args(self): 190 | import submitit 191 | 192 | job_env = submitit.JobEnvironment() 193 | print(self.args) 194 | 195 | self.args.machine_rank = job_env.global_rank 196 | self.args.output_dir = str(self.args.output_dir).replace("%j", str(job_env.job_id)) 197 | print(f"Process rank: {job_env.global_rank}") 198 | 199 | 200 | def main(): 201 | args = parse_args() 202 | 203 | if args.name == "": 204 | cfg_name = os.path.splitext(os.path.basename(args.cfg_file))[0] 205 | args.name = '_'.join([cfg_name, args.postfix]) 206 | 207 | assert args.job_dir != "" 208 | 209 | args.job_dir = Path(args.job_dir) / "%j" 210 | args.output_dir = args.job_dir 211 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) 212 | 213 | # cluster setup is defined by environment variables 214 | num_gpus_per_node = args.num_gpus 215 | nodes = args.num_shards 216 | partition = 
args.partition 217 | timeout_min = args.timeout 218 | kwargs = {} 219 | if args.use_volta32: 220 | kwargs['slurm_constraint'] = 'volta32gb' 221 | if args.comment: 222 | kwargs['slurm_comment'] = args.comment 223 | 224 | executor.update_parameters( 225 | mem_gb=60 * num_gpus_per_node, 226 | gpus_per_node=num_gpus_per_node, 227 | tasks_per_node=1, 228 | cpus_per_task=10 * num_gpus_per_node, 229 | nodes=nodes, 230 | timeout_min=timeout_min, # max is 60 * 72 231 | slurm_partition=partition, 232 | slurm_signal_delay_s=120, 233 | **kwargs 234 | ) 235 | 236 | 237 | print(args.name) 238 | executor.update_parameters(name=args.name) 239 | 240 | trainer = Trainer(args) 241 | job = executor.submit(trainer) 242 | print("Submitted job_id:", job.job_id) 243 | 244 | 245 | if __name__ == "__main__": 246 | main() 247 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=4 4 | known_standard_library=numpy,setuptools 5 | known_myself=slowfast 6 | known_third_party=fvcore,iopath,av,torch,pycocotools,yacs,termcolor,scipy,simplejson,matplotlib,detectron2,torchvision,yaml,tqdm,psutil,opencv-python,pandas,tensorboard,moviepy,sklearn,cv2 7 | no_lines_before=STDLIB,THIRDPARTY 8 | sections=FUTURE,STDLIB,THIRDPARTY,myself,FIRSTPARTY,LOCALFOLDER 9 | default_section=FIRSTPARTY 10 | 11 | [mypy] 12 | python_version=3.6 13 | ignore_missing_imports = True 14 | warn_unused_configs = True 15 | disallow_untyped_defs = True 16 | check_untyped_defs = True 17 | warn_unused_ignores = True 18 | warn_redundant_casts = True 19 | show_column_numbers = True 20 | follow_imports = silent 21 | allow_redefinition = True 22 | ; Require all functions to be annotated 23 | disallow_incomplete_defs = True 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from setuptools import find_packages, setup 5 | 6 | setup( 7 | name="slowfast", 8 | version="1.0", 9 | author="FAIR", 10 | url="unknown", 11 | description="SlowFast Video Understanding", 12 | install_requires=[ 13 | "yacs>=0.1.6", 14 | "pyyaml>=5.1", 15 | "av", 16 | "matplotlib", 17 | "termcolor>=1.1", 18 | "simplejson", 19 | "tqdm", 20 | "psutil", 21 | "matplotlib", 22 | "detectron2", 23 | "opencv-python", 24 | "pandas", 25 | "torchvision>=0.4.2", 26 | "sklearn", 27 | "tensorboard", 28 | ], 29 | extras_require={"tensorboard_video_visualization": ["moviepy"]}, 30 | packages=find_packages(exclude=("configs", "tests")), 31 | ) 32 | -------------------------------------------------------------------------------- /slowfast/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from slowfast.utils.env import setup_environment 5 | 6 | setup_environment() 7 | -------------------------------------------------------------------------------- /slowfast/config/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
3 | -------------------------------------------------------------------------------- /slowfast/config/custom_config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Add custom configs and default values""" 5 | 6 | 7 | def add_custom_config(_C): 8 | # Add your own customized configs. 9 | pass 10 | -------------------------------------------------------------------------------- /slowfast/datasets/DATASET.md: -------------------------------------------------------------------------------- 1 | # Dataset Preparation 2 | 3 | ## Kinetics 4 | 5 | The Kinetics Dataset could be downloaded via the code released by ActivityNet: 6 | 7 | 1. Download the videos via the official [scripts](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). 8 | 9 | 2. After all the videos were downloaded, resize the video to the short edge size of 256, then prepare the csv files for training, validation, and testing set as `train.csv`, `val.csv`, `test.csv`. The format of the csv file is: 10 | 11 | ``` 12 | path_to_video_1 label_1 13 | path_to_video_2 label_2 14 | path_to_video_3 label_3 15 | ... 16 | path_to_video_N label_N 17 | ``` 18 | 19 | You can use provided helper functions to create csv files: 20 | ``` 21 | cd data/kinetics_400 22 | python3 preprocess.py --root_dir $PATH_TO_ROOT_DIR --split_dir $SPLIT_DIR --mode $MODE 23 | ``` 24 | 25 | For example: 26 | 27 | ``` 28 | cd data/kinetics_400 29 | python3 preprocess.py --root_dir /datasets01/kinetics/070618/400/ --split_dir train_avi-288p --mode train 30 | python3 preprocess.py --root_dir /datasets01/kinetics/070618/400/ --split_dir val_avi-288p --mode val 31 | python3 preprocess.py --root_dir /datasets01/kinetics/070618/400/ --split_dir val_avi-288p --mode test 32 | ``` 33 | 34 | ## Something-Something V2 35 | 1. Please download the dataset and annotations from [dataset provider](https://20bn.com/datasets/something-something). 36 | 37 | 2. Download the *frame list* from the following links: ([train](https://dl.fbaipublicfiles.com/pyslowfast/dataset/ssv2/frame_lists/train.csv), [val](https://dl.fbaipublicfiles.com/pyslowfast/dataset/ssv2/frame_lists/val.csv)). 38 | 39 | 3. Extract the frames at 30 FPS. (We used ffmpeg-4.1.3 with command 40 | `ffmpeg -i "${video}" -r 30 -q:v 1 "${out_name}"` 41 | in experiments.) Please put the frames in a structure consistent with the frame lists. 42 | 43 | 44 | Please put all annotation json files and the frame lists in the same folder, and set `DATA.PATH_TO_DATA_DIR` to the path. Set `DATA.PATH_PREFIX` to be the path to the folder containing extracted frames. 45 | 46 | ## Epic-Kitchens-100 47 | 48 | Follow instructions from [dataset provider](https://github.com/epic-kitchens/epic-kitchens-100-annotations). -------------------------------------------------------------------------------- /slowfast/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
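# Importing the dataset modules below registers each class (Epickitchens, Kinetics,
# Ssv2) in DATASET_REGISTRY, which is how the TRAIN.DATASET / TEST.DATASET strings in
# the configs are resolved by build_dataset() in build.py. A new dataset would follow
# the same pattern (sketch only; "Mydataset" is a made-up name):
#
#   from .build import DATASET_REGISTRY
#
#   @DATASET_REGISTRY.register()
#   class Mydataset(torch.utils.data.Dataset):
#       def __init__(self, cfg, split):
#           ...
#
# and would then be imported in this file so the registration side effect runs.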
3 | 4 | from .build import DATASET_REGISTRY, build_dataset # noqa 5 | from .epickitchens import Epickitchens 6 | from .kinetics import Kinetics # noqa 7 | from .ssv2 import Ssv2 # noqa 8 | -------------------------------------------------------------------------------- /slowfast/datasets/build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from fvcore.common.registry import Registry 5 | 6 | DATASET_REGISTRY = Registry("DATASET") 7 | DATASET_REGISTRY.__doc__ = """ 8 | Registry for dataset. 9 | 10 | The registered object will be called with `obj(cfg, split)`. 11 | The call should return a `torch.utils.data.Dataset` object. 12 | """ 13 | 14 | 15 | def build_dataset(dataset_name, cfg, split): 16 | """ 17 | Build a dataset, defined by `dataset_name`. 18 | Args: 19 | dataset_name (str): the name of the dataset to be constructed. 20 | cfg (CfgNode): configs. Details can be found in 21 | slowfast/config/defaults.py 22 | split (str): the split of the data loader. Options include `train`, 23 | `val`, and `test`. 24 | Returns: 25 | Dataset: a constructed dataset specified by dataset_name. 26 | """ 27 | # Capitalize the the first letter of the dataset_name since the dataset_name 28 | # in configs may be in lowercase but the name of dataset class should always 29 | # start with an uppercase letter. 30 | name = dataset_name.capitalize() 31 | return DATASET_REGISTRY.get(name)(cfg, split) 32 | -------------------------------------------------------------------------------- /slowfast/datasets/epickitchens_record.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
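# EpicKitchensVideoRecord wraps one row of the EPIC-KITCHENS annotation table: the
# "HH:MM:SS.ff" start/stop timestamps are converted to seconds by timestamp_to_sec()
# and then to frame indices via the per-video fps, which is inferred from the video id
# (a three-digit suffix such as P01_101 marks an EPIC-KITCHENS-100 extension video at
# 50 fps, otherwise 60 fps is assumed).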
3 | 4 | from .video_record import VideoRecord 5 | from datetime import timedelta 6 | import time 7 | 8 | 9 | def timestamp_to_sec(timestamp): 10 | x = time.strptime(timestamp, '%H:%M:%S.%f') 11 | sec = float(timedelta(hours=x.tm_hour, 12 | minutes=x.tm_min, 13 | seconds=x.tm_sec).total_seconds()) + float( 14 | timestamp.split('.')[-1]) / 100 15 | return sec 16 | 17 | 18 | class EpicKitchensVideoRecord(VideoRecord): 19 | def __init__(self, tup): 20 | self._index = str(tup[0]) 21 | self._series = tup[1] 22 | 23 | @property 24 | def participant(self): 25 | return self._series['participant_id'] 26 | 27 | @property 28 | def untrimmed_video_name(self): 29 | return self._series['video_id'] 30 | 31 | @property 32 | def start_frame(self): 33 | return int(round(timestamp_to_sec(self._series['start_timestamp']) * self.fps)) 34 | 35 | @property 36 | def end_frame(self): 37 | return int(round(timestamp_to_sec(self._series['stop_timestamp']) * self.fps)) 38 | 39 | @property 40 | def fps(self): 41 | is_100 = len(self.untrimmed_video_name.split('_')[1]) == 3 42 | return 50 if is_100 else 60 43 | 44 | @property 45 | def num_frames(self): 46 | return self.end_frame - self.start_frame 47 | 48 | @property 49 | def label(self): 50 | return {'verb': self._series['verb_class'] if 'verb_class' in self._series else -1, 51 | 'noun': self._series['noun_class'] if 'noun_class' in self._series else -1} 52 | 53 | @property 54 | def metadata(self): 55 | return {'narration_id': self._index} -------------------------------------------------------------------------------- /slowfast/datasets/frame_loader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | import os 5 | import torch 6 | from . import utils as utils 7 | from .decoder import get_start_end_idx 8 | 9 | 10 | def temporal_sampling( 11 | num_frames, start_idx, end_idx, num_samples, start_frame=0 12 | ): 13 | """ 14 | Given the start and end frame index, sample num_samples frames between 15 | the start and end with equal interval. 16 | Args: 17 | num_frames (int): number of frames of the trimmed action clip 18 | start_idx (int): the index of the start frame. 19 | end_idx (int): the index of the end frame. 20 | num_samples (int): number of frames to sample. 21 | start_frame (int): starting frame of the action clip in the untrimmed video 22 | Returns: 23 | frames (tersor): a tensor of temporal sampled video frames, dimension is 24 | `num clip frames` x `channel` x `height` x `width`. 
25 | """ 26 | index = torch.linspace(start_idx, end_idx, num_samples) 27 | index = torch.clamp(index, 0, num_frames - 1).long() 28 | return start_frame + index 29 | 30 | 31 | def pack_frames_to_video_clip( 32 | cfg, video_record, temporal_sample_index, target_fps=60 33 | ): 34 | # Load video by loading its extracted frames 35 | path_to_video = '{}/{}/rgb_frames/{}'.format( 36 | cfg.EPICKITCHENS.VISUAL_DATA_DIR, 37 | video_record.participant, 38 | video_record.untrimmed_video_name 39 | 40 | ) 41 | img_tmpl = "frame_{:010d}.jpg" 42 | fps = video_record.fps 43 | sampling_rate = cfg.DATA.SAMPLING_RATE 44 | num_samples = cfg.DATA.NUM_FRAMES 45 | start_idx, end_idx = get_start_end_idx( 46 | video_record.num_frames, 47 | num_samples * sampling_rate * fps / target_fps, 48 | temporal_sample_index, 49 | cfg.TEST.NUM_ENSEMBLE_VIEWS, 50 | ) 51 | start_idx, end_idx = start_idx + 1, end_idx + 1 52 | frame_idx = temporal_sampling( 53 | video_record.num_frames, 54 | start_idx, end_idx, num_samples, 55 | start_frame=video_record.start_frame 56 | ) 57 | img_paths = [ 58 | os.path.join( 59 | path_to_video, 60 | img_tmpl.format(idx.item() 61 | )) for idx in frame_idx] 62 | frames = utils.retry_load_images(img_paths) 63 | return frames -------------------------------------------------------------------------------- /slowfast/datasets/loader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Data loader.""" 5 | 6 | import itertools 7 | import numpy as np 8 | import torch 9 | from torch.utils.data._utils.collate import default_collate 10 | from torch.utils.data.distributed import DistributedSampler 11 | from torch.utils.data.sampler import RandomSampler 12 | 13 | from slowfast.datasets.multigrid_helper import ShortCycleBatchSampler 14 | 15 | from . import utils as utils 16 | from .build import build_dataset 17 | from .samplers import RASampler 18 | 19 | 20 | def detection_collate(batch): 21 | """ 22 | Collate function for detection task. Concatanate bboxes, labels and 23 | metadata from different samples in the first dimension instead of 24 | stacking them to have a batch-size dimension. 25 | Args: 26 | batch (tuple or list): data batch to collate. 27 | Returns: 28 | (tuple): collated detection data batch. 29 | """ 30 | inputs, labels, video_idx, extra_data = zip(*batch) 31 | inputs, video_idx = default_collate(inputs), default_collate(video_idx) 32 | labels = torch.tensor(np.concatenate(labels, axis=0)).float() 33 | 34 | collated_extra_data = {} 35 | for key in extra_data[0].keys(): 36 | data = [d[key] for d in extra_data] 37 | if key == "boxes" or key == "ori_boxes": 38 | # Append idx info to the bboxes before concatenating them. 39 | bboxes = [ 40 | np.concatenate( 41 | [np.full((data[i].shape[0], 1), float(i)), data[i]], axis=1 42 | ) 43 | for i in range(len(data)) 44 | ] 45 | bboxes = np.concatenate(bboxes, axis=0) 46 | collated_extra_data[key] = torch.tensor(bboxes).float() 47 | elif key == "metadata": 48 | collated_extra_data[key] = torch.tensor( 49 | list(itertools.chain(*data)) 50 | ).view(-1, 2) 51 | else: 52 | collated_extra_data[key] = default_collate(data) 53 | 54 | return inputs, labels, video_idx, collated_extra_data 55 | 56 | 57 | def construct_loader(cfg, split, is_precise_bn=False): 58 | """ 59 | Constructs the data loader for the given dataset. 60 | Args: 61 | cfg (CfgNode): configs. 
Details can be found in 62 | slowfast/config/defaults.py 63 | split (str): the split of the data loader. Options include `train`, 64 | `val`, and `test`. 65 | """ 66 | assert split in ["train", "val", "test"] 67 | if split in ["train"]: 68 | dataset_name = cfg.TRAIN.DATASET 69 | batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 70 | shuffle = True 71 | drop_last = True 72 | elif split in ["val"]: 73 | dataset_name = cfg.TRAIN.DATASET 74 | batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 75 | shuffle = False 76 | drop_last = False 77 | elif split in ["test"]: 78 | dataset_name = cfg.TEST.DATASET 79 | batch_size = int(cfg.TEST.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 80 | shuffle = False 81 | drop_last = False 82 | 83 | # Construct the dataset 84 | dataset = build_dataset(dataset_name, cfg, split) 85 | 86 | if isinstance(dataset, torch.utils.data.IterableDataset): 87 | loader = torch.utils.data.DataLoader( 88 | dataset, 89 | batch_size=batch_size, 90 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 91 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 92 | drop_last=drop_last, 93 | collate_fn=detection_collate if cfg.DETECTION.ENABLE else None, 94 | worker_init_fn=utils.loader_worker_init_fn(dataset), 95 | ) 96 | else: 97 | if ( 98 | cfg.MULTIGRID.SHORT_CYCLE 99 | and split in ["train"] 100 | and not is_precise_bn 101 | ): 102 | # Create a sampler for multi-process training 103 | sampler = utils.create_sampler(dataset, shuffle, cfg) 104 | batch_sampler = ShortCycleBatchSampler( 105 | sampler, batch_size=batch_size, drop_last=drop_last, cfg=cfg 106 | ) 107 | # Create a loader 108 | loader = torch.utils.data.DataLoader( 109 | dataset, 110 | batch_sampler=batch_sampler, 111 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 112 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 113 | worker_init_fn=utils.loader_worker_init_fn(dataset), 114 | ) 115 | else: 116 | # Create a sampler for multi-process training 117 | sampler = utils.create_sampler(dataset, shuffle, cfg) 118 | # Create a loader 119 | loader = torch.utils.data.DataLoader( 120 | dataset, 121 | batch_size=batch_size, 122 | shuffle=(False if sampler else shuffle), 123 | sampler=sampler, 124 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 125 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 126 | drop_last=drop_last, 127 | collate_fn=detection_collate if cfg.DETECTION.ENABLE else None, 128 | worker_init_fn=utils.loader_worker_init_fn(dataset), 129 | ) 130 | return loader 131 | 132 | 133 | def shuffle_dataset(loader, cur_epoch): 134 | """ " 135 | Shuffles the data. 136 | Args: 137 | loader (loader): data loader to perform shuffle. 138 | cur_epoch (int): number of the current epoch. 
139 | """ 140 | if ( 141 | loader._dataset_kind 142 | == torch.utils.data.dataloader._DatasetKind.Iterable 143 | ): 144 | if hasattr(loader.dataset, "sampler"): 145 | sampler = loader.dataset.sampler 146 | else: 147 | raise RuntimeError( 148 | "Unknown sampler for IterableDataset when shuffling dataset" 149 | ) 150 | else: 151 | sampler = ( 152 | loader.batch_sampler.sampler 153 | if isinstance(loader.batch_sampler, ShortCycleBatchSampler) 154 | else loader.sampler 155 | ) 156 | assert isinstance( 157 | sampler, (RandomSampler, DistributedSampler, RASampler) 158 | ), "Sampler type '{}' not supported".format(type(sampler)) 159 | # RandomSampler handles shuffling automatically 160 | if isinstance(sampler, DistributedSampler) or isinstance(sampler, RASampler): 161 | # DistributedSampler shuffles data based on epoch 162 | sampler.set_epoch(cur_epoch) 163 | -------------------------------------------------------------------------------- /slowfast/datasets/multigrid_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Helper functions for multigrid training.""" 5 | 6 | import numpy as np 7 | import torch 8 | from torch.utils.data.sampler import Sampler 9 | 10 | TORCH_MAJOR = int(torch.__version__.split('.')[0]) 11 | TORCH_MINOR = int(torch.__version__.split('.')[1]) 12 | 13 | if TORCH_MAJOR >= 1 and TORCH_MINOR >= 8: 14 | _int_classes = int 15 | else: 16 | from torch._six import int_classes as _int_classes 17 | 18 | 19 | class ShortCycleBatchSampler(Sampler): 20 | """ 21 | Extend Sampler to support "short cycle" sampling. 22 | See paper "A Multigrid Method for Efficiently Training Video Models", 23 | Wu et al., 2019 (https://arxiv.org/abs/1912.00998) for details. 
24 | """ 25 | 26 | def __init__(self, sampler, batch_size, drop_last, cfg): 27 | if not isinstance(sampler, Sampler): 28 | raise ValueError( 29 | "sampler should be an instance of " 30 | "torch.utils.data.Sampler, but got sampler={}".format(sampler) 31 | ) 32 | if ( 33 | not isinstance(batch_size, _int_classes) 34 | or isinstance(batch_size, bool) 35 | or batch_size <= 0 36 | ): 37 | raise ValueError( 38 | "batch_size should be a positive integer value, " 39 | "but got batch_size={}".format(batch_size) 40 | ) 41 | if not isinstance(drop_last, bool): 42 | raise ValueError( 43 | "drop_last should be a boolean value, but got " 44 | "drop_last={}".format(drop_last) 45 | ) 46 | self.sampler = sampler 47 | self.drop_last = drop_last 48 | 49 | bs_factor = [ 50 | int( 51 | round( 52 | ( 53 | float(cfg.DATA.TRAIN_CROP_SIZE) 54 | / (s * cfg.MULTIGRID.DEFAULT_S) 55 | ) 56 | ** 2 57 | ) 58 | ) 59 | for s in cfg.MULTIGRID.SHORT_CYCLE_FACTORS 60 | ] 61 | 62 | self.batch_sizes = [ 63 | batch_size * bs_factor[0], 64 | batch_size * bs_factor[1], 65 | batch_size, 66 | ] 67 | 68 | def __iter__(self): 69 | counter = 0 70 | batch_size = self.batch_sizes[0] 71 | batch = [] 72 | for idx in self.sampler: 73 | batch.append((idx, counter % 3)) 74 | if len(batch) == batch_size: 75 | yield batch 76 | counter += 1 77 | batch_size = self.batch_sizes[counter % 3] 78 | batch = [] 79 | if len(batch) > 0 and not self.drop_last: 80 | yield batch 81 | 82 | def __len__(self): 83 | avg_batch_size = sum(self.batch_sizes) / 3.0 84 | if self.drop_last: 85 | return int(np.floor(len(self.sampler) / avg_batch_size)) 86 | else: 87 | return int(np.ceil(len(self.sampler) / avg_batch_size)) 88 | -------------------------------------------------------------------------------- /slowfast/datasets/random_erasing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | # Copyright 2020 Ross Wightman 4 | # Modified 5 | 6 | import random 7 | import math 8 | import torch 9 | 10 | 11 | def _get_pixels(per_pixel, rand_color, patch_size, dtype=torch.float32, device='cuda'): 12 | # NOTE I've seen CUDA illegal memory access errors being caused by the normal_() 13 | # paths, flip the order so normal is run on CPU if this becomes a problem 14 | # Issue has been fixed in master https://github.com/pytorch/pytorch/issues/19508 15 | if per_pixel: 16 | return torch.empty(patch_size, dtype=dtype, device=device).normal_() 17 | elif rand_color: 18 | return torch.empty((patch_size[0], 1, 1), dtype=dtype, device=device).normal_() 19 | else: 20 | return torch.zeros((patch_size[0], 1, 1), dtype=dtype, device=device) 21 | 22 | 23 | class RandomErasing: 24 | """ Randomly selects a rectangle region in an image and erases its pixels. 25 | 'Random Erasing Data Augmentation' by Zhong et al. 26 | See https://arxiv.org/pdf/1708.04896.pdf 27 | 28 | This variant of RandomErasing is intended to be applied to either a batch 29 | or single image tensor after it has been normalized by dataset mean and std. 30 | Args: 31 | probability: Probability that the Random Erasing operation will be performed. 32 | min_area: Minimum percentage of erased area wrt input image area. 33 | max_area: Maximum percentage of erased area wrt input image area. 34 | min_aspect: Minimum aspect ratio of erased area. 
35 | mode: pixel color mode, one of 'const', 'rand', or 'pixel' 36 | 'const' - erase block is constant color of 0 for all channels 37 | 'rand' - erase block is same per-channel random (normal) color 38 | 'pixel' - erase block is per-pixel random (normal) color 39 | max_count: maximum number of erasing blocks per image, area per box is scaled by count. 40 | per-image count is randomly chosen between 1 and this value. 41 | """ 42 | 43 | def __init__( 44 | self, 45 | probability=0.5, min_area=0.02, max_area=1/3, min_aspect=0.3, max_aspect=None, 46 | mode='const', min_count=1, max_count=None, num_splits=0, device='cuda', seed=None): 47 | self.probability = probability 48 | self.min_area = min_area 49 | self.max_area = max_area 50 | max_aspect = max_aspect or 1 / min_aspect 51 | self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) 52 | self.min_count = min_count 53 | self.max_count = max_count or min_count 54 | self.num_splits = num_splits 55 | mode = mode.lower() 56 | self.rand_color = False 57 | self.per_pixel = False 58 | if mode == 'rand': 59 | self.rand_color = True # per block random normal 60 | elif mode == 'pixel': 61 | self.per_pixel = True # per pixel random normal 62 | else: 63 | assert not mode or mode == 'const' 64 | self.device = device 65 | self.seed = seed 66 | 67 | def _erase(self, img, chan, img_h, img_w, dtype): 68 | if self.seed is not None: 69 | random.seed(self.seed) 70 | 71 | if random.random() > self.probability: 72 | return 73 | area = img_h * img_w 74 | count = self.min_count if self.min_count == self.max_count else \ 75 | random.randint(self.min_count, self.max_count) 76 | for _ in range(count): 77 | for attempt in range(10): 78 | target_area = random.uniform(self.min_area, self.max_area) * area / count 79 | aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) 80 | h = int(round(math.sqrt(target_area * aspect_ratio))) 81 | w = int(round(math.sqrt(target_area / aspect_ratio))) 82 | if w < img_w and h < img_h: 83 | top = random.randint(0, img_h - h) 84 | left = random.randint(0, img_w - w) 85 | img[:, top:top + h, left:left + w] = _get_pixels( 86 | self.per_pixel, self.rand_color, (chan, h, w), 87 | dtype=dtype, device=self.device) 88 | break 89 | 90 | def __call__(self, input): 91 | if len(input.size()) == 3: 92 | self._erase(input, *input.size(), input.dtype) 93 | else: 94 | batch_size, chan, img_h, img_w = input.size() 95 | # skip first slice of batch if num_splits is set (for clean portion of samples) 96 | batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 97 | for i in range(batch_start, batch_size): 98 | self._erase(input[i], chan, img_h, img_w, input.dtype) 99 | return input -------------------------------------------------------------------------------- /slowfast/datasets/samplers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2015-present, Facebook, Inc. 3 | # All rights reserved. 4 | import torch 5 | import torch.distributed as dist 6 | import math 7 | 8 | 9 | class RASampler(torch.utils.data.Sampler): 10 | """Sampler that restricts data loading to a subset of the dataset for distributed, 11 | with repeated augmentation. 
12 | It ensures that different each augmented version of a sample will be visible to a 13 | different process (GPU) 14 | Heavily based on torch.utils.data.DistributedSampler 15 | """ 16 | 17 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 18 | if num_replicas is None: 19 | if not dist.is_available(): 20 | raise RuntimeError("Requires distributed package to be available") 21 | num_replicas = dist.get_world_size() 22 | if rank is None: 23 | if not dist.is_available(): 24 | raise RuntimeError("Requires distributed package to be available") 25 | rank = dist.get_rank() 26 | self.dataset = dataset 27 | self.num_replicas = num_replicas 28 | self.rank = rank 29 | self.epoch = 0 30 | self.num_samples = int(math.ceil(len(self.dataset) * 3.0 / self.num_replicas)) 31 | self.total_size = self.num_samples * self.num_replicas 32 | # self.num_selected_samples = int(math.ceil(len(self.dataset) / self.num_replicas)) 33 | self.num_selected_samples = int(math.floor(len(self.dataset) // 256 * 256 / self.num_replicas)) 34 | self.shuffle = shuffle 35 | 36 | def __iter__(self): 37 | # deterministically shuffle based on epoch 38 | g = torch.Generator() 39 | g.manual_seed(self.epoch) 40 | if self.shuffle: 41 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 42 | else: 43 | indices = list(range(len(self.dataset))) 44 | 45 | # add extra samples to make it evenly divisible 46 | indices = [ele for ele in indices for i in range(3)] 47 | indices += indices[:(self.total_size - len(indices))] 48 | assert len(indices) == self.total_size 49 | 50 | # subsample 51 | indices = indices[self.rank:self.total_size:self.num_replicas] 52 | assert len(indices) == self.num_samples 53 | 54 | return iter(indices[:self.num_selected_samples]) 55 | 56 | def __len__(self): 57 | return self.num_selected_samples 58 | 59 | def set_epoch(self, epoch): 60 | self.epoch = epoch -------------------------------------------------------------------------------- /slowfast/datasets/video_container.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | import av 5 | 6 | 7 | def get_video_container(path_to_vid, multi_thread_decode=False, backend="pyav"): 8 | """ 9 | Given the path to the video, return the pyav video container. 10 | Args: 11 | path_to_vid (str): path to the video. 12 | multi_thread_decode (bool): if True, perform multi-thread decoding. 13 | backend (str): decoder backend, options include `pyav` and 14 | `torchvision`, default is `pyav`. 15 | Returns: 16 | container (container): video container. 17 | """ 18 | if backend == "torchvision": 19 | with open(path_to_vid, "rb") as fp: 20 | container = fp.read() 21 | return container 22 | elif backend == "pyav": 23 | container = av.open(path_to_vid) 24 | if multi_thread_decode: 25 | # Enable multiple threads for decoding. 26 | container.streams.video[0].thread_type = "AUTO" 27 | return container 28 | else: 29 | raise NotImplementedError("Unknown backend {}".format(backend)) 30 | -------------------------------------------------------------------------------- /slowfast/datasets/video_record.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
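# VideoRecord is the abstract interface for a single annotated clip; concrete records
# (e.g. EpicKitchensVideoRecord in epickitchens_record.py) override the properties
# below so that the frame loader can query participant, untrimmed video name,
# start/end frame, number of frames and label in a dataset-agnostic way.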
3 | 4 | class VideoRecord(object): 5 | def __init__(self, row): 6 | self._data = row 7 | 8 | @property 9 | def segment_name(self): 10 | return NotImplementedError() 11 | 12 | @property 13 | def participant(self): 14 | return NotImplementedError() 15 | 16 | @property 17 | def untrimmed_video_name(self): 18 | return NotImplementedError() 19 | 20 | @property 21 | def start_frame(self): 22 | return NotImplementedError() 23 | 24 | @property 25 | def end_frame(self): 26 | return NotImplementedError() 27 | 28 | @property 29 | def num_frames(self): 30 | return NotImplementedError() 31 | 32 | @property 33 | def label(self): 34 | return NotImplementedError() -------------------------------------------------------------------------------- /slowfast/models/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from .build import MODEL_REGISTRY, build_model # noqa 5 | from .video_model_builder import VisionTransformer -------------------------------------------------------------------------------- /slowfast/models/adamw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | import math 5 | import torch 6 | from torch.optim.optimizer import Optimizer 7 | 8 | 9 | class AdamW(Optimizer): 10 | r"""Implements AdamW algorithm. 11 | 12 | The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. 13 | The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. 14 | 15 | Arguments: 16 | params (iterable): iterable of parameters to optimize or dicts defining 17 | parameter groups 18 | lr (float, optional): learning rate (default: 1e-3) 19 | betas (Tuple[float, float], optional): coefficients used for computing 20 | running averages of gradient and its square (default: (0.9, 0.999)) 21 | eps (float, optional): term added to the denominator to improve 22 | numerical stability (default: 1e-8) 23 | weight_decay (float, optional): weight decay coefficient (default: 1e-2) 24 | amsgrad (boolean, optional): whether to use the AMSGrad variant of this 25 | algorithm from the paper `On the Convergence of Adam and Beyond`_ 26 | (default: False) 27 | 28 | .. _Adam\: A Method for Stochastic Optimization: 29 | https://arxiv.org/abs/1412.6980 30 | .. _Decoupled Weight Decay Regularization: 31 | https://arxiv.org/abs/1711.05101 32 | .. 
_On the Convergence of Adam and Beyond: 33 | https://openreview.net/forum?id=ryQu7f-RZ 34 | """ 35 | 36 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, 37 | weight_decay=1e-2, amsgrad=False): 38 | if not 0.0 <= lr: 39 | raise ValueError("Invalid learning rate: {}".format(lr)) 40 | if not 0.0 <= eps: 41 | raise ValueError("Invalid epsilon value: {}".format(eps)) 42 | if not 0.0 <= betas[0] < 1.0: 43 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 44 | if not 0.0 <= betas[1] < 1.0: 45 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 46 | if not 0.0 <= weight_decay: 47 | raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) 48 | defaults = dict(lr=lr, betas=betas, eps=eps, 49 | weight_decay=weight_decay, amsgrad=amsgrad) 50 | super(AdamW, self).__init__(params, defaults) 51 | 52 | def __setstate__(self, state): 53 | super(AdamW, self).__setstate__(state) 54 | for group in self.param_groups: 55 | group.setdefault('amsgrad', False) 56 | 57 | @torch.no_grad() 58 | def step(self, closure=None): 59 | """Performs a single optimization step. 60 | 61 | Arguments: 62 | closure (callable, optional): A closure that reevaluates the model 63 | and returns the loss. 64 | """ 65 | loss = None 66 | if closure is not None: 67 | with torch.enable_grad(): 68 | loss = closure() 69 | 70 | for group in self.param_groups: 71 | for p in group['params']: 72 | if p.grad is None: 73 | continue 74 | 75 | # Perform stepweight decay 76 | p.mul_(1 - group['lr'] * group['weight_decay']) 77 | 78 | # Perform optimization step 79 | grad = p.grad 80 | if grad.is_sparse: 81 | raise RuntimeError('AdamW does not support sparse gradients') 82 | amsgrad = group['amsgrad'] 83 | 84 | state = self.state[p] 85 | 86 | # State initialization 87 | if len(state) == 0: 88 | state['step'] = 0 89 | # Exponential moving average of gradient values 90 | state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) 91 | # Exponential moving average of squared gradient values 92 | state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) 93 | if amsgrad: 94 | # Maintains max of all exp. moving avg. of sq. grad. values 95 | state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) 96 | 97 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 98 | if amsgrad: 99 | max_exp_avg_sq = state['max_exp_avg_sq'] 100 | beta1, beta2 = group['betas'] 101 | 102 | state['step'] += 1 103 | bias_correction1 = 1 - beta1 ** state['step'] 104 | bias_correction2 = 1 - beta2 ** state['step'] 105 | 106 | # Decay the first and second moment running average coefficient 107 | exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) 108 | exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) 109 | if amsgrad: 110 | # Maintains the maximum of all 2nd moment running avg. till now 111 | torch.maximum(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 112 | # Use the max. for normalizing running avg. 
of gradient 113 | denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) 114 | else: 115 | denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) 116 | 117 | step_size = group['lr'] / bias_correction1 118 | 119 | p.addcdiv_(exp_avg, denom, value=-step_size) 120 | 121 | return loss -------------------------------------------------------------------------------- /slowfast/models/batchnorm_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """BatchNorm (BN) utility functions and custom batch-size BN implementations""" 5 | 6 | from functools import partial 7 | import torch 8 | import torch.distributed as dist 9 | import torch.nn as nn 10 | from torch.autograd.function import Function 11 | 12 | import slowfast.utils.distributed as du 13 | 14 | 15 | def get_norm(cfg): 16 | """ 17 | Args: 18 | cfg (CfgNode): model building configs, details are in the comments of 19 | the config file. 20 | Returns: 21 | nn.Module: the normalization layer. 22 | """ 23 | if cfg.BN.NORM_TYPE == "batchnorm": 24 | return nn.BatchNorm3d 25 | elif cfg.BN.NORM_TYPE == "sub_batchnorm": 26 | return partial(SubBatchNorm3d, num_splits=cfg.BN.NUM_SPLITS) 27 | elif cfg.BN.NORM_TYPE == "sync_batchnorm": 28 | return partial( 29 | NaiveSyncBatchNorm3d, num_sync_devices=cfg.BN.NUM_SYNC_DEVICES 30 | ) 31 | else: 32 | raise NotImplementedError( 33 | "Norm type {} is not supported".format(cfg.BN.NORM_TYPE) 34 | ) 35 | 36 | 37 | class SubBatchNorm3d(nn.Module): 38 | """ 39 | The standard BN layer computes stats across all examples in a GPU. In some 40 | cases it is desirable to compute stats across only a subset of examples 41 | (e.g., in multigrid training https://arxiv.org/abs/1912.00998). 42 | SubBatchNorm3d splits the batch dimension into N splits, and run BN on 43 | each of them separately (so that the stats are computed on each subset of 44 | examples (1/N of batch) independently. During evaluation, it aggregates 45 | the stats from all splits into one BN. 46 | """ 47 | 48 | def __init__(self, num_splits, **args): 49 | """ 50 | Args: 51 | num_splits (int): number of splits. 52 | args (list): other arguments. 53 | """ 54 | super(SubBatchNorm3d, self).__init__() 55 | self.num_splits = num_splits 56 | num_features = args["num_features"] 57 | # Keep only one set of weight and bias. 58 | if args.get("affine", True): 59 | self.affine = True 60 | args["affine"] = False 61 | self.weight = torch.nn.Parameter(torch.ones(num_features)) 62 | self.bias = torch.nn.Parameter(torch.zeros(num_features)) 63 | else: 64 | self.affine = False 65 | self.bn = nn.BatchNorm3d(**args) 66 | args["num_features"] = num_features * num_splits 67 | self.split_bn = nn.BatchNorm3d(**args) 68 | 69 | def _get_aggregated_mean_std(self, means, stds, n): 70 | """ 71 | Calculate the aggregated mean and stds. 72 | Args: 73 | means (tensor): mean values. 74 | stds (tensor): standard deviations. 75 | n (int): number of sets of means and stds. 76 | """ 77 | mean = means.view(n, -1).sum(0) / n 78 | std = ( 79 | stds.view(n, -1).sum(0) / n 80 | + ((means.view(n, -1) - mean) ** 2).view(n, -1).sum(0) / n 81 | ) 82 | return mean.detach(), std.detach() 83 | 84 | def aggregate_stats(self): 85 | """ 86 | Synchronize running_mean, and running_var. Call this before eval. 
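        Aggregation follows the law of total variance: the aggregated mean is the
        average of the per-split means, and the aggregated running variance is the
        average per-split variance plus the spread of the split means around the
        aggregated mean (see _get_aggregated_mean_std above).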
87 | """ 88 | if self.split_bn.track_running_stats: 89 | ( 90 | self.bn.running_mean.data, 91 | self.bn.running_var.data, 92 | ) = self._get_aggregated_mean_std( 93 | self.split_bn.running_mean, 94 | self.split_bn.running_var, 95 | self.num_splits, 96 | ) 97 | 98 | def forward(self, x): 99 | if self.training: 100 | n, c, t, h, w = x.shape 101 | x = x.view(n // self.num_splits, c * self.num_splits, t, h, w) 102 | x = self.split_bn(x) 103 | x = x.view(n, c, t, h, w) 104 | else: 105 | x = self.bn(x) 106 | if self.affine: 107 | x = x * self.weight.view((-1, 1, 1, 1)) 108 | x = x + self.bias.view((-1, 1, 1, 1)) 109 | return x 110 | 111 | 112 | class GroupGather(Function): 113 | """ 114 | GroupGather performs all gather on each of the local process/ GPU groups. 115 | """ 116 | 117 | @staticmethod 118 | def forward(ctx, input, num_sync_devices, num_groups): 119 | """ 120 | Perform forwarding, gathering the stats across different process/ GPU 121 | group. 122 | """ 123 | ctx.num_sync_devices = num_sync_devices 124 | ctx.num_groups = num_groups 125 | 126 | input_list = [ 127 | torch.zeros_like(input) for k in range(du.get_local_size()) 128 | ] 129 | dist.all_gather( 130 | input_list, input, async_op=False, group=du._LOCAL_PROCESS_GROUP 131 | ) 132 | 133 | inputs = torch.stack(input_list, dim=0) 134 | if num_groups > 1: 135 | rank = du.get_local_rank() 136 | group_idx = rank // num_sync_devices 137 | inputs = inputs[ 138 | group_idx 139 | * num_sync_devices : (group_idx + 1) 140 | * num_sync_devices 141 | ] 142 | inputs = torch.sum(inputs, dim=0) 143 | return inputs 144 | 145 | @staticmethod 146 | def backward(ctx, grad_output): 147 | """ 148 | Perform backwarding, gathering the gradients across different process/ GPU 149 | group. 150 | """ 151 | grad_output_list = [ 152 | torch.zeros_like(grad_output) for k in range(du.get_local_size()) 153 | ] 154 | dist.all_gather( 155 | grad_output_list, 156 | grad_output, 157 | async_op=False, 158 | group=du._LOCAL_PROCESS_GROUP, 159 | ) 160 | 161 | grads = torch.stack(grad_output_list, dim=0) 162 | if ctx.num_groups > 1: 163 | rank = du.get_local_rank() 164 | group_idx = rank // ctx.num_sync_devices 165 | grads = grads[ 166 | group_idx 167 | * ctx.num_sync_devices : (group_idx + 1) 168 | * ctx.num_sync_devices 169 | ] 170 | grads = torch.sum(grads, dim=0) 171 | return grads, None, None 172 | 173 | 174 | class NaiveSyncBatchNorm3d(nn.BatchNorm3d): 175 | def __init__(self, num_sync_devices, **args): 176 | """ 177 | Naive version of Synchronized 3D BatchNorm. 178 | Args: 179 | num_sync_devices (int): number of device to sync. 180 | args (list): other arguments. 
181 | """ 182 | self.num_sync_devices = num_sync_devices 183 | if self.num_sync_devices > 0: 184 | assert du.get_local_size() % self.num_sync_devices == 0, ( 185 | du.get_local_size(), 186 | self.num_sync_devices, 187 | ) 188 | self.num_groups = du.get_local_size() // self.num_sync_devices 189 | else: 190 | self.num_sync_devices = du.get_local_size() 191 | self.num_groups = 1 192 | super(NaiveSyncBatchNorm3d, self).__init__(**args) 193 | 194 | def forward(self, input): 195 | if du.get_local_size() == 1 or not self.training: 196 | return super().forward(input) 197 | 198 | assert input.shape[0] > 0, "SyncBatchNorm does not support empty inputs" 199 | C = input.shape[1] 200 | mean = torch.mean(input, dim=[0, 2, 3, 4]) 201 | meansqr = torch.mean(input * input, dim=[0, 2, 3, 4]) 202 | 203 | vec = torch.cat([mean, meansqr], dim=0) 204 | vec = GroupGather.apply(vec, self.num_sync_devices, self.num_groups) * ( 205 | 1.0 / self.num_sync_devices 206 | ) 207 | 208 | mean, meansqr = torch.split(vec, C) 209 | var = meansqr - mean * mean 210 | self.running_mean += self.momentum * (mean.detach() - self.running_mean) 211 | self.running_var += self.momentum * (var.detach() - self.running_var) 212 | 213 | invstd = torch.rsqrt(var + self.eps) 214 | scale = self.weight * invstd 215 | bias = self.bias - mean * scale 216 | scale = scale.reshape(1, -1, 1, 1, 1) 217 | bias = bias.reshape(1, -1, 1, 1, 1) 218 | return input * scale + bias -------------------------------------------------------------------------------- /slowfast/models/build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Model construction functions.""" 5 | import math 6 | import torch 7 | import slowfast as slowfast 8 | from fvcore.common.registry import Registry 9 | 10 | from . import vit_helper 11 | 12 | MODEL_REGISTRY = Registry("MODEL") 13 | MODEL_REGISTRY.__doc__ = """ 14 | Registry for video model. 15 | 16 | The registered object will be called with `obj(cfg)`. 17 | The call should return a `torch.nn.Module` object. 18 | """ 19 | 20 | 21 | def build_model(cfg, gpu_id=None): 22 | """ 23 | Builds the video model. 24 | Args: 25 | cfg (configs): configs that contains the hyper-parameters to build the 26 | backbone. Details can be seen in slowfast/config/defaults.py. 27 | gpu_id (Optional[int]): specify the gpu index to build model. 28 | """ 29 | if torch.cuda.is_available(): 30 | assert ( 31 | cfg.NUM_GPUS <= torch.cuda.device_count() 32 | ), "Cannot use more GPU devices than available" 33 | else: 34 | assert ( 35 | cfg.NUM_GPUS == 0 36 | ), "Cuda is not available. Please set `NUM_GPUS: 0 for running on CPUs." 
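# Models are looked up by name in MODEL_REGISTRY below. As an illustrative
# sketch (assumed, not shown in this file), a model registers itself with:
#
#   @MODEL_REGISTRY.register()
#   class VisionTransformer(nn.Module):
#       def __init__(self, cfg):
#           super().__init__()
#           ...
#
# so that MODEL_REGISTRY.get(cfg.MODEL.MODEL_NAME)(cfg) can construct it by name.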
37 | 38 | # Construct the model 39 | name = cfg.MODEL.MODEL_NAME 40 | model = MODEL_REGISTRY.get(name)(cfg) 41 | 42 | if isinstance(model, slowfast.models.video_model_builder.VisionTransformer): 43 | if cfg.VIT.IM_PRETRAINED: 44 | vit_helper.load_pretrained( 45 | model, cfg=cfg, num_classes=cfg.MODEL.NUM_CLASSES, 46 | in_chans=cfg.VIT.CHANNELS, filter_fn=vit_helper._conv_filter, 47 | strict=False 48 | ) 49 | if hasattr(model, 'st_embed'): 50 | model.st_embed.data[:, 1:, :] = model.pos_embed.data[:, 1:, :].repeat( 51 | 1, cfg.VIT.TEMPORAL_RESOLUTION, 1) 52 | model.st_embed.data[:, 0, :] = model.pos_embed.data[:, 0, :] 53 | if hasattr(model, 'patch_embed_3d'): 54 | model.patch_embed_3d.proj.weight.data = torch.zeros_like( 55 | model.patch_embed_3d.proj.weight.data) 56 | n = math.floor(model.patch_embed_3d.proj.weight.shape[2] / 2) 57 | model.patch_embed_3d.proj.weight.data[:, :, n, :, :] = model.patch_embed.proj.weight.data 58 | model.patch_embed_3d.proj.bias.data = model.patch_embed.proj.bias.data 59 | 60 | if cfg.NUM_GPUS: 61 | if gpu_id is None: 62 | # Determine the GPU used by the current process 63 | cur_device = torch.cuda.current_device() 64 | else: 65 | cur_device = gpu_id 66 | # Transfer the model to the current GPU device 67 | model = model.cuda(device=cur_device) 68 | # Use multi-process data parallel model in the multi-gpu setting 69 | if cfg.NUM_GPUS > 1: 70 | # Make model replica operate on the current device 71 | model = torch.nn.parallel.DistributedDataParallel( 72 | module=model, device_ids=[cur_device], output_device=cur_device, 73 | find_unused_parameters=True 74 | ) 75 | return model 76 | -------------------------------------------------------------------------------- /slowfast/models/losses.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Loss functions.""" 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy 9 | import slowfast.utils.metrics as metrics 10 | 11 | 12 | _LOSSES = { 13 | "cross_entropy": nn.CrossEntropyLoss, 14 | "bce": nn.BCELoss, 15 | "bce_logit": nn.BCEWithLogitsLoss, 16 | "label_smoothing_cross_entropy": LabelSmoothingCrossEntropy, 17 | "mse_loss": nn.MSELoss, 18 | "soft_target_cross_entropy": SoftTargetCrossEntropy 19 | } 20 | 21 | 22 | def get_loss_func(loss_name): 23 | """ 24 | Retrieve the loss given the loss name. 25 | Args (int): 26 | loss_name: the name of the loss to use. 27 | """ 28 | if loss_name not in _LOSSES.keys(): 29 | raise NotImplementedError("Loss {} is not supported".format(loss_name)) 30 | return _LOSSES[loss_name] 31 | -------------------------------------------------------------------------------- /slowfast/models/nystrom_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from einops import rearrange, repeat 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as Fn 8 | import math 9 | 10 | 11 | def iterative_inv(mat, n_iter = 6, init_option="exact"): 12 | I = torch.eye(mat.size(-2), device = mat.device) 13 | K = mat 14 | 15 | if init_option == "original": 16 | # This original implementation is more conservative to compute coefficient of Z_0. 17 | V = 1. 
/ torch.max(torch.sum(K, dim = -2)) * K.transpose(-1, -2) 18 | elif init_option == "arbitrary_input": 19 | # sum = 1 for softmax input but not for exp 20 | a1 = torch.max(torch.sum(torch.abs(K), dim = -2, keepdim=True), dim=-1, keepdim=True).values 21 | a2 = torch.max(torch.sum(torch.abs(K), dim = -1, keepdim=True), dim=-2, keepdim=True).values 22 | V = 1. / (a1 * a2) * K.transpose(-1, -2) 23 | else: # The entries of K are positive and ||K||_{\infty} = 1 due to softmax 24 | # This is the exact coefficient computation, 25 | # 1 / ||K||_1, of initialization of Z_0, leading to faster convergence. 26 | V = 1. / torch.max( 27 | torch.sum(K, dim = -2), dim = -1).values.unsqueeze(-1).unsqueeze(-1) * K.transpose(-1, -2) 28 | 29 | for _ in range(n_iter): 30 | KV = torch.matmul(K, V) 31 | V = torch.matmul(0.25 * V, 13 * I - torch.matmul(KV, 15 * I - torch.matmul(KV, 7 * I - KV))) 32 | return V 33 | 34 | 35 | def nystrom_spatial_attn( 36 | q, k, v, landmarks=64, num_frames=None, inv_iters=6, 37 | use_full_matrix=False, use_spatial_landmarks=False, return_attn=False 38 | ): 39 | 40 | """ 41 | Compute full space-time attention but only softmax over spatial dimension 42 | """ 43 | B, N, D = k.shape 44 | F = num_frames 45 | scale = D ** -0.5 46 | q = q * scale 47 | if use_full_matrix: 48 | queries_landmarks = q.clone() 49 | keys_landmarks = k.clone() 50 | else: 51 | segs = N // landmarks 52 | with torch.no_grad(): 53 | if use_spatial_landmarks: 54 | # transpose spatial and temporal dimensions 55 | q2 = rearrange(q, 'b (f p) d -> b (p f) d', f=F) 56 | k2 = rearrange(k, 'b (f p) d -> b (p f) d', f=F) 57 | if (N % landmarks == 0): 58 | keys_landmarks = k2.reshape(B, landmarks, N // landmarks, D).mean(dim = -2) 59 | queries_landmarks = q2.reshape(B, landmarks, N // landmarks, D).mean(dim = -2) 60 | else: 61 | num_k = (segs + 1) * landmarks - N 62 | keys_landmarks_f = k2[:, :num_k * segs, :].reshape( 63 | B, num_k, segs, D).mean(dim = -2) 64 | keys_landmarks_l = k2[:, num_k * segs:, :].reshape( 65 | B, landmarks - num_k, segs + 1, D).mean(dim = -2) 66 | keys_landmarks = torch.cat((keys_landmarks_f, keys_landmarks_l), dim = -2) 67 | 68 | queries_landmarks_f = q2[:, :num_k * segs, :].reshape( 69 | B, num_k, segs, D).mean(dim = -2) 70 | queries_landmarks_l = q2[:, num_k * segs:, :].reshape( 71 | B, landmarks - num_k, segs + 1, D).mean(dim = -2) 72 | queries_landmarks = torch.cat((queries_landmarks_f, queries_landmarks_l), dim = -2) 73 | else: 74 | if (N % landmarks == 0): 75 | keys_landmarks = k.reshape( 76 | B, landmarks, N // landmarks, D).mean(dim = -2) 77 | queries_landmarks = q.reshape( 78 | B, landmarks, N // landmarks, D).mean(dim = -2) 79 | else: 80 | num_k = (segs + 1) * landmarks - N 81 | keys_landmarks_f = k[:, :num_k * segs, :].reshape( 82 | B, num_k, segs, D).mean(dim = -2) 83 | keys_landmarks_l = k[:, num_k * segs:, :].reshape( 84 | B, landmarks - num_k, segs + 1, D).mean(dim = -2) 85 | keys_landmarks = torch.cat((keys_landmarks_f, keys_landmarks_l), dim = -2) 86 | 87 | queries_landmarks_f = q[:, :num_k * segs, :].reshape( 88 | B, num_k, segs, D).mean(dim = -2) 89 | queries_landmarks_l = q[:, num_k * segs:, :].reshape( 90 | B, landmarks - num_k, segs + 1, D).mean(dim = -2) 91 | queries_landmarks = torch.cat((queries_landmarks_f, queries_landmarks_l), dim = -2) 92 | 93 | kernel_1 = Fn.softmax( 94 | torch.matmul(q, keys_landmarks.transpose(-1, -2)), dim = -1) 95 | kernel_2 = Fn.softmax( 96 | torch.matmul(queries_landmarks, keys_landmarks.transpose(-1, -2)), dim = -1) 97 | kernel_3 = Fn.softmax( 98 | 
rearrange(torch.matmul( 99 | queries_landmarks, k.transpose(-1, -2)), 'b l (f p) -> b l f p', f=F), dim = -1) 100 | attn = torch.matmul(kernel_1, iterative_inv(kernel_2, n_iter=inv_iters)) 101 | 102 | v = rearrange(v, 'b (f p) d -> b f p d', f=F) 103 | x = torch.einsum( 104 | 'b n l, b l f d -> b n f d', 105 | attn, torch.einsum('b l f p, b f p d -> b l f d', kernel_3, v) 106 | ) 107 | 108 | if return_attn: 109 | attn = torch.einsum('b m l, b l f p -> b m f p', attn, kernel_3) 110 | return x, attn 111 | 112 | return x -------------------------------------------------------------------------------- /slowfast/models/optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Optimizer.""" 5 | 6 | import torch 7 | from .adamw import AdamW 8 | 9 | import slowfast.utils.lr_policy as lr_policy 10 | 11 | 12 | def construct_optimizer(model, cfg): 13 | """ 14 | Construct a stochastic gradient descent or ADAM optimizer with momentum. 15 | Details can be found in: 16 | Herbert Robbins, and Sutton Monro. "A stochastic approximation method." 17 | and 18 | Diederik P.Kingma, and Jimmy Ba. 19 | "Adam: A Method for Stochastic Optimization." 20 | 21 | Args: 22 | model (model): model to perform stochastic gradient descent 23 | optimization or ADAM optimization. 24 | cfg (config): configs of hyper-parameters of SGD or ADAM, includes base 25 | learning rate, momentum, weight_decay, dampening, and etc. 26 | """ 27 | # Batchnorm parameters. 28 | bn_params = [] 29 | # Non-batchnorm parameters. 30 | non_bn_parameters = [] 31 | for m in model.modules(): 32 | is_bn = isinstance(m, torch.nn.modules.batchnorm._NormBase) 33 | for p in m.parameters(recurse=False): 34 | if is_bn: 35 | bn_params.append(p) 36 | else: 37 | non_bn_parameters.append(p) 38 | 39 | # Apply different weight decay to Batchnorm and non-batchnorm parameters. 40 | # In Caffe2 classification codebase the weight decay for batchnorm is 0.0. 41 | # Having a different weight decay on batchnorm might cause a performance 42 | # drop. 43 | optim_params = [ 44 | {"params": bn_params, "weight_decay": cfg.BN.WEIGHT_DECAY}, 45 | {"params": non_bn_parameters, "weight_decay": cfg.SOLVER.WEIGHT_DECAY}, 46 | ] 47 | # Check all parameters will be passed into optimizer. 
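# Illustrative check (hypothetical counts): for a model with 100 parameter
# tensors of which 20 belong to normalization layers, the assert below requires
# len(non_bn_parameters) + len(bn_params) == 80 + 20 == 100, i.e. every
# parameter lands in exactly one of the two weight-decay groups.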
48 | assert len(list(model.parameters())) == len(non_bn_parameters) + len( 49 | bn_params 50 | ), "parameter size does not match: {} + {} != {}".format( 51 | len(non_bn_parameters), len(bn_params), len(list(model.parameters())) 52 | ) 53 | 54 | if cfg.SOLVER.OPTIMIZING_METHOD == "sgd": 55 | return torch.optim.SGD( 56 | optim_params, 57 | lr=cfg.SOLVER.BASE_LR, 58 | momentum=cfg.SOLVER.MOMENTUM, 59 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 60 | dampening=cfg.SOLVER.DAMPENING, 61 | nesterov=cfg.SOLVER.NESTEROV, 62 | ) 63 | elif cfg.SOLVER.OPTIMIZING_METHOD == "adam": 64 | return torch.optim.Adam( 65 | optim_params, 66 | lr=cfg.SOLVER.BASE_LR, 67 | betas=(0.9, 0.999), 68 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 69 | ) 70 | elif cfg.SOLVER.OPTIMIZING_METHOD == "adamw": 71 | return AdamW( 72 | optim_params, 73 | lr=cfg.SOLVER.BASE_LR, 74 | betas=(0.9, 0.999), 75 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 76 | amsgrad=False 77 | ) 78 | else: 79 | raise NotImplementedError( 80 | "Does not support {} optimizer".format(cfg.SOLVER.OPTIMIZING_METHOD) 81 | ) 82 | 83 | 84 | def get_epoch_lr(cur_epoch, cfg): 85 | """ 86 | Retrieves the lr for the given epoch (as specified by the lr policy). 87 | Args: 88 | cfg (config): configs of hyper-parameters of ADAM, includes base 89 | learning rate, betas, and weight decays. 90 | cur_epoch (float): the number of epoch of the current training stage. 91 | """ 92 | return lr_policy.get_lr_at_epoch(cfg, cur_epoch) 93 | 94 | 95 | def set_lr(optimizer, new_lr): 96 | """ 97 | Sets the optimizer lr to the specified value. 98 | Args: 99 | optimizer (optim): the optimizer using to optimize the current network. 100 | new_lr (float): the new learning rate to set. 101 | """ 102 | for param_group in optimizer.param_groups: 103 | param_group["lr"] = new_lr 104 | -------------------------------------------------------------------------------- /slowfast/models/orthoformer_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | from einops import rearrange, repeat 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as Fn 8 | import math 9 | 10 | 11 | def orthogonal_landmarks(q, k, num_landmarks=64, subsample_fraction=1.0): 12 | """ 13 | Construct set of landmarks by recursively selecting new landmarks 14 | that are maximally orthogonal to the existing set. 15 | Returns near orthogonal landmarks with shape (B, M, D). 
16 | """ 17 | if subsample_fraction < 1.0: 18 | # Need at least M/2 samples of queries and keys 19 | num_samples = max(int(subsample_fraction * q.size(-2)), num_landmarks) 20 | q_unnormalised = q[:, torch.randint(q.size(-2), (num_samples,), device=q.device), :] # (B, N, D) 21 | else: 22 | # (B, N, D) 23 | q_unnormalised = q 24 | 25 | # may need to change default eps to eps=1e-8 for mixed precision compatibility 26 | qk = Fn.normalize(q_unnormalised, p=2, dim=-1) 27 | B, N, D = qk.shape 28 | 29 | selected_mask = torch.zeros((B, N, 1), device=qk.device) 30 | landmark_mask = torch.ones((B, 1, 1), dtype=selected_mask.dtype, device=qk.device) 31 | 32 | # Get initial random landmark 33 | random_idx = torch.randint(qk.size(-2), (B, 1, 1), device=qk.device) 34 | selected_landmark = qk[torch.arange(qk.size(0)), random_idx.view(-1), :].view(B, D) 35 | selected_mask.scatter_(-2, random_idx, landmark_mask) 36 | 37 | # Selected landmarks 38 | selected_landmarks = torch.empty((B, num_landmarks, D), device=qk.device, dtype=qk.dtype) 39 | selected_landmarks[:, 0, :] = selected_landmark 40 | 41 | # Store computed cosine similarities 42 | cos_sims = torch.empty((B, N, num_landmarks), device=qk.device, dtype=qk.dtype) 43 | 44 | for M in range(1, num_landmarks): 45 | # Calculate absolute cosine similarity between selected and unselected landmarks 46 | # (B, N, D) * (B, D) -> (B, N) 47 | cos_sim = torch.einsum('b n d, b d -> b n', qk, selected_landmark).abs() 48 | cos_sims[:, :, M - 1] = cos_sim 49 | # (B, N, M) cosine similarities of current set of landmarks wrt all queries and keys 50 | cos_sim_set = cos_sims[:, :, :M] 51 | 52 | # Get orthogonal landmark: landmark with smallest absolute cosine similarity: 53 | # set cosine similarity for already selected landmarks to > 1 54 | cos_sim_set.view(-1, M)[selected_mask.flatten().bool(), :] = 10 55 | # (B,) - want max for non 56 | selected_landmark_idx = cos_sim_set.amax(-1).argmin(-1) 57 | selected_landmark = qk[torch.arange(qk.size(0)), selected_landmark_idx, :].view(B, D) 58 | 59 | # Add most orthogonal landmark to selected landmarks: 60 | selected_landmarks[:, M, :] = selected_landmark 61 | 62 | # Removed selected indices from non-selected mask: 63 | selected_mask.scatter_(-2, selected_landmark_idx.unsqueeze(-1).unsqueeze(-1), landmark_mask) 64 | landmarks = torch.masked_select( 65 | q_unnormalised, selected_mask.bool()).reshape(B, -1, D) # (B, M, D) 66 | return landmarks # (B, M, D) 67 | 68 | 69 | def orthoformer( 70 | q, k, v, num_landmarks=64, subsample_fraction=1.0, 71 | num_frames=None, shared_landmarks=True, return_attn=False 72 | ): 73 | """ 74 | Computes spatial attention for all pairs of frames. 75 | The attention matrix is approximated using 76 | intermediate landmarks taken from the queries and keys. 77 | The landmarks can be unique (to each frame) or 78 | shared (a common set of landmarks across frames). 
79 | """ 80 | B, N, D = k.shape 81 | F = num_frames 82 | L = num_landmarks 83 | P = N // F 84 | 85 | scale = D ** -0.25 86 | q = q * scale 87 | k = k * scale 88 | 89 | if shared_landmarks: 90 | with torch.no_grad(): 91 | landmarks = orthogonal_landmarks(q, k, num_landmarks, subsample_fraction) 92 | kernel_1 = Fn.softmax(torch.matmul(q, landmarks.transpose(-1, -2)), dim=-1) 93 | kernel_2 = Fn.softmax( 94 | rearrange(torch.matmul( 95 | landmarks, k.transpose(-1, -2)), 'b l (f p) -> b l f p', f=F), dim=-1) 96 | v = rearrange(v, 'b (f p) d -> b f p d', f=F) 97 | x = torch.einsum('b l f p, b f p d -> b l f d', kernel_2, v) 98 | x = torch.einsum('b n l, b l f d -> b n f d', kernel_1, x) 99 | if return_attn: 100 | attn = torch.einsum('b m l, b l f p -> b m f p', kernel_1, kernel_2) 101 | return x, attn 102 | else: 103 | q = rearrange(q, 'b (f p) d -> (b f) p d', f=F) 104 | k = rearrange(k, 'b (g q) d -> (b g) q d', g=F) 105 | with torch.no_grad(): 106 | landmarks = orthogonal_landmarks(q, k, num_landmarks, subsample_fraction) 107 | landmarks = rearrange(landmarks, '(b f) l d -> b f l d', f=F) 108 | q = rearrange(q, '(b f) p d -> b f 1 p d', f=F) 109 | k = rearrange(k, '(b g) q d -> b 1 g q d', g=F) 110 | v = rearrange(v, 'b (g q) d -> b 1 g q d', g=F) 111 | kernel_1 = Fn.softmax( 112 | torch.matmul(q, landmarks.unsqueeze(-4).transpose(-1, -2)), dim=-1) 113 | kernel_2 = Fn.softmax( 114 | torch.matmul(landmarks.unsqueeze(-3), k.transpose(-1, -2)), dim=-1) 115 | x = torch.matmul(kernel_1, torch.matmul(kernel_2, v)) 116 | x = rearrange(x, 'b f g p d -> b (f p) g d') 117 | if return_attn: 118 | attn = torch.matmul(kernel_1, kernel_2) 119 | attn = rearrange(attn, 'b f g p q -> b (f p) g q') 120 | return x, attn 121 | 122 | return x -------------------------------------------------------------------------------- /slowfast/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /slowfast/utils/benchmark.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Functions for benchmarks. 4 | """ 5 | 6 | import numpy as np 7 | import pprint 8 | import torch 9 | import tqdm 10 | from fvcore.common.timer import Timer 11 | 12 | import slowfast.utils.logging as logging 13 | import slowfast.utils.misc as misc 14 | from slowfast.datasets import loader 15 | from slowfast.utils.env import setup_environment 16 | 17 | logger = logging.get_logger(__name__) 18 | 19 | 20 | def benchmark_data_loading(cfg): 21 | """ 22 | Benchmark the speed of data loading in PySlowFast. 23 | Args: 24 | 25 | cfg (CfgNode): configs. Details can be found in 26 | slowfast/config/defaults.py 27 | """ 28 | # Set up environment. 29 | setup_environment() 30 | # Set random seed from configs. 31 | np.random.seed(cfg.RNG_SEED) 32 | torch.manual_seed(cfg.RNG_SEED) 33 | 34 | # Setup logging format. 35 | logging.setup_logging(cfg.OUTPUT_DIR) 36 | 37 | # Print config. 38 | logger.info("Benchmark data loading with config:") 39 | logger.info(pprint.pformat(cfg)) 40 | 41 | timer = Timer() 42 | dataloader = loader.construct_loader(cfg, "train") 43 | logger.info( 44 | "Initialize loader using {:.2f} seconds.".format(timer.seconds()) 45 | ) 46 | # Total batch size across different machines. 
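# Example with assumed values: TRAIN.BATCH_SIZE = 64 per machine and
# NUM_SHARDS = 4 machines give batch_size = 256 videos per iteration, which is
# the "videos" figure reported in the log messages below.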
47 | batch_size = cfg.TRAIN.BATCH_SIZE * cfg.NUM_SHARDS
48 | log_period = cfg.BENCHMARK.LOG_PERIOD
49 | epoch_times = []
50 | # Test for a few epochs.
51 | for cur_epoch in range(cfg.BENCHMARK.NUM_EPOCHS):
52 | timer = Timer()
53 | timer_epoch = Timer()
54 | iter_times = []
55 | if cfg.BENCHMARK.SHUFFLE:
56 | loader.shuffle_dataset(dataloader, cur_epoch)
57 | for cur_iter, _ in enumerate(tqdm.tqdm(dataloader)):
58 | if cur_iter > 0 and cur_iter % log_period == 0:
59 | iter_times.append(timer.seconds())
60 | ram_usage, ram_total = misc.cpu_mem_usage()
61 | logger.info(
62 | "Epoch {}: {} iters ({} videos) in {:.2f} seconds. "
63 | "RAM Usage: {:.2f}/{:.2f} GB.".format(
64 | cur_epoch,
65 | log_period,
66 | log_period * batch_size,
67 | iter_times[-1],
68 | ram_usage,
69 | ram_total,
70 | )
71 | )
72 | timer.reset()
73 | epoch_times.append(timer_epoch.seconds())
74 | ram_usage, ram_total = misc.cpu_mem_usage()
75 | logger.info(
76 | "Epoch {}: in total {} iters ({} videos) in {:.2f} seconds. "
77 | "RAM Usage: {:.2f}/{:.2f} GB.".format(
78 | cur_epoch,
79 | len(dataloader),
80 | len(dataloader) * batch_size,
81 | epoch_times[-1],
82 | ram_usage,
83 | ram_total,
84 | )
85 | )
86 | logger.info(
87 | "Epoch {}: on average every {} iters ({} videos) take {:.2f}/{:.2f} "
88 | "(avg/std) seconds.".format(
89 | cur_epoch,
90 | log_period,
91 | log_period * batch_size,
92 | np.mean(iter_times),
93 | np.std(iter_times),
94 | )
95 | )
96 | logger.info(
97 | "On average every epoch ({} videos) takes {:.2f}/{:.2f} "
98 | "(avg/std) seconds.".format(
99 | len(dataloader) * batch_size,
100 | np.mean(epoch_times),
101 | np.std(epoch_times),
102 | )
103 | )
104 | -------------------------------------------------------------------------------- /slowfast/utils/bn_helper.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 |
4 | """bn helper."""
5 |
6 | import itertools
7 | import torch
8 |
9 |
10 | @torch.no_grad()
11 | def compute_and_update_bn_stats(model, data_loader, num_batches=200):
12 | """
13 | Compute and update the batch norm stats to make them more precise. During
14 | training both the bn stats and the weights change after every iteration,
15 | so the bn stats cannot precisely reflect the latest state of the model.
16 | Here the bn stats are recomputed without changing the weights, to make the
17 | running mean and running var more precise.
18 | Args:
19 | model (model): the model used to compute and update the bn stats.
20 | data_loader (dataloader): dataloader used to provide inputs.
21 | num_batches (int): number of iterations used to compute the stats.
22 | """
23 |
24 | # Prepares all the bn layers.
25 | bn_layers = [
26 | m
27 | for m in model.modules()
28 | if any(
29 | (
30 | isinstance(m, bn_type)
31 | for bn_type in (
32 | torch.nn.BatchNorm1d,
33 | torch.nn.BatchNorm2d,
34 | torch.nn.BatchNorm3d,
35 | )
36 | )
37 | )
38 | ]
39 |
40 | # In order to make the running stats only reflect the current batch, the
41 | # momentum is disabled.
42 | # bn.running_mean = (1 - momentum) * bn.running_mean + momentum * batch_mean
43 | # Setting the momentum to 1.0 to compute the stats without momentum.
44 | momentum_actual = [bn.momentum for bn in bn_layers]
45 | for bn in bn_layers:
46 | bn.momentum = 1.0
47 |
48 | # Calculates the running iterations for precise stats computation.
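# The accumulators below keep a cumulative average over the batches seen so
# far, updated incrementally as m <- m + (x - m) / (ind + 1); after the loop,
# running_mean[i] holds the average per-batch mean and running_square_mean[i]
# the average per-batch E(x^2) for the i-th bn layer.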
49 | running_mean = [torch.zeros_like(bn.running_mean) for bn in bn_layers] 50 | running_square_mean = [torch.zeros_like(bn.running_var) for bn in bn_layers] 51 | 52 | for ind, (inputs, _, _) in enumerate( 53 | itertools.islice(data_loader, num_batches) 54 | ): 55 | # Forwards the model to update the bn stats. 56 | if isinstance(inputs, (list,)): 57 | for i in range(len(inputs)): 58 | inputs[i] = inputs[i].float().cuda(non_blocking=True) 59 | else: 60 | inputs = inputs.cuda(non_blocking=True) 61 | model(inputs) 62 | 63 | for i, bn in enumerate(bn_layers): 64 | # Accumulates the bn stats. 65 | running_mean[i] += (bn.running_mean - running_mean[i]) / (ind + 1) 66 | # $E(x^2) = Var(x) + E(x)^2$. 67 | cur_square_mean = bn.running_var + bn.running_mean ** 2 68 | running_square_mean[i] += ( 69 | cur_square_mean - running_square_mean[i] 70 | ) / (ind + 1) 71 | 72 | for i, bn in enumerate(bn_layers): 73 | bn.running_mean = running_mean[i] 74 | # Var(x) = $E(x^2) - E(x)^2$. 75 | bn.running_var = running_square_mean[i] - bn.running_mean ** 2 76 | # Sets the precise bn stats. 77 | bn.momentum = momentum_actual[i] 78 | -------------------------------------------------------------------------------- /slowfast/utils/c2_model_loading.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Caffe2 to PyTorch checkpoint name converting utility.""" 5 | 6 | import re 7 | 8 | 9 | def get_name_convert_func(): 10 | """ 11 | Get the function to convert Caffe2 layer names to PyTorch layer names. 12 | Returns: 13 | (func): function to convert parameter name from Caffe2 format to PyTorch 14 | format. 15 | """ 16 | pairs = [ 17 | # ------------------------------------------------------------ 18 | # 'nonlocal_conv3_1_theta_w' -> 's3.pathway0_nonlocal3.conv_g.weight' 19 | [ 20 | r"^nonlocal_conv([0-9]+)_([0-9]+)_(.*)", 21 | r"s\1.pathway0_nonlocal\2_\3", 22 | ], 23 | # 'theta' -> 'conv_theta' 24 | [r"^(.*)_nonlocal([0-9]+)_(theta)(.*)", r"\1_nonlocal\2.conv_\3\4"], 25 | # 'g' -> 'conv_g' 26 | [r"^(.*)_nonlocal([0-9]+)_(g)(.*)", r"\1_nonlocal\2.conv_\3\4"], 27 | # 'phi' -> 'conv_phi' 28 | [r"^(.*)_nonlocal([0-9]+)_(phi)(.*)", r"\1_nonlocal\2.conv_\3\4"], 29 | # 'out' -> 'conv_out' 30 | [r"^(.*)_nonlocal([0-9]+)_(out)(.*)", r"\1_nonlocal\2.conv_\3\4"], 31 | # 'nonlocal_conv4_5_bn_s' -> 's4.pathway0_nonlocal3.bn.weight' 32 | [r"^(.*)_nonlocal([0-9]+)_(bn)_(.*)", r"\1_nonlocal\2.\3.\4"], 33 | # ------------------------------------------------------------ 34 | # 't_pool1_subsample_bn' -> 's1_fuse.conv_f2s.bn.running_mean' 35 | [r"^t_pool1_subsample_bn_(.*)", r"s1_fuse.bn.\1"], 36 | # 't_pool1_subsample' -> 's1_fuse.conv_f2s' 37 | [r"^t_pool1_subsample_(.*)", r"s1_fuse.conv_f2s.\1"], 38 | # 't_res4_5_branch2c_bn_subsample_bn_rm' -> 's4_fuse.conv_f2s.bias' 39 | [ 40 | r"^t_res([0-9]+)_([0-9]+)_branch2c_bn_subsample_bn_(.*)", 41 | r"s\1_fuse.bn.\3", 42 | ], 43 | # 't_pool1_subsample' -> 's1_fuse.conv_f2s' 44 | [ 45 | r"^t_res([0-9]+)_([0-9]+)_branch2c_bn_subsample_(.*)", 46 | r"s\1_fuse.conv_f2s.\3", 47 | ], 48 | # ------------------------------------------------------------ 49 | # 'res4_4_branch_2c_bn_b' -> 's4.pathway0_res4.branch2.c_bn_b' 50 | [ 51 | r"^res([0-9]+)_([0-9]+)_branch([0-9]+)([a-z])_(.*)", 52 | r"s\1.pathway0_res\2.branch\3.\4_\5", 53 | ], 54 | # 'res_conv1_bn_' -> 's1.pathway0_stem.bn.' 
55 | [r"^res_conv1_bn_(.*)", r"s1.pathway0_stem.bn.\1"], 56 | # 'conv1_xy_w_momentum' -> 's1.pathway0_stem.conv_xy.' 57 | [r"^conv1_xy(.*)", r"s1.pathway0_stem.conv_xy\1"], 58 | # 'conv1_w_momentum' -> 's1.pathway0_stem.conv.' 59 | [r"^conv1_(.*)", r"s1.pathway0_stem.conv.\1"], 60 | # 'res4_0_branch1_w' -> 'S4.pathway0_res0.branch1.weight' 61 | [ 62 | r"^res([0-9]+)_([0-9]+)_branch([0-9]+)_(.*)", 63 | r"s\1.pathway0_res\2.branch\3_\4", 64 | ], 65 | # 'res_conv1_' -> 's1.pathway0_stem.conv.' 66 | [r"^res_conv1_(.*)", r"s1.pathway0_stem.conv.\1"], 67 | # ------------------------------------------------------------ 68 | # 'res4_4_branch_2c_bn_b' -> 's4.pathway0_res4.branch2.c_bn_b' 69 | [ 70 | r"^t_res([0-9]+)_([0-9]+)_branch([0-9]+)([a-z])_(.*)", 71 | r"s\1.pathway1_res\2.branch\3.\4_\5", 72 | ], 73 | # 'res_conv1_bn_' -> 's1.pathway0_stem.bn.' 74 | [r"^t_res_conv1_bn_(.*)", r"s1.pathway1_stem.bn.\1"], 75 | # 'conv1_w_momentum' -> 's1.pathway0_stem.conv.' 76 | [r"^t_conv1_(.*)", r"s1.pathway1_stem.conv.\1"], 77 | # 'res4_0_branch1_w' -> 'S4.pathway0_res0.branch1.weight' 78 | [ 79 | r"^t_res([0-9]+)_([0-9]+)_branch([0-9]+)_(.*)", 80 | r"s\1.pathway1_res\2.branch\3_\4", 81 | ], 82 | # 'res_conv1_' -> 's1.pathway0_stem.conv.' 83 | [r"^t_res_conv1_(.*)", r"s1.pathway1_stem.conv.\1"], 84 | # ------------------------------------------------------------ 85 | # pred_ -> head.projection. 86 | [r"pred_(.*)", r"head.projection.\1"], 87 | # '.b_bn_fc' -> '.se.fc' 88 | [r"(.*)b_bn_fc(.*)", r"\1se.fc\2"], 89 | # conv_5 -> head.conv_5. 90 | [r"conv_5(.*)", r"head.conv_5\1"], 91 | # conv_5 -> head.conv_5. 92 | [r"lin_5(.*)", r"head.lin_5\1"], 93 | # '.bn_b' -> '.weight' 94 | [r"(.*)bn.b\Z", r"\1bn.bias"], 95 | # '.bn_s' -> '.weight' 96 | [r"(.*)bn.s\Z", r"\1bn.weight"], 97 | # '_bn_rm' -> '.running_mean' 98 | [r"(.*)bn.rm\Z", r"\1bn.running_mean"], 99 | # '_bn_riv' -> '.running_var' 100 | [r"(.*)bn.riv\Z", r"\1bn.running_var"], 101 | # '_b' -> '.bias' 102 | [r"(.*)[\._]b\Z", r"\1.bias"], 103 | # '_w' -> '.weight' 104 | [r"(.*)[\._]w\Z", r"\1.weight"], 105 | ] 106 | 107 | def convert_caffe2_name_to_pytorch(caffe2_layer_name): 108 | """ 109 | Convert the caffe2_layer_name to pytorch format by apply the list of 110 | regular expressions. 111 | Args: 112 | caffe2_layer_name (str): caffe2 layer name. 113 | Returns: 114 | (str): pytorch layer name. 115 | """ 116 | for source, dest in pairs: 117 | caffe2_layer_name = re.sub(source, dest, caffe2_layer_name) 118 | return caffe2_layer_name 119 | 120 | return convert_caffe2_name_to_pytorch 121 | -------------------------------------------------------------------------------- /slowfast/utils/distributed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Distributed helpers.""" 5 | 6 | import functools 7 | import logging 8 | import pickle 9 | import torch 10 | import torch.distributed as dist 11 | 12 | _LOCAL_PROCESS_GROUP = None 13 | 14 | 15 | def all_gather(tensors): 16 | """ 17 | All gathers the provided tensors from all processes across machines. 18 | Args: 19 | tensors (list): tensors to perform all gather across all processes in 20 | all machines. 
21 | """ 22 | 23 | gather_list = [] 24 | output_tensor = [] 25 | world_size = dist.get_world_size() 26 | for tensor in tensors: 27 | tensor_placeholder = [ 28 | torch.ones_like(tensor) for _ in range(world_size) 29 | ] 30 | dist.all_gather(tensor_placeholder, tensor, async_op=False) 31 | gather_list.append(tensor_placeholder) 32 | for gathered_tensor in gather_list: 33 | output_tensor.append(torch.cat(gathered_tensor, dim=0)) 34 | return output_tensor 35 | 36 | 37 | def all_reduce(tensors, average=True): 38 | """ 39 | All reduce the provided tensors from all processes across machines. 40 | Args: 41 | tensors (list): tensors to perform all reduce across all processes in 42 | all machines. 43 | average (bool): scales the reduced tensor by the number of overall 44 | processes across all machines. 45 | """ 46 | 47 | for tensor in tensors: 48 | dist.all_reduce(tensor, async_op=False) 49 | if average: 50 | world_size = dist.get_world_size() 51 | for tensor in tensors: 52 | tensor.mul_(1.0 / world_size) 53 | return tensors 54 | 55 | 56 | def init_process_group( 57 | local_rank, 58 | local_world_size, 59 | shard_id, 60 | num_shards, 61 | init_method, 62 | dist_backend="nccl", 63 | ): 64 | """ 65 | Initializes the default process group. 66 | Args: 67 | local_rank (int): the rank on the current local machine. 68 | local_world_size (int): the world size (number of processes running) on 69 | the current local machine. 70 | shard_id (int): the shard index (machine rank) of the current machine. 71 | num_shards (int): number of shards for distributed training. 72 | init_method (string): supporting three different methods for 73 | initializing process groups: 74 | "file": use shared file system to initialize the groups across 75 | different processes. 76 | "tcp": use tcp address to initialize the groups across different 77 | dist_backend (string): backend to use for distributed training. Options 78 | includes gloo, mpi and nccl, the details can be found here: 79 | https://pytorch.org/docs/stable/distributed.html 80 | """ 81 | # Sets the GPU to use. 82 | torch.cuda.set_device(local_rank) 83 | # Initialize the process group. 84 | proc_rank = local_rank + shard_id * local_world_size 85 | world_size = local_world_size * num_shards 86 | dist.init_process_group( 87 | backend=dist_backend, 88 | init_method=init_method, 89 | world_size=world_size, 90 | rank=proc_rank, 91 | ) 92 | 93 | 94 | def is_master_proc(num_gpus=8): 95 | """ 96 | Determines if the current process is the master process. 97 | """ 98 | if torch.distributed.is_initialized(): 99 | return dist.get_rank() % num_gpus == 0 100 | else: 101 | return True 102 | 103 | 104 | def is_root_proc(): 105 | """ 106 | Determines if the current process is the root process. 107 | """ 108 | if torch.distributed.is_initialized(): 109 | return dist.get_rank() == 0 110 | else: 111 | return True 112 | 113 | 114 | def get_world_size(): 115 | """ 116 | Get the size of the world. 117 | """ 118 | if not dist.is_available(): 119 | return 1 120 | if not dist.is_initialized(): 121 | return 1 122 | return dist.get_world_size() 123 | 124 | 125 | def get_rank(): 126 | """ 127 | Get the rank of the current process. 
128 | """
129 | if not dist.is_available():
130 | return 0
131 | if not dist.is_initialized():
132 | return 0
133 | return dist.get_rank()
134 |
135 |
136 | def synchronize():
137 | """
138 | Helper function to synchronize (barrier) among all processes when
139 | using distributed training.
140 | """
141 | if not dist.is_available():
142 | return
143 | if not dist.is_initialized():
144 | return
145 | world_size = dist.get_world_size()
146 | if world_size == 1:
147 | return
148 | dist.barrier()
149 |
150 |
151 | @functools.lru_cache()
152 | def _get_global_gloo_group():
153 | """
154 | Return a process group based on the gloo backend, containing all the ranks.
155 | The result is cached.
156 | Returns:
157 | (group): pytorch dist group.
158 | """
159 | if dist.get_backend() == "nccl":
160 | return dist.new_group(backend="gloo")
161 | else:
162 | return dist.group.WORLD
163 |
164 |
165 | def _serialize_to_tensor(data, group):
166 | """
167 | Serialize the data to a ByteTensor. Note that only the `gloo` and `nccl`
168 | backends are supported.
169 | Args:
170 | data (data): data to be serialized.
171 | group (group): pytorch dist group.
172 | Returns:
173 | tensor (ByteTensor): the serialized tensor.
174 | """
175 |
176 | backend = dist.get_backend(group)
177 | assert backend in ["gloo", "nccl"]
178 | device = torch.device("cpu" if backend == "gloo" else "cuda")
179 |
180 | buffer = pickle.dumps(data)
181 | if len(buffer) > 1024 ** 3:
182 | logger = logging.getLogger(__name__)
183 | logger.warning(
184 | "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
185 | get_rank(), len(buffer) / (1024 ** 3), device
186 | )
187 | )
188 | storage = torch.ByteStorage.from_buffer(buffer)
189 | tensor = torch.ByteTensor(storage).to(device=device)
190 | return tensor
191 |
192 |
193 | def _pad_to_largest_tensor(tensor, group):
194 | """
195 | Pad the tensors from different GPUs to the size of the largest one.
196 | Args:
197 | tensor (tensor): tensor to pad.
198 | group (group): pytorch dist group.
199 | Returns:
200 | list[int]: size of the tensor, on each rank
201 | Tensor: padded tensor that has the max size
202 | """
203 | world_size = dist.get_world_size(group=group)
204 | assert (
205 | world_size >= 1
206 | ), "comm.gather/all_gather must be called from ranks within the given group!"
207 | local_size = torch.tensor(
208 | [tensor.numel()], dtype=torch.int64, device=tensor.device
209 | )
210 | size_list = [
211 | torch.zeros([1], dtype=torch.int64, device=tensor.device)
212 | for _ in range(world_size)
213 | ]
214 | dist.all_gather(size_list, local_size, group=group)
215 | size_list = [int(size.item()) for size in size_list]
216 |
217 | max_size = max(size_list)
218 |
219 | # we pad the tensor because torch all_gather does not support
220 | # gathering tensors of different shapes
221 | if local_size != max_size:
222 | padding = torch.zeros(
223 | (max_size - local_size,), dtype=torch.uint8, device=tensor.device
224 | )
225 | tensor = torch.cat((tensor, padding), dim=0)
226 | return size_list, tensor
227 |
228 |
229 | def all_gather_unaligned(data, group=None):
230 | """
231 | Run all_gather on arbitrary picklable data (not necessarily tensors).
232 |
233 | Args:
234 | data: any picklable object
235 | group: a torch process group. By default, will use a group which
236 | contains all ranks on gloo backend.
237 | 238 | Returns: 239 | list[data]: list of data gathered from each rank 240 | """ 241 | if get_world_size() == 1: 242 | return [data] 243 | if group is None: 244 | group = _get_global_gloo_group() 245 | if dist.get_world_size(group) == 1: 246 | return [data] 247 | 248 | tensor = _serialize_to_tensor(data, group) 249 | 250 | size_list, tensor = _pad_to_largest_tensor(tensor, group) 251 | max_size = max(size_list) 252 | 253 | # receiving Tensor from all ranks 254 | tensor_list = [ 255 | torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) 256 | for _ in size_list 257 | ] 258 | dist.all_gather(tensor_list, tensor, group=group) 259 | 260 | data_list = [] 261 | for size, tensor in zip(size_list, tensor_list): 262 | buffer = tensor.cpu().numpy().tobytes()[:size] 263 | data_list.append(pickle.loads(buffer)) 264 | 265 | return data_list 266 | 267 | 268 | def init_distributed_training(cfg): 269 | """ 270 | Initialize variables needed for distributed training. 271 | """ 272 | if cfg.NUM_GPUS <= 1: 273 | return 274 | num_gpus_per_machine = cfg.NUM_GPUS 275 | num_machines = dist.get_world_size() // num_gpus_per_machine 276 | for i in range(num_machines): 277 | ranks_on_i = list( 278 | range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine) 279 | ) 280 | pg = dist.new_group(ranks_on_i) 281 | if i == cfg.SHARD_ID: 282 | global _LOCAL_PROCESS_GROUP 283 | _LOCAL_PROCESS_GROUP = pg 284 | 285 | 286 | def get_local_size() -> int: 287 | """ 288 | Returns: 289 | The size of the per-machine process group, 290 | i.e. the number of processes per machine. 291 | """ 292 | if not dist.is_available(): 293 | return 1 294 | if not dist.is_initialized(): 295 | return 1 296 | return dist.get_world_size(group=_LOCAL_PROCESS_GROUP) 297 | 298 | 299 | def get_local_rank() -> int: 300 | """ 301 | Returns: 302 | The rank of the current process within the local (per-machine) process group. 303 | """ 304 | if not dist.is_available(): 305 | return 0 306 | if not dist.is_initialized(): 307 | return 0 308 | assert _LOCAL_PROCESS_GROUP is not None 309 | return dist.get_rank(group=_LOCAL_PROCESS_GROUP) 310 | -------------------------------------------------------------------------------- /slowfast/utils/env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Set up Environment.""" 5 | 6 | import slowfast.utils.logging as logging 7 | 8 | _ENV_SETUP_DONE = False 9 | 10 | 11 | def setup_environment(): 12 | global _ENV_SETUP_DONE 13 | if _ENV_SETUP_DONE: 14 | return 15 | _ENV_SETUP_DONE = True 16 | -------------------------------------------------------------------------------- /slowfast/utils/logging.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Logging.""" 5 | 6 | import atexit 7 | import builtins 8 | import decimal 9 | import functools 10 | import logging 11 | import os 12 | import sys 13 | import simplejson 14 | from iopath.common.file_io import g_pathmgr 15 | 16 | import slowfast.utils.distributed as du 17 | 18 | 19 | def _suppress_print(): 20 | """ 21 | Suppresses printing from the current process. 
22 | """ 23 | 24 | def print_pass(*objects, sep=" ", end="\n", file=sys.stdout, flush=False): 25 | pass 26 | 27 | builtins.print = print_pass 28 | 29 | 30 | @functools.lru_cache(maxsize=None) 31 | def _cached_log_stream(filename): 32 | io = g_pathmgr.open(filename, "a", buffering=1024) 33 | atexit.register(io.close) 34 | return io 35 | 36 | 37 | def setup_logging(output_dir=None): 38 | """ 39 | Sets up the logging for multiple processes. Only enable the logging for the 40 | master process, and suppress logging for the non-master processes. 41 | """ 42 | # Set up logging format. 43 | _FORMAT = "[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s" 44 | 45 | if du.is_master_proc(): 46 | # Enable logging for the master process. 47 | logging.root.handlers = [] 48 | else: 49 | # Suppress logging for non-master processes. 50 | _suppress_print() 51 | 52 | logger = logging.getLogger() 53 | logger.setLevel(logging.DEBUG) 54 | logger.propagate = False 55 | plain_formatter = logging.Formatter( 56 | "[%(asctime)s][%(levelname)s] %(filename)s: %(lineno)3d: %(message)s", 57 | datefmt="%m/%d %H:%M:%S", 58 | ) 59 | 60 | if du.is_master_proc(): 61 | ch = logging.StreamHandler(stream=sys.stdout) 62 | ch.setLevel(logging.DEBUG) 63 | ch.setFormatter(plain_formatter) 64 | logger.addHandler(ch) 65 | 66 | if output_dir is not None and du.is_master_proc(du.get_world_size()): 67 | filename = os.path.join(output_dir, "stdout.log") 68 | fh = logging.StreamHandler(_cached_log_stream(filename)) 69 | fh.setLevel(logging.DEBUG) 70 | fh.setFormatter(plain_formatter) 71 | logger.addHandler(fh) 72 | 73 | 74 | def get_logger(name): 75 | """ 76 | Retrieve the logger with the specified name or, if name is None, return a 77 | logger which is the root logger of the hierarchy. 78 | Args: 79 | name (string): name of the logger. 80 | """ 81 | return logging.getLogger(name) 82 | 83 | 84 | def log_json_stats(stats): 85 | """ 86 | Logs json stats. 87 | Args: 88 | stats (dict): a dictionary of statistical information to log. 89 | """ 90 | stats = { 91 | k: decimal.Decimal("{:.5f}".format(v)) if isinstance(v, float) else v 92 | for k, v in stats.items() 93 | } 94 | json_stats = simplejson.dumps(stats, sort_keys=True, use_decimal=True) 95 | logger = get_logger(__name__) 96 | logger.info("json_stats: {:s}".format(json_stats)) 97 | -------------------------------------------------------------------------------- /slowfast/utils/lr_policy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Learning rate policy.""" 5 | 6 | import math 7 | 8 | 9 | def get_lr_at_epoch(cfg, cur_epoch): 10 | """ 11 | Retrieve the learning rate of the current epoch with the option to perform 12 | warm up in the beginning of the training stage. 13 | Args: 14 | cfg (CfgNode): configs. Details can be found in 15 | slowfast/config/defaults.py 16 | cur_epoch (float): the number of epoch of the current training stage. 17 | """ 18 | lr = get_lr_func(cfg.SOLVER.LR_POLICY)(cfg, cur_epoch) 19 | # Perform warm up. 
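# During warm-up the lr is interpolated linearly from WARMUP_START_LR to the
# value the chosen policy yields at WARMUP_EPOCHS:
#   lr(e) = WARMUP_START_LR + e * (lr_end - WARMUP_START_LR) / WARMUP_EPOCHS
# for e < WARMUP_EPOCHS, which is exactly what the branch below computes.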
20 | if cur_epoch < cfg.SOLVER.WARMUP_EPOCHS:
21 | lr_start = cfg.SOLVER.WARMUP_START_LR
22 | lr_end = get_lr_func(cfg.SOLVER.LR_POLICY)(
23 | cfg, cfg.SOLVER.WARMUP_EPOCHS
24 | )
25 | alpha = (lr_end - lr_start) / cfg.SOLVER.WARMUP_EPOCHS
26 | lr = cur_epoch * alpha + lr_start
27 | return lr
28 |
29 |
30 | def lr_func_cosine(cfg, cur_epoch):
31 | """
32 | Retrieve the learning rate at the specified epoch with the
33 | cosine learning rate schedule. Details can be found in:
34 | Ilya Loshchilov, and Frank Hutter
35 | SGDR: Stochastic Gradient Descent With Warm Restarts.
36 | Args:
37 | cfg (CfgNode): configs. Details can be found in
38 | slowfast/config/defaults.py
39 | cur_epoch (float): the number of epoch of the current training stage.
40 | """
41 | assert cfg.SOLVER.COSINE_END_LR < cfg.SOLVER.BASE_LR
42 | return (
43 | cfg.SOLVER.COSINE_END_LR
44 | + (cfg.SOLVER.BASE_LR - cfg.SOLVER.COSINE_END_LR)
45 | * (math.cos(math.pi * cur_epoch / cfg.SOLVER.MAX_EPOCH) + 1.0)
46 | * 0.5
47 | )
48 |
49 |
50 | def lr_func_steps_with_relative_lrs(cfg, cur_epoch):
51 | """
52 | Retrieve the learning rate at the specified epoch with the
53 | step schedule with relative learning rates.
54 | Args:
55 | cfg (CfgNode): configs. Details can be found in
56 | slowfast/config/defaults.py
57 | cur_epoch (float): the number of epoch of the current training stage.
58 | """
59 | ind = get_step_index(cfg, cur_epoch)
60 | return cfg.SOLVER.LRS[ind] * cfg.SOLVER.BASE_LR
61 |
62 |
63 | def get_step_index(cfg, cur_epoch):
64 | """
65 | Retrieves the lr step index for the given epoch.
66 | Args:
67 | cfg (CfgNode): configs. Details can be found in
68 | slowfast/config/defaults.py
69 | cur_epoch (float): the number of epoch of the current training stage.
70 | """
71 | steps = cfg.SOLVER.STEPS + [cfg.SOLVER.MAX_EPOCH]
72 | for ind, step in enumerate(steps): # NoQA
73 | if cur_epoch < step:
74 | break
75 | return ind - 1
76 |
77 |
78 | def get_lr_func(lr_policy):
79 | """
80 | Given the configs, retrieve the specified lr policy function.
81 | Args:
82 | lr_policy (string): the learning rate policy to use for the job.
83 | """
84 | policy = "lr_func_" + lr_policy
85 | if policy not in globals():
86 | raise NotImplementedError("Unknown LR policy: {}".format(lr_policy))
87 | else:
88 | return globals()[policy]
89 | -------------------------------------------------------------------------------- /slowfast/utils/metrics.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3 |
4 | """Functions for computing metrics."""
5 |
6 | import numpy as np
7 | import torch
8 |
9 |
10 | def topks_correct(preds, labels, ks):
11 | """
12 | Given the predictions, labels, and a list of top-k values, compute the
13 | number of correct predictions for each top-k value.
14 |
15 | Args:
16 | preds (array): array of predictions. Dimension is batchsize
17 | N x ClassNum.
18 | labels (array): array of labels. Dimension is batchsize N.
19 | ks (list): list of top-k values. For example, ks = [1, 5] corresponds
20 | to top-1 and top-5.
21 |
22 | Returns:
23 | topks_correct (list): list of numbers, where the `i`-th entry
24 | corresponds to the number of top-`ks[i]` correct predictions.
25 | """ 26 | assert preds.size(0) == labels.size( 27 | 0 28 | ), "Batch dim of predictions and labels must match" 29 | # Find the top max_k predictions for each sample 30 | _top_max_k_vals, top_max_k_inds = torch.topk( 31 | preds, max(ks), dim=1, largest=True, sorted=True 32 | ) 33 | # (batch_size, max_k) -> (max_k, batch_size). 34 | top_max_k_inds = top_max_k_inds.t() 35 | # (batch_size, ) -> (max_k, batch_size). 36 | rep_max_k_labels = labels.view(1, -1).expand_as(top_max_k_inds) 37 | # (i, j) = 1 if top i-th prediction for the j-th sample is correct. 38 | top_max_k_correct = top_max_k_inds.eq(rep_max_k_labels) 39 | # Compute the number of topk correct predictions for each k. 40 | topks_correct = [top_max_k_correct[:k, :].float().sum() for k in ks] 41 | return topks_correct 42 | 43 | 44 | def topk_errors(preds, labels, ks): 45 | """ 46 | Computes the top-k error for each k. 47 | Args: 48 | preds (array): array of predictions. Dimension is N. 49 | labels (array): array of labels. Dimension is N. 50 | ks (list): list of ks to calculate the top accuracies. 51 | """ 52 | num_topks_correct = topks_correct(preds, labels, ks) 53 | return [(1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct] 54 | 55 | 56 | def topk_accuracies(preds, labels, ks): 57 | """ 58 | Computes the top-k accuracy for each k. 59 | Args: 60 | preds (array): array of predictions. Dimension is N. 61 | labels (array): array of labels. Dimension is N. 62 | ks (list): list of ks to calculate the top accuracies. 63 | """ 64 | num_topks_correct = topks_correct(preds, labels, ks) 65 | return [(x / preds.size(0)) * 100.0 for x in num_topks_correct] 66 | 67 | 68 | def multitask_topks_correct(preds, labels, ks=(1,)): 69 | """ 70 | Args: 71 | preds: tuple(torch.FloatTensor), each tensor should be of shape 72 | [batch_size, class_count], class_count can vary on a per task basis, i.e. 73 | outputs[i].shape[1] can be different to outputs[j].shape[j]. 74 | labels: tuple(torch.LongTensor), each tensor should be of shape [batch_size] 75 | ks: tuple(int), compute accuracy at top-k for the values of k specified 76 | in this parameter. 77 | Returns: 78 | tuple(float), same length at topk with the corresponding accuracy@k in. 79 | """ 80 | max_k = int(np.max(ks)) 81 | task_count = len(preds) 82 | batch_size = labels[0].size(0) 83 | all_correct = torch.zeros(max_k, batch_size).type(torch.ByteTensor) 84 | all_correct = all_correct.to(preds[0].device) 85 | for output, label in zip(preds, labels): 86 | _, max_k_idx = output.topk(max_k, dim=1, largest=True, sorted=True) 87 | # Flip batch_size, class_count as .view doesn't work on non-contiguous 88 | max_k_idx = max_k_idx.t() 89 | correct_for_task = max_k_idx.eq(label.view(1, -1).expand_as(max_k_idx)) 90 | all_correct.add_(correct_for_task) 91 | 92 | multitask_topks_correct = [ 93 | torch.ge(all_correct[:k].float().sum(0), task_count).float().sum(0) for k in ks 94 | ] 95 | 96 | return multitask_topks_correct 97 | 98 | 99 | def multitask_topk_accuracies(preds, labels, ks): 100 | """ 101 | Computes the top-k accuracy for each k. 102 | Args: 103 | preds (array): array of predictions. Dimension is N. 104 | labels (array): array of labels. Dimension is N. 105 | ks (list): list of ks to calculate the top accuracies. 
106 | """ 107 | num_multitask_topks_correct = multitask_topks_correct(preds, labels, ks) 108 | return [(x / preds[0].size(0)) * 100.0 for x in num_multitask_topks_correct] -------------------------------------------------------------------------------- /slowfast/utils/multigrid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Helper functions for multigrid training.""" 5 | 6 | import numpy as np 7 | 8 | import slowfast.utils.logging as logging 9 | 10 | logger = logging.get_logger(__name__) 11 | 12 | 13 | class MultigridSchedule(object): 14 | """ 15 | This class defines multigrid training schedule and update cfg accordingly. 16 | """ 17 | 18 | def init_multigrid(self, cfg): 19 | """ 20 | Update cfg based on multigrid settings. 21 | Args: 22 | cfg (configs): configs that contains training and multigrid specific 23 | hyperparameters. Details can be seen in 24 | slowfast/config/defaults.py. 25 | Returns: 26 | cfg (configs): the updated cfg. 27 | """ 28 | self.schedule = None 29 | # We may modify cfg.TRAIN.BATCH_SIZE, cfg.DATA.NUM_FRAMES, and 30 | # cfg.DATA.TRAIN_CROP_SIZE during training, so we store their original 31 | # value in cfg and use them as global variables. 32 | cfg.MULTIGRID.DEFAULT_B = cfg.TRAIN.BATCH_SIZE 33 | cfg.MULTIGRID.DEFAULT_T = cfg.DATA.NUM_FRAMES 34 | cfg.MULTIGRID.DEFAULT_S = cfg.DATA.TRAIN_CROP_SIZE 35 | 36 | if cfg.MULTIGRID.LONG_CYCLE: 37 | self.schedule = self.get_long_cycle_schedule(cfg) 38 | cfg.SOLVER.STEPS = [0] + [s[-1] for s in self.schedule] 39 | # Fine-tuning phase. 40 | cfg.SOLVER.STEPS[-1] = ( 41 | cfg.SOLVER.STEPS[-2] + cfg.SOLVER.STEPS[-1] 42 | ) // 2 43 | cfg.SOLVER.LRS = [ 44 | cfg.SOLVER.GAMMA ** s[0] * s[1][0] for s in self.schedule 45 | ] 46 | # Fine-tuning phase. 47 | cfg.SOLVER.LRS = cfg.SOLVER.LRS[:-1] + [ 48 | cfg.SOLVER.LRS[-2], 49 | cfg.SOLVER.LRS[-1], 50 | ] 51 | 52 | cfg.SOLVER.MAX_EPOCH = self.schedule[-1][-1] 53 | 54 | elif cfg.MULTIGRID.SHORT_CYCLE: 55 | cfg.SOLVER.STEPS = [ 56 | int(s * cfg.MULTIGRID.EPOCH_FACTOR) for s in cfg.SOLVER.STEPS 57 | ] 58 | cfg.SOLVER.MAX_EPOCH = int( 59 | cfg.SOLVER.MAX_EPOCH * cfg.MULTIGRID.EPOCH_FACTOR 60 | ) 61 | return cfg 62 | 63 | def update_long_cycle(self, cfg, cur_epoch): 64 | """ 65 | Before every epoch, check if long cycle shape should change. If it 66 | should, update cfg accordingly. 67 | Args: 68 | cfg (configs): configs that contains training and multigrid specific 69 | hyperparameters. Details can be seen in 70 | slowfast/config/defaults.py. 71 | cur_epoch (int): current epoch index. 72 | Returns: 73 | cfg (configs): the updated cfg. 74 | changed (bool): do we change long cycle shape at this epoch? 
75 | """ 76 | base_b, base_t, base_s = get_current_long_cycle_shape( 77 | self.schedule, cur_epoch 78 | ) 79 | if base_s != cfg.DATA.TRAIN_CROP_SIZE or base_t != cfg.DATA.NUM_FRAMES: 80 | 81 | cfg.DATA.NUM_FRAMES = base_t 82 | cfg.DATA.TRAIN_CROP_SIZE = base_s 83 | cfg.TRAIN.BATCH_SIZE = base_b * cfg.MULTIGRID.DEFAULT_B 84 | 85 | bs_factor = ( 86 | float(cfg.TRAIN.BATCH_SIZE / cfg.NUM_GPUS) 87 | / cfg.MULTIGRID.BN_BASE_SIZE 88 | ) 89 | 90 | if bs_factor < 1: 91 | cfg.BN.NORM_TYPE = "sync_batchnorm" 92 | cfg.BN.NUM_SYNC_DEVICES = int(1.0 / bs_factor) 93 | elif bs_factor > 1: 94 | cfg.BN.NORM_TYPE = "sub_batchnorm" 95 | cfg.BN.NUM_SPLITS = int(bs_factor) 96 | else: 97 | cfg.BN.NORM_TYPE = "batchnorm" 98 | 99 | cfg.MULTIGRID.LONG_CYCLE_SAMPLING_RATE = cfg.DATA.SAMPLING_RATE * ( 100 | cfg.MULTIGRID.DEFAULT_T // cfg.DATA.NUM_FRAMES 101 | ) 102 | logger.info("Long cycle updates:") 103 | logger.info("\tBN.NORM_TYPE: {}".format(cfg.BN.NORM_TYPE)) 104 | if cfg.BN.NORM_TYPE == "sync_batchnorm": 105 | logger.info( 106 | "\tBN.NUM_SYNC_DEVICES: {}".format(cfg.BN.NUM_SYNC_DEVICES) 107 | ) 108 | elif cfg.BN.NORM_TYPE == "sub_batchnorm": 109 | logger.info("\tBN.NUM_SPLITS: {}".format(cfg.BN.NUM_SPLITS)) 110 | logger.info("\tTRAIN.BATCH_SIZE: {}".format(cfg.TRAIN.BATCH_SIZE)) 111 | logger.info( 112 | "\tDATA.NUM_FRAMES x LONG_CYCLE_SAMPLING_RATE: {}x{}".format( 113 | cfg.DATA.NUM_FRAMES, cfg.MULTIGRID.LONG_CYCLE_SAMPLING_RATE 114 | ) 115 | ) 116 | logger.info( 117 | "\tDATA.TRAIN_CROP_SIZE: {}".format(cfg.DATA.TRAIN_CROP_SIZE) 118 | ) 119 | return cfg, True 120 | else: 121 | return cfg, False 122 | 123 | def get_long_cycle_schedule(self, cfg): 124 | """ 125 | Based on multigrid hyperparameters, define the schedule of a long cycle. 126 | Args: 127 | cfg (configs): configs that contains training and multigrid specific 128 | hyperparameters. Details can be seen in 129 | slowfast/config/defaults.py. 130 | Returns: 131 | schedule (list): Specifies a list long cycle base shapes and their 132 | corresponding training epochs. 133 | """ 134 | 135 | steps = cfg.SOLVER.STEPS 136 | 137 | default_size = float( 138 | cfg.DATA.NUM_FRAMES * cfg.DATA.TRAIN_CROP_SIZE ** 2 139 | ) 140 | default_iters = steps[-1] 141 | 142 | # Get shapes and average batch size for each long cycle shape. 143 | avg_bs = [] 144 | all_shapes = [] 145 | for t_factor, s_factor in cfg.MULTIGRID.LONG_CYCLE_FACTORS: 146 | base_t = int(round(cfg.DATA.NUM_FRAMES * t_factor)) 147 | base_s = int(round(cfg.DATA.TRAIN_CROP_SIZE * s_factor)) 148 | if cfg.MULTIGRID.SHORT_CYCLE: 149 | shapes = [ 150 | [ 151 | base_t, 152 | cfg.MULTIGRID.DEFAULT_S 153 | * cfg.MULTIGRID.SHORT_CYCLE_FACTORS[0], 154 | ], 155 | [ 156 | base_t, 157 | cfg.MULTIGRID.DEFAULT_S 158 | * cfg.MULTIGRID.SHORT_CYCLE_FACTORS[1], 159 | ], 160 | [base_t, base_s], 161 | ] 162 | else: 163 | shapes = [[base_t, base_s]] 164 | 165 | # (T, S) -> (B, T, S) 166 | shapes = [ 167 | [int(round(default_size / (s[0] * s[1] * s[1]))), s[0], s[1]] 168 | for s in shapes 169 | ] 170 | avg_bs.append(np.mean([s[0] for s in shapes])) 171 | all_shapes.append(shapes) 172 | 173 | # Get schedule regardless of cfg.MULTIGRID.EPOCH_FACTOR. 
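# Worked example with assumed defaults: DATA.NUM_FRAMES = 16 and
# TRAIN_CROP_SIZE = 224 give default_size = 16 * 224^2; a long-cycle factor of
# (0.5, 0.707) yields a base shape of roughly (8, 158), whose relative batch
# size is round(16 * 224^2 / (8 * 158^2)) = 4. The loop below then distributes
# the epochs of each step across shapes in proportion to these relative batch sizes.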
174 | total_iters = 0 175 | schedule = [] 176 | for step_index in range(len(steps) - 1): 177 | step_epochs = steps[step_index + 1] - steps[step_index] 178 | 179 | for long_cycle_index, shapes in enumerate(all_shapes): 180 | cur_epochs = ( 181 | step_epochs * avg_bs[long_cycle_index] / sum(avg_bs) 182 | ) 183 | 184 | cur_iters = cur_epochs / avg_bs[long_cycle_index] 185 | total_iters += cur_iters 186 | schedule.append((step_index, shapes[-1], cur_epochs)) 187 | 188 | iter_saving = default_iters / total_iters 189 | 190 | final_step_epochs = cfg.SOLVER.MAX_EPOCH - steps[-1] 191 | 192 | # We define the fine-tuning phase to have the same amount of iteration 193 | # saving as the rest of the training. 194 | ft_epochs = final_step_epochs / iter_saving * avg_bs[-1] 195 | 196 | schedule.append((step_index + 1, all_shapes[-1][2], ft_epochs)) 197 | 198 | # Obtain final schedule given desired cfg.MULTIGRID.EPOCH_FACTOR. 199 | x = ( 200 | cfg.SOLVER.MAX_EPOCH 201 | * cfg.MULTIGRID.EPOCH_FACTOR 202 | / sum(s[-1] for s in schedule) 203 | ) 204 | 205 | final_schedule = [] 206 | total_epochs = 0 207 | for s in schedule: 208 | epochs = s[2] * x 209 | total_epochs += epochs 210 | final_schedule.append((s[0], s[1], int(round(total_epochs)))) 211 | print_schedule(final_schedule) 212 | return final_schedule 213 | 214 | 215 | def print_schedule(schedule): 216 | """ 217 | Log schedule. 218 | """ 219 | logger.info("Long cycle index\tBase shape\tEpochs") 220 | for s in schedule: 221 | logger.info("{}\t{}\t{}".format(s[0], s[1], s[2])) 222 | 223 | 224 | def get_current_long_cycle_shape(schedule, epoch): 225 | """ 226 | Given a schedule and epoch index, return the long cycle base shape. 227 | Args: 228 | schedule (list): the long cycle schedule as returned by 229 | get_long_cycle_schedule(), i.e. a list of 230 | (long cycle index, base shape, end epoch) tuples. 231 | epoch (int): current epoch index. 232 | Returns: 233 | shapes (list): A list describing the base shape in a long cycle: 234 | [batch size relative to default, 235 | number of frames, spatial dimension]. 236 | """ 237 | for s in schedule: 238 | if epoch < s[-1]: 239 | return s[1] 240 | return schedule[-1][1] 241 | -------------------------------------------------------------------------------- /slowfast/utils/multiprocessing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Multiprocessing helpers.""" 5 | 6 | import torch 7 | 8 | 9 | def run( 10 | local_rank, 11 | num_proc, 12 | func, 13 | init_method, 14 | shard_id, 15 | num_shards, 16 | backend, 17 | cfg, 18 | output_queue=None, 19 | ): 20 | """ 21 | Runs a function from a child process. 22 | Args: 23 | local_rank (int): rank of the current process on the current machine. 24 | num_proc (int): number of processes per machine. 25 | func (function): function to execute on each of the processes. 26 | init_method (string): method to initialize the distributed training. 27 | TCP initialization: requiring a network address reachable from all 28 | processes followed by the port. 29 | Shared file-system initialization: makes use of a file system that 30 | is shared and visible from all machines. The URL should start with 31 | file:// and contain a path to a non-existent file on a shared file 32 | system. 33 | shard_id (int): the rank of the current machine. 34 | num_shards (int): number of overall machines for the distributed 35 | training job.
36 | backend (string): three distributed backends ('nccl', 'gloo', 'mpi') are 37 | supported, each with different capabilities. Details can be found 38 | here: 39 | https://pytorch.org/docs/stable/distributed.html 40 | cfg (CfgNode): configs. Details can be found in 41 | slowfast/config/defaults.py 42 | output_queue (queue): can optionally be used to return values from the 43 | master process. 44 | """ 45 | # Initialize the process group. 46 | world_size = num_proc * num_shards 47 | rank = shard_id * num_proc + local_rank 48 | 49 | try: 50 | torch.distributed.init_process_group( 51 | backend=backend, 52 | init_method=init_method, 53 | world_size=world_size, 54 | rank=rank, 55 | ) 56 | except Exception as e: 57 | raise e 58 | 59 | torch.cuda.set_device(local_rank) 60 | ret = func(cfg) 61 | if output_queue is not None and local_rank == 0: 62 | output_queue.put(ret) 63 | -------------------------------------------------------------------------------- /slowfast/utils/parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Argument parser functions.""" 5 | 6 | import argparse 7 | import sys 8 | 9 | import slowfast.utils.checkpoint as cu 10 | from slowfast.config.defaults import get_cfg 11 | 12 | 13 | def parse_args(): 14 | """ 15 | Parse the following arguments for a default parser for PySlowFast users. 16 | Args: 17 | shard_id (int): shard id for the current machine. Starts from 0 to 18 | num_shards - 1. If a single machine is used, set shard_id to 0. 19 | num_shards (int): number of shards used by the job. 20 | init_method (str): initialization method to launch the job with multiple 21 | devices. Options include TCP or shared file-system for 22 | initialization. Details can be found in 23 | https://pytorch.org/docs/stable/distributed.html#tcp-initialization 24 | cfg (str): path to the config file. 25 | opts (argument): provide additional options from the command line; they 26 | overwrite the config loaded from file. 27 | """ 28 | parser = argparse.ArgumentParser( 29 | description="Provide SlowFast video training and testing pipeline." 30 | ) 31 | parser.add_argument( 32 | "--shard_id", 33 | help="The shard id of the current node; starts from 0 to num_shards - 1", 34 | default=0, 35 | type=int, 36 | ) 37 | parser.add_argument( 38 | "--num_shards", 39 | help="Number of shards used by the job", 40 | default=1, 41 | type=int, 42 | ) 43 | parser.add_argument( 44 | "--init_method", 45 | help="Initialization method, includes TCP or shared file-system", 46 | default="tcp://localhost:9999", 47 | type=str, 48 | ) 49 | parser.add_argument( 50 | "--cfg", 51 | dest="cfg_file", 52 | help="Path to the config file", 53 | default="configs/Kinetics/SLOWFAST_4x16_R50.yaml", 54 | type=str, 55 | ) 56 | parser.add_argument( 57 | "opts", 58 | help="See slowfast/config/defaults.py for all options", 59 | default=None, 60 | nargs=argparse.REMAINDER, 61 | ) 62 | if len(sys.argv) == 1: 63 | parser.print_help() 64 | return parser.parse_args() 65 | 66 | 67 | def load_config(args): 68 | """ 69 | Given the arguments, load and initialize the configs. 70 | Args: 71 | args (argument): arguments include `shard_id`, `num_shards`, 72 | `init_method`, `cfg_file`, and `opts`. 73 | """ 74 | # Setup cfg. 75 | cfg = get_cfg() 76 | # Load config from cfg.
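# Usage sketch (annotation, not part of the original source; config path is an example):
#     args = parse_args()      # e.g. --cfg configs/K400/joint_224_16x4.yaml NUM_GPUS 8
#     cfg = load_config(args)  # the YAML is merged first, then args.opts override it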
77 | if args.cfg_file is not None: 78 | cfg.merge_from_file(args.cfg_file) 79 | # Load config from command line, overwrite config from opts. 80 | if args.opts is not None: 81 | cfg.merge_from_list(args.opts) 82 | 83 | # Inherit parameters from args. 84 | if hasattr(args, "num_shards") and hasattr(args, "shard_id"): 85 | cfg.NUM_SHARDS = args.num_shards 86 | cfg.SHARD_ID = args.shard_id 87 | if hasattr(args, "rng_seed"): 88 | cfg.RNG_SEED = args.rng_seed 89 | if hasattr(args, "output_dir"): 90 | cfg.OUTPUT_DIR = args.output_dir 91 | 92 | # Create the checkpoint dir. 93 | cu.make_checkpoint_dir(cfg.OUTPUT_DIR) 94 | return cfg 95 | -------------------------------------------------------------------------------- /slowfast/utils/weight_init_helper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Utility function for weight initialization""" 5 | 6 | import torch.nn as nn 7 | from fvcore.nn.weight_init import c2_msra_fill 8 | 9 | 10 | def init_weights(model, fc_init_std=0.01, zero_init_final_bn=True): 11 | """ 12 | Performs ResNet style weight initialization. 13 | Args: 14 | fc_init_std (float): the expected standard deviation for fc layer. 15 | zero_init_final_bn (bool): if True, zero initialize the final bn for 16 | every bottleneck. 17 | """ 18 | for m in model.modules(): 19 | if isinstance(m, nn.Conv3d): 20 | """ 21 | Follow the initialization method proposed in: 22 | {He, Kaiming, et al. 23 | "Delving deep into rectifiers: Surpassing human-level 24 | performance on imagenet classification." 25 | arXiv preprint arXiv:1502.01852 (2015)} 26 | """ 27 | c2_msra_fill(m) 28 | elif isinstance(m, nn.BatchNorm3d): 29 | if ( 30 | hasattr(m, "transform_final_bn") 31 | and m.transform_final_bn 32 | and zero_init_final_bn 33 | ): 34 | batchnorm_weight = 0.0 35 | else: 36 | batchnorm_weight = 1.0 37 | if m.weight is not None: 38 | m.weight.data.fill_(batchnorm_weight) 39 | if m.bias is not None: 40 | m.bias.data.zero_() 41 | if isinstance(m, nn.Linear): 42 | m.weight.data.normal_(mean=0.0, std=fc_init_std) 43 | if m.bias is not None: 44 | m.bias.data.zero_() 45 | -------------------------------------------------------------------------------- /slowfast/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /slowfast/visualization/gradcam_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | import matplotlib.pyplot as plt 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | import slowfast.datasets.utils as data_utils 9 | from slowfast.visualization.utils import get_layer 10 | 11 | 12 | class GradCAM: 13 | """ 14 | GradCAM class helps create localization maps using the Grad-CAM method for input videos 15 | and overlap the maps over the input videos as heatmaps. 16 | https://arxiv.org/pdf/1610.02391.pdf 17 | """ 18 | 19 | def __init__( 20 | self, model, target_layers, data_mean, data_std, colormap="viridis" 21 | ): 22 | """ 23 | Args: 24 | model (model): the model to be used. 
25 | target_layers (list of str(s)): name of convolutional layer to be used to get 26 | gradients and feature maps from for creating localization maps. 27 | data_mean (tensor or list): mean value to add to input videos. 28 | data_std (tensor or list): std to multiply for input videos. 29 | colormap (Optional[str]): matplotlib colormap used to create heatmap. 30 | See https://matplotlib.org/3.3.0/tutorials/colors/colormaps.html 31 | """ 32 | 33 | self.model = model 34 | # Run in eval mode. 35 | self.model.eval() 36 | self.target_layers = target_layers 37 | 38 | self.gradients = {} 39 | self.activations = {} 40 | self.colormap = plt.get_cmap(colormap) 41 | self.data_mean = data_mean 42 | self.data_std = data_std 43 | self._register_hooks() 44 | 45 | def _register_single_hook(self, layer_name): 46 | """ 47 | Register forward and backward hook to a layer, given layer_name, 48 | to obtain gradients and activations. 49 | Args: 50 | layer_name (str): name of the layer. 51 | """ 52 | 53 | def get_gradients(module, grad_input, grad_output): 54 | self.gradients[layer_name] = grad_output[0].detach() 55 | 56 | def get_activations(module, input, output): 57 | self.activations[layer_name] = output.clone().detach() 58 | 59 | target_layer = get_layer(self.model, layer_name=layer_name) 60 | target_layer.register_forward_hook(get_activations) 61 | target_layer.register_backward_hook(get_gradients) 62 | 63 | def _register_hooks(self): 64 | """ 65 | Register hooks to layers in `self.target_layers`. 66 | """ 67 | for layer_name in self.target_layers: 68 | self._register_single_hook(layer_name=layer_name) 69 | 70 | def _calculate_localization_map(self, inputs, labels=None): 71 | """ 72 | Calculate localization map for all inputs with Grad-CAM. 73 | Args: 74 | inputs (list of tensor(s)): the input clips. 75 | labels (Optional[tensor]): labels of the current input clips. 76 | Returns: 77 | localization_maps (list of ndarray(s)): the localization map for 78 | each corresponding input. 79 | preds (tensor): shape (n_instances, n_class). Model predictions for `inputs`. 80 | """ 81 | assert len(inputs) == len( 82 | self.target_layers 83 | ), "Must register the same number of target layers as the number of input pathways." 
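# Grad-CAM recap (annotation, not part of the original source): for each
# registered layer the weights are the spatial mean of the class-score
# gradients per channel and frame, w[c, t] = mean_{h,w}(d score / d A[c, t]),
# and the raw map is ReLU(sum_c w[c, t] * A[c, t]); the code below then
# upsamples it to the input clip size and min-max normalizes it per instance.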
84 | input_clone = [inp.clone() for inp in inputs] 85 | preds = self.model(input_clone) 86 | 87 | if labels is None: 88 | score = torch.max(preds, dim=-1)[0] 89 | else: 90 | if labels.ndim == 1: 91 | labels = labels.unsqueeze(-1) 92 | score = torch.gather(preds, dim=1, index=labels) 93 | 94 | self.model.zero_grad() 95 | score = torch.sum(score) 96 | score.backward() 97 | localization_maps = [] 98 | for i, inp in enumerate(inputs): 99 | _, _, T, H, W = inp.size() 100 | 101 | gradients = self.gradients[self.target_layers[i]] 102 | activations = self.activations[self.target_layers[i]] 103 | B, C, Tg, _, _ = gradients.size() 104 | 105 | weights = torch.mean(gradients.view(B, C, Tg, -1), dim=3) 106 | 107 | weights = weights.view(B, C, Tg, 1, 1) 108 | localization_map = torch.sum( 109 | weights * activations, dim=1, keepdim=True 110 | ) 111 | localization_map = F.relu(localization_map) 112 | localization_map = F.interpolate( 113 | localization_map, 114 | size=(T, H, W), 115 | mode="trilinear", 116 | align_corners=False, 117 | ) 118 | localization_map_min, localization_map_max = ( 119 | torch.min(localization_map.view(B, -1), dim=-1, keepdim=True)[ 120 | 0 121 | ], 122 | torch.max(localization_map.view(B, -1), dim=-1, keepdim=True)[ 123 | 0 124 | ], 125 | ) 126 | localization_map_min = torch.reshape( 127 | localization_map_min, shape=(B, 1, 1, 1, 1) 128 | ) 129 | localization_map_max = torch.reshape( 130 | localization_map_max, shape=(B, 1, 1, 1, 1) 131 | ) 132 | # Normalize the localization map. 133 | localization_map = (localization_map - localization_map_min) / ( 134 | localization_map_max - localization_map_min + 1e-6 135 | ) 136 | localization_map = localization_map.data 137 | 138 | localization_maps.append(localization_map) 139 | 140 | return localization_maps, preds 141 | 142 | def __call__(self, inputs, labels=None, alpha=0.5): 143 | """ 144 | Visualize the localization maps on their corresponding inputs as heatmap, 145 | using Grad-CAM. 146 | Args: 147 | inputs (list of tensor(s)): the input clips. 148 | labels (Optional[tensor]): labels of the current input clips. 149 | alpha (float): transparency level of the heatmap, in the range [0, 1]. 150 | Returns: 151 | result_ls (list of tensor(s)): the visualized inputs. 152 | preds (tensor): shape (n_instances, n_class). Model predictions for `inputs`. 
153 | """ 154 | result_ls = [] 155 | localization_maps, preds = self._calculate_localization_map( 156 | inputs, labels=labels 157 | ) 158 | for i, localization_map in enumerate(localization_maps): 159 | # Convert (B, 1, T, H, W) to (B, T, H, W) 160 | localization_map = localization_map.squeeze(dim=1) 161 | if localization_map.device != torch.device("cpu"): 162 | localization_map = localization_map.cpu() 163 | heatmap = self.colormap(localization_map) 164 | heatmap = heatmap[:, :, :, :, :3] 165 | # Permute input from (B, C, T, H, W) to (B, T, H, W, C) 166 | curr_inp = inputs[i].permute(0, 2, 3, 4, 1) 167 | if curr_inp.device != torch.device("cpu"): 168 | curr_inp = curr_inp.cpu() 169 | curr_inp = data_utils.revert_tensor_normalize( 170 | curr_inp, self.data_mean, self.data_std 171 | ) 172 | heatmap = torch.from_numpy(heatmap) 173 | curr_inp = alpha * heatmap + (1 - alpha) * curr_inp 174 | # Permute inp to (B, T, C, H, W) 175 | curr_inp = curr_inp.permute(0, 1, 4, 2, 3) 176 | result_ls.append(curr_inp) 177 | 178 | return result_ls, preds 179 | -------------------------------------------------------------------------------- /slowfast/visualization/prediction_vis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | import numpy as np 5 | import torch 6 | 7 | import slowfast.datasets.utils as data_utils 8 | import slowfast.utils.logging as logging 9 | import slowfast.visualization.tensorboard_vis as tb 10 | from slowfast.utils.misc import get_class_names 11 | from slowfast.visualization.video_visualizer import VideoVisualizer 12 | 13 | logger = logging.get_logger(__name__) 14 | 15 | 16 | class WrongPredictionVis: 17 | """ 18 | WrongPredictionVis class for visualizing video inputs to Tensorboard 19 | for instances on which the model makes wrong predictions. 20 | """ 21 | 22 | def __init__(self, cfg): 23 | """ 24 | Args: 25 | cfg (CfgNode): configs. Details can be found in 26 | slowfast/config/defaults.py 27 | """ 28 | self.cfg = cfg 29 | self.class_names, _, self.subset = get_class_names( 30 | cfg.TENSORBOARD.CLASS_NAMES_PATH, 31 | subset_path=cfg.TENSORBOARD.WRONG_PRED_VIS.SUBSET_PATH, 32 | ) 33 | if self.subset is not None: 34 | self.subset = set(self.subset) 35 | self.num_class = cfg.MODEL.NUM_CLASSES 36 | self.video_vis = VideoVisualizer( 37 | cfg.MODEL.NUM_CLASSES, 38 | cfg.TENSORBOARD.CLASS_NAMES_PATH, 39 | 1, 40 | cfg.TENSORBOARD.MODEL_VIS.COLORMAP, 41 | ) 42 | self.tag = cfg.TENSORBOARD.WRONG_PRED_VIS.TAG 43 | self.writer = tb.TensorboardWriter(cfg) 44 | self.model_incorrect_classes = set() 45 | 46 | def _pick_wrong_preds(self, labels, preds): 47 | """ 48 | Returns a boolean mask over the instances that have 49 | wrong predictions and whose true labels are in the specified subset. 50 | Args: 51 | labels (tensor): tensor of shape (n_instances,) containing class ids. 52 | preds (tensor): class scores from model, shape (n_instances, n_classes) 53 | Returns: 54 | mask (tensor): boolean tensor. `mask[i]` is True if `model` makes a wrong prediction.
55 | """ 56 | subset_mask = torch.ones(size=(len(labels),), dtype=torch.bool) 57 | if self.subset is not None: 58 | for i, label in enumerate(labels): 59 | if label not in self.subset: 60 | subset_mask[i] = False 61 | 62 | preds_ids = torch.argmax(preds, dim=-1) 63 | 64 | mask = preds_ids != labels 65 | mask &= subset_mask 66 | for i, wrong_pred in enumerate(mask): 67 | if wrong_pred: 68 | self.model_incorrect_classes.add(labels[i]) 69 | 70 | return mask 71 | 72 | def visualize_vid(self, video_input, labels, preds, batch_idx): 73 | """ 74 | Draw predicted labels on video inputs and visualize all incorrectly classified 75 | videos in the current batch. 76 | Args: 77 | video_input (list of list of tensor(s)): list of videos for all pathways. 78 | labels (array-like): shape (n_instances,) of true label for each instance. 79 | preds (tensor): shape (n_instances, n_classes). The predicted scores for all instances. 80 | tag (Optional[str]): all visualized videos will be added under this tag. This is for organization 81 | purposes in Tensorboard. 82 | batch_idx (int): batch index of the current videos. 83 | """ 84 | 85 | def add_video(vid, preds, tag, true_class_name): 86 | """ 87 | Draw predicted label on video and add it to Tensorboard. 88 | Args: 89 | vid (array-like): shape (C, T, H, W). Each image in `vid` is an RGB image. 90 | preds (tensor): shape (n_classes,) or (1, n_classes). The predicted scores 91 | for the current `vid`. 92 | tag (str): tag for `vid` in Tensorboard. 93 | true_class_name (str): the ground-truth class name of the current `vid` instance. 94 | """ 95 | # Permute to (T, H, W, C). 96 | vid = vid.permute(1, 2, 3, 0) 97 | vid = data_utils.revert_tensor_normalize( 98 | vid.cpu(), self.cfg.DATA.MEAN, self.cfg.DATA.STD 99 | ) 100 | vid = self.video_vis.draw_clip(vid, preds) 101 | vid = torch.from_numpy(np.array(vid)).permute(0, 3, 1, 2) 102 | vid = torch.unsqueeze(vid, dim=0) 103 | self.writer.add_video( 104 | vid, tag="{}: {}".format(tag, true_class_name) 105 | ) 106 | 107 | mask = self._pick_wrong_preds(labels, preds) 108 | video_indices = torch.squeeze(mask.nonzero(), dim=-1) 109 | # Visualize each wrongly classified video. 110 | for vid_idx in video_indices: 111 | cur_vid_idx = batch_idx * len(video_input[0]) + vid_idx 112 | for pathway in range(len(video_input)): 113 | add_video( 114 | video_input[pathway][vid_idx], 115 | preds=preds[vid_idx], 116 | tag=self.tag 117 | + "/Video {}, Pathway {}".format(cur_vid_idx, pathway), 118 | true_class_name=self.class_names[labels[vid_idx]], 119 | ) 120 | 121 | @property 122 | def wrong_class_prediction(self): 123 | """ 124 | Return class names that the model predicted incorrectly. 125 | """ 126 | incorrect_class_names = [ 127 | self.class_names[i] for i in self.model_incorrect_classes 128 | ] 129 | return list(set(incorrect_class_names)) 130 | 131 | def clean(self): 132 | """ 133 | Close Tensorboard writer. 134 | """ 135 | self.writer.close() 136 | -------------------------------------------------------------------------------- /slowfast/visualization/predictor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
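# Usage sketch (annotation, not part of the original source; `async_vis` and
# `task` are hypothetical variables):
#     predictor = ActionPredictor(cfg, async_vis=async_vis)  # async_vis: an AsyncVis instance
#     predictor.put(task)     # run the model on a TaskInfo object (frames, boxes)
#     task = predictor.get()  # fetch the visualized clip once it is ready
# Detectron2Predictor supplies the person boxes when cfg.DETECTION.ENABLE is set.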
3 | 4 | import queue 5 | import cv2 6 | import torch 7 | #from detectron2 import model_zoo 8 | #from detectron2.config import get_cfg 9 | #from detectron2.engine import DefaultPredictor 10 | 11 | import slowfast.utils.checkpoint as cu 12 | from slowfast.datasets import cv2_transform 13 | from slowfast.models import build_model 14 | from slowfast.utils import logging 15 | from slowfast.visualization.utils import process_cv2_inputs 16 | 17 | logger = logging.get_logger(__name__) 18 | 19 | 20 | class Predictor: 21 | """ 22 | Action Predictor for action recognition. 23 | """ 24 | 25 | def __init__(self, cfg, gpu_id=None): 26 | """ 27 | Args: 28 | cfg (CfgNode): configs. Details can be found in 29 | slowfast/config/defaults.py 30 | gpu_id (Optional[int]): GPU id. 31 | """ 32 | if cfg.NUM_GPUS: 33 | self.gpu_id = ( 34 | torch.cuda.current_device() if gpu_id is None else gpu_id 35 | ) 36 | 37 | # Build the video model and print model statistics. 38 | self.model = build_model(cfg, gpu_id=gpu_id) 39 | self.model.eval() 40 | self.cfg = cfg 41 | 42 | if cfg.DETECTION.ENABLE: 43 | self.object_detector = Detectron2Predictor(cfg, gpu_id=self.gpu_id) 44 | 45 | logger.info("Start loading model weights.") 46 | cu.load_test_checkpoint(cfg, self.model) 47 | logger.info("Finish loading model weights") 48 | 49 | def __call__(self, task): 50 | """ 51 | Returns the prediction results for the current task. 52 | Args: 53 | task (TaskInfo object): task object that contain 54 | the necessary information for action prediction. (e.g. frames, boxes) 55 | Returns: 56 | task (TaskInfo object): the same task info object but filled with 57 | prediction values (a tensor) and the corresponding boxes for 58 | action detection task. 59 | """ 60 | if self.cfg.DETECTION.ENABLE: 61 | task = self.object_detector(task) 62 | 63 | frames, bboxes = task.frames, task.bboxes 64 | if bboxes is not None: 65 | bboxes = cv2_transform.scale_boxes( 66 | self.cfg.DATA.TEST_CROP_SIZE, 67 | bboxes, 68 | task.img_height, 69 | task.img_width, 70 | ) 71 | if self.cfg.DEMO.INPUT_FORMAT == "BGR": 72 | frames = [ 73 | cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in frames 74 | ] 75 | 76 | frames = [ 77 | cv2_transform.scale(self.cfg.DATA.TEST_CROP_SIZE, frame) 78 | for frame in frames 79 | ] 80 | inputs = process_cv2_inputs(frames, self.cfg) 81 | if bboxes is not None: 82 | index_pad = torch.full( 83 | size=(bboxes.shape[0], 1), 84 | fill_value=float(0), 85 | device=bboxes.device, 86 | ) 87 | 88 | # Pad frame index for each box. 89 | bboxes = torch.cat([index_pad, bboxes], axis=1) 90 | if self.cfg.NUM_GPUS > 0: 91 | # Transfer the data to the current GPU device. 92 | if isinstance(inputs, (list,)): 93 | for i in range(len(inputs)): 94 | inputs[i] = inputs[i].cuda( 95 | device=torch.device(self.gpu_id), non_blocking=True 96 | ) 97 | else: 98 | inputs = inputs.cuda( 99 | device=torch.device(self.gpu_id), non_blocking=True 100 | ) 101 | if self.cfg.DETECTION.ENABLE and not bboxes.shape[0]: 102 | preds = torch.tensor([]) 103 | else: 104 | preds = self.model(inputs, bboxes) 105 | 106 | if self.cfg.NUM_GPUS: 107 | preds = preds.cpu() 108 | if bboxes is not None: 109 | bboxes = bboxes.detach().cpu() 110 | 111 | preds = preds.detach() 112 | task.add_action_preds(preds) 113 | if bboxes is not None: 114 | task.add_bboxes(bboxes[:, 1:]) 115 | 116 | return task 117 | 118 | 119 | class ActionPredictor: 120 | """ 121 | Synchronous Action Prediction and Visualization pipeline with AsyncVis. 
122 | """ 123 | 124 | def __init__(self, cfg, async_vis=None, gpu_id=None): 125 | """ 126 | Args: 127 | cfg (CfgNode): configs. Details can be found in 128 | slowfast/config/defaults.py 129 | async_vis (AsyncVis object): asynchronous visualizer. 130 | gpu_id (Optional[int]): GPU id. 131 | """ 132 | self.predictor = Predictor(cfg=cfg, gpu_id=gpu_id) 133 | self.async_vis = async_vis 134 | 135 | def put(self, task): 136 | """ 137 | Make prediction and put the results in `async_vis` task queue. 138 | Args: 139 | task (TaskInfo object): task object that contain 140 | the necessary information for action prediction. (e.g. frames, boxes) 141 | """ 142 | task = self.predictor(task) 143 | self.async_vis.get_indices_ls.append(task.id) 144 | self.async_vis.put(task) 145 | 146 | def get(self): 147 | """ 148 | Get the visualized clips if any. 149 | """ 150 | try: 151 | task = self.async_vis.get() 152 | except (queue.Empty, IndexError): 153 | raise IndexError("Results are not available yet.") 154 | 155 | return task 156 | 157 | 158 | class Detectron2Predictor: 159 | """ 160 | Wrapper around Detectron2 to return the required predicted bounding boxes 161 | as a ndarray. 162 | """ 163 | 164 | def __init__(self, cfg, gpu_id=None): 165 | """ 166 | Args: 167 | cfg (CfgNode): configs. Details can be found in 168 | slowfast/config/defaults.py 169 | gpu_id (Optional[int]): GPU id. 170 | """ 171 | 172 | self.cfg = get_cfg() 173 | self.cfg.merge_from_file( 174 | model_zoo.get_config_file(cfg.DEMO.DETECTRON2_CFG) 175 | ) 176 | self.cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = cfg.DEMO.DETECTRON2_THRESH 177 | self.cfg.MODEL.WEIGHTS = cfg.DEMO.DETECTRON2_WEIGHTS 178 | self.cfg.INPUT.FORMAT = cfg.DEMO.INPUT_FORMAT 179 | if cfg.NUM_GPUS and gpu_id is None: 180 | gpu_id = torch.cuda.current_device() 181 | self.cfg.MODEL.DEVICE = ( 182 | "cuda:{}".format(gpu_id) if cfg.NUM_GPUS > 0 else "cpu" 183 | ) 184 | 185 | logger.info("Initialized Detectron2 Object Detection Model.") 186 | 187 | self.predictor = DefaultPredictor(self.cfg) 188 | 189 | def __call__(self, task): 190 | """ 191 | Return bounding boxes predictions as a tensor. 192 | Args: 193 | task (TaskInfo object): task object that contain 194 | the necessary information for action prediction. (e.g. frames) 195 | Returns: 196 | task (TaskInfo object): the same task info object but filled with 197 | prediction values (a tensor) and the corresponding boxes for 198 | action detection task. 
199 | """ 200 | middle_frame = task.frames[len(task.frames) // 2] 201 | outputs = self.predictor(middle_frame) 202 | # Get only human instances 203 | mask = outputs["instances"].pred_classes == 0 204 | pred_boxes = outputs["instances"].pred_boxes.tensor[mask] 205 | task.add_bboxes(pred_boxes) 206 | 207 | return task 208 | -------------------------------------------------------------------------------- /slurm_scripts/run_multi_node_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --constraint=volta32gb 5 | #SBATCH --cpus-per-task=10 6 | #SBATCH --error=/checkpoint/%u/jobs/%j.err 7 | #SBATCH --gres=gpu:8 8 | #SBATCH --job-name=vtf 9 | #SBATCH --mem=450GB 10 | #SBATCH --nodes=8 11 | #SBATCH --ntasks-per-node=8 12 | #SBATCH --open-mode=append 13 | #SBATCH --output=/checkpoint/%u/jobs/%j.out 14 | #SBATCH --partition=learnfair 15 | #SBATCH --signal=USR1@600 16 | #SBATCH --time=72:00:00 17 | # #SBATCH --mail-type=END,FAIL,REQUEUE 18 | 19 | module load anaconda3 20 | source activate motionformer 21 | 22 | export MASTER_ADDR=${SLURM_NODELIST:0:9}${SLURM_NODELIST:10:4} 23 | export MASTER_PORT=19500 24 | 25 | # debugging flags (optional) 26 | export NCCL_DEBUG=INFO 27 | export PYTHONFAULTHANDLER=1 28 | 29 | # set the network interface 30 | export NCCL_SOCKET_IFNAME=^docker0,lo 31 | echo $SLURMD_NODENAME $SLURM_JOB_ID $CUDA_VISIBLE_DEVICES 32 | master_node=${SLURM_NODELIST:0:9}${SLURM_NODELIST:10:4} 33 | dist_url="tcp://" 34 | dist_url+=$master_node 35 | dist_url+=:40000 36 | echo $dist_url 37 | 38 | 39 | if [ -z "$1" ] 40 | then 41 | CFG='configs/K400/joint_224_16x4.yaml' 42 | else 43 | CFG=$1 44 | fi 45 | 46 | if [ -z "$2" ] 47 | then 48 | ROOT_FOLDER="/checkpoint/motionformer" 49 | else 50 | ROOT_FOLDER=$2 51 | fi 52 | 53 | SAV_FOLDER="${ROOT_FOLDER}/${SLURM_JOB_ID}" 54 | mkdir -p ${SAV_FOLDER} 55 | 56 | # command 57 | srun --label python tools/run_net.py --init_method $dist_url --num_shards 8 --cfg $CFG \ 58 | NUM_GPUS 8 \ 59 | OUTPUT_DIR ${SAV_FOLDER} \ -------------------------------------------------------------------------------- /slurm_scripts/run_single_node_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --constraint=volta32gb 5 | #SBATCH --cpus-per-task=10 6 | #SBATCH --error=/checkpoint/%u/jobs/%j.err 7 | #SBATCH --gres=gpu:8 8 | #SBATCH --job-name=vtf 9 | #SBATCH --mem=450GB 10 | #SBATCH --nodes=1 11 | #SBATCH --ntasks-per-node=8 12 | #SBATCH --open-mode=append 13 | #SBATCH --output=/checkpoint/%u/jobs/%j.out 14 | #SBATCH --partition=learnfair 15 | #SBATCH --signal=USR1@600 16 | #SBATCH --time=72:00:00 17 | # #SBATCH --mail-type=END,FAIL,REQUEUE 18 | 19 | module load anaconda3 20 | source activate motionformer 21 | 22 | export MASTER_ADDR=${SLURM_NODELIST:0:9}${SLURM_NODELIST:10:4} 23 | export MASTER_PORT=19500 24 | 25 | # debugging flags (optional) 26 | export NCCL_DEBUG=INFO 27 | export PYTHONFAULTHANDLER=1 28 | 29 | # set the network interface 30 | export NCCL_SOCKET_IFNAME=^docker0,lo 31 | echo $SLURMD_NODENAME $SLURM_JOB_ID $CUDA_VISIBLE_DEVICES 32 | master_node=${SLURM_NODELIST:0:9}${SLURM_NODELIST:10:4} 33 | dist_url="tcp://" 34 | dist_url+=$master_node 35 | dist_url+=:40000 36 | echo $dist_url 37 | 38 | 39 | if [ -z "$1" ] 40 | then 41 | CFG='configs/K400/joint_224_16x4.yaml' 42 | else 43 | CFG=$1 44 | fi 45 | 46 | if [ -z "$2" ] 47 | then 48 | ROOT_FOLDER="/checkpoint/motionformer" 49 | else 50 | 
ROOT_FOLDER=$2 51 | fi 52 | 53 | SAV_FOLDER="${ROOT_FOLDER}/${SLURM_JOB_ID}" 54 | mkdir -p ${SAV_FOLDER} 55 | 56 | # command 57 | srun --label python tools/run_net.py --init_method $dist_url --num_shards 1 --cfg $CFG \ 58 | NUM_GPUS 8 \ 59 | OUTPUT_DIR ${SAV_FOLDER} \ -------------------------------------------------------------------------------- /slurm_scripts/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --constraint=volta32gb 5 | #SBATCH --cpus-per-task=10 6 | #SBATCH --error=/checkpoint/%u/jobs/%j.err 7 | #SBATCH --gres=gpu:8 8 | #SBATCH --job-name=vtf_test 9 | #SBATCH --mem=450GB 10 | #SBATCH --nodes=1 11 | #SBATCH --ntasks-per-node=8 12 | #SBATCH --open-mode=append 13 | #SBATCH --output=/checkpoint/%u/jobs/%j.out 14 | #SBATCH --partition=learnfair 15 | #SBATCH --signal=USR1@600 16 | #SBATCH --comment=icml21-deadline 17 | #SBATCH --time=12:00:00 18 | # #SBATCH --mail-user=mandelapatrick@fb.com 19 | # #SBATCH --mail-type=END,FAIL,REQUEUE 20 | 21 | module load anaconda3 22 | source activate pysf23_18 23 | 24 | export MASTER_ADDR=${SLURM_NODELIST:0:9}${SLURM_NODELIST:10:4} 25 | export MASTER_PORT=19500 26 | 27 | # debugging flags (optional) 28 | export NCCL_DEBUG=INFO 29 | export PYTHONFAULTHANDLER=1 30 | 31 | # set the network interface 32 | export NCCL_SOCKET_IFNAME=^docker0,lo 33 | echo $SLURMD_NODENAME $SLURM_JOB_ID $CUDA_VISIBLE_DEVICES 34 | master_node=${SLURM_NODELIST:0:9}${SLURM_NODELIST:10:4} 35 | dist_url="tcp://" 36 | dist_url+=$master_node 37 | dist_url+=:40000 38 | echo $dist_url 39 | 40 | if [ -z "$1" ] 41 | then 42 | CFG='configs/Kinetics/ViT_base_ST_8x16.yaml' 43 | else 44 | CFG=$1 45 | fi 46 | if [ -z "$2" ] 47 | then 48 | CKPT_PATH='/checkpoint/mandelapatrick/slowfast/36328386/checkpoints/checkpoint_epoch_00030.pyth' 49 | else 50 | CKPT_PATH=$2 51 | fi 52 | 53 | 54 | SAV_FOLDER="/checkpoint/${USER}/slowfast/${SLURM_JOB_ID}_test" 55 | mkdir -p ${SAV_FOLDER} 56 | 57 | # command 58 | python tools/run_net.py --cfg $CFG \ 59 | NUM_GPUS 8 \ 60 | TRAIN.ENABLE False \ 61 | TEST.CHECKPOINT_FILE_PATH $CKPT_PATH \ -------------------------------------------------------------------------------- /tools/benchmark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | """ 4 | A script to benchmark data loading. 5 | """ 6 | 7 | import slowfast.utils.logging as logging 8 | from slowfast.utils.benchmark import benchmark_data_loading 9 | from slowfast.utils.misc import launch_job 10 | from slowfast.utils.parser import load_config, parse_args 11 | 12 | logger = logging.get_logger(__name__) 13 | 14 | 15 | def main(): 16 | args = parse_args() 17 | cfg = load_config(args) 18 | 19 | launch_job( 20 | cfg=cfg, init_method=args.init_method, func=benchmark_data_loading 21 | ) 22 | 23 | 24 | if __name__ == "__main__": 25 | main() 26 | -------------------------------------------------------------------------------- /tools/run_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
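# Launch sketch (annotation, not part of the original source): main() runs
# training and/or multi-clip testing depending on TRAIN.ENABLE and TEST.ENABLE.
# An evaluation-only run, mirroring slurm_scripts/test.sh, would look like:
#     python tools/run_net.py --cfg $CFG NUM_GPUS 8 \
#         TRAIN.ENABLE False TEST.CHECKPOINT_FILE_PATH $CKPT_PATH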
3 | 4 | """Wrapper to train and test a video classification model.""" 5 | from slowfast.utils.misc import launch_job 6 | from slowfast.utils.parser import load_config, parse_args 7 | 8 | from test_net import test 9 | from train_net import train 10 | 11 | 12 | def main(): 13 | """ 14 | Main function to spawn the train and test process. 15 | """ 16 | args = parse_args() 17 | cfg = load_config(args) 18 | 19 | # Perform training. 20 | if cfg.TRAIN.ENABLE: 21 | launch_job(cfg=cfg, init_method=args.init_method, func=train) 22 | 23 | # Perform multi-clip testing. 24 | if cfg.TEST.ENABLE: 25 | launch_job(cfg=cfg, init_method=args.init_method, func=test) 26 | 27 | 28 | if __name__ == "__main__": 29 | main() 30 | -------------------------------------------------------------------------------- /tools/test_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 3 | 4 | """Multi-view test a video classification model.""" 5 | 6 | import numpy as np 7 | import os 8 | import pickle 9 | import torch 10 | from iopath.common.file_io import g_pathmgr 11 | 12 | import slowfast.utils.checkpoint as cu 13 | import slowfast.utils.distributed as du 14 | import slowfast.utils.logging as logging 15 | import slowfast.utils.misc as misc 16 | import slowfast.visualization.tensorboard_vis as tb 17 | from slowfast.datasets import loader 18 | from slowfast.models import build_model 19 | from slowfast.utils.meters import AVAMeter, TestMeter, EPICTestMeter 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | 24 | @torch.no_grad() 25 | def perform_test(test_loader, model, test_meter, cfg, writer=None): 26 | """ 27 | For classification: 28 | Perform multi-view testing that uniformly samples N clips from a video along 29 | its temporal axis. For each clip, it takes 3 crops to cover the spatial 30 | dimension, followed by averaging the softmax scores across all Nx3 views to 31 | form a video-level prediction. All video predictions are compared to 32 | ground-truth labels and the final testing performance is logged. 33 | For detection: 34 | Perform fully-convolutional testing on the full frames without cropping. 35 | Args: 36 | test_loader (loader): video testing loader. 37 | model (model): the pretrained video model to test. 38 | test_meter (TestMeter): testing meters to log and ensemble the testing 39 | results. 40 | cfg (CfgNode): configs. Details can be found in 41 | slowfast/config/defaults.py 42 | writer (TensorboardWriter object, optional): TensorboardWriter object 43 | to write Tensorboard logs. 44 | """ 45 | # Enable eval mode. 46 | model.eval() 47 | test_meter.iter_tic() 48 | 49 | for cur_iter, (inputs, labels, video_idx, meta) in enumerate(test_loader): 50 | if cfg.NUM_GPUS: 51 | # Transfer the data to the current GPU device. 52 | if isinstance(inputs, (list,)): 53 | for i in range(len(inputs)): 54 | inputs[i] = inputs[i].cuda(non_blocking=True) 55 | else: 56 | inputs = inputs.cuda(non_blocking=True) 57 | 58 | # Transfer the labels and metadata to the current GPU device.
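# Annotation (not part of the original source): EPIC-Kitchens loaders return
# `labels` as a dict with 'verb' and 'noun' tensors, so both heads are moved
# to the GPU below; for single-label datasets `labels` is a plain tensor.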
59 | if isinstance(labels, (dict,)): 60 | labels = {k: v.cuda() for k, v in labels.items()} 61 | else: 62 | labels = labels.cuda() 63 | video_idx = video_idx.cuda() 64 | for key, val in meta.items(): 65 | if isinstance(val, (list,)): 66 | for i in range(len(val)): 67 | if not isinstance(val[i], (str,)): 68 | val[i] = val[i].cuda(non_blocking=True) 69 | else: 70 | meta[key] = val.cuda(non_blocking=True) 71 | test_meter.data_toc() 72 | 73 | with torch.cuda.amp.autocast(enabled=cfg.SOLVER.USE_MIXED_PRECISION): 74 | # Perform the forward pass. 75 | shuffle_frames = cfg.TEST.SHUFFLE_FRAMES 76 | if shuffle_frames: 77 | N = len(inputs) 78 | B, C, T, H, W = inputs[0].shape 79 | shuffled_indices = np.random.permutation(T) 80 | inputs = [inputs[0][:, :, shuffled_indices, :, :]] 81 | preds = model(inputs) 82 | 83 | # Gather all the predictions across all the devices to perform ensemble. 84 | if isinstance(labels, (dict,)): 85 | # Gather all the predictions across all the devices to perform ensemble. 86 | if cfg.NUM_GPUS > 1: 87 | verb_preds, verb_labels, video_idx = du.all_gather( 88 | [preds[0], labels['verb'], video_idx] 89 | ) 90 | 91 | noun_preds, noun_labels, video_idx = du.all_gather( 92 | [preds[1], labels['noun'], video_idx] 93 | ) 94 | meta = du.all_gather_unaligned(meta) 95 | metadata = {'narration_id': []} 96 | for i in range(len(meta)): 97 | metadata['narration_id'].extend(meta[i]['narration_id']) 98 | else: 99 | metadata = meta 100 | verb_preds, verb_labels, video_idx = preds[0], labels['verb'], video_idx 101 | noun_preds, noun_labels, video_idx = preds[1], labels['noun'], video_idx 102 | test_meter.iter_toc() 103 | # Update and log stats. 104 | test_meter.update_stats( 105 | (verb_preds.detach().cpu(), noun_preds.detach().cpu()), 106 | (verb_labels.detach().cpu(), noun_labels.detach().cpu()), 107 | metadata, 108 | video_idx.detach().cpu(), 109 | ) 110 | test_meter.log_iter_stats(cur_iter) 111 | else: 112 | if cfg.NUM_GPUS > 1: 113 | preds, labels, video_idx = du.all_gather( 114 | [preds, labels, video_idx] 115 | ) 116 | if cfg.NUM_GPUS: 117 | preds = preds.cpu() 118 | labels = labels.cpu() 119 | video_idx = video_idx.cpu() 120 | 121 | test_meter.iter_toc() 122 | # Update and log stats. 123 | test_meter.update_stats( 124 | preds.detach(), labels.detach(), video_idx.detach() 125 | ) 126 | test_meter.log_iter_stats(cur_iter) 127 | 128 | test_meter.iter_tic() 129 | 130 | # Log epoch stats and print the final testing results. 
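# Annotation (not part of the original source): for EPIC-Kitchens the verb/noun
# scores are pickled to OUTPUT_DIR/scores/validation.pkl, while for other
# datasets the ensembled video-level predictions live in the test meter and can
# optionally be dumped via TEST.SAVE_RESULTS_PATH.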
131 | if not cfg.DETECTION.ENABLE: 132 | if cfg.TEST.DATASET == 'Epickitchens': 133 | if du.is_master_proc(): 134 | results = {'verb_output': preds[0], 135 | 'noun_output': preds[1], 136 | 'narration_id': metadata} 137 | scores_path = os.path.join(cfg.OUTPUT_DIR, 'scores') 138 | if not os.path.exists(scores_path): 139 | os.makedirs(scores_path) 140 | TEST_SPLIT = "validation" 141 | file_path = os.path.join(scores_path, TEST_SPLIT + '.pkl') 142 | pickle.dump(results, open(file_path, 'wb')) 143 | else: 144 | all_preds = test_meter.video_preds.clone().detach() 145 | all_labels = test_meter.video_labels 146 | if cfg.NUM_GPUS: 147 | all_preds = all_preds.cpu() 148 | all_labels = all_labels.cpu() 149 | if writer is not None: 150 | writer.plot_eval(preds=all_preds, labels=all_labels) 151 | 152 | if cfg.TEST.SAVE_RESULTS_PATH != "": 153 | save_path = os.path.join(cfg.OUTPUT_DIR, cfg.TEST.SAVE_RESULTS_PATH) 154 | 155 | if du.is_root_proc(): 156 | with g_pathmgr.open(save_path, "wb") as f: 157 | pickle.dump([all_preds, all_labels], f) 158 | 159 | logger.info( 160 | "Successfully saved prediction results to {}".format(save_path) 161 | ) 162 | 163 | test_meter.finalize_metrics() 164 | return test_meter 165 | 166 | 167 | def test(cfg): 168 | """ 169 | Perform multi-view testing on the pretrained video model. 170 | Args: 171 | cfg (CfgNode): configs. Details can be found in 172 | slowfast/config/defaults.py 173 | """ 174 | # Set up environment. 175 | du.init_distributed_training(cfg) 176 | # Set random seed from configs. 177 | np.random.seed(cfg.RNG_SEED) 178 | torch.manual_seed(cfg.RNG_SEED) 179 | 180 | # Setup logging format. 181 | logging.setup_logging(cfg.OUTPUT_DIR) 182 | 183 | # Print config. 184 | logger.info("Test with config:") 185 | logger.info(cfg) 186 | 187 | # Build the video model and print model statistics. 188 | model = build_model(cfg) 189 | if du.is_master_proc() and cfg.LOG_MODEL_INFO: 190 | misc.log_model_info(model, cfg, use_train_input=False) 191 | 192 | cu.load_test_checkpoint(cfg, model) 193 | 194 | # Create video testing loaders. 195 | test_loader = loader.construct_loader(cfg, "test") 196 | logger.info("Testing model for {} iterations".format(len(test_loader))) 197 | 198 | if cfg.DETECTION.ENABLE: 199 | assert cfg.NUM_GPUS == cfg.TEST.BATCH_SIZE or cfg.NUM_GPUS == 0 200 | test_meter = AVAMeter(len(test_loader), cfg, mode="test") 201 | else: 202 | assert ( 203 | test_loader.dataset.num_videos 204 | % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS) 205 | == 0 206 | ) 207 | # Create meters for multi-view testing. 208 | if cfg.TEST.DATASET == 'Epickitchens': 209 | test_meter = EPICTestMeter( 210 | len(test_loader.dataset) 211 | // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS), 212 | cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS, 213 | [97, 300], 214 | len(test_loader), 215 | ) 216 | else: 217 | test_meter = TestMeter( 218 | len(test_loader.dataset) 219 | // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS), 220 | cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS, 221 | cfg.MODEL.NUM_CLASSES, 222 | len(test_loader), 223 | ) 224 | 225 | # Set up writer for logging to Tensorboard format. 226 | if cfg.TENSORBOARD.ENABLE and du.is_master_proc( 227 | cfg.NUM_GPUS * cfg.NUM_SHARDS 228 | ): 229 | writer = tb.TensorboardWriter(cfg) 230 | else: 231 | writer = None 232 | 233 | # # Perform multi-view test on the entire dataset. 
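# Sizing note (annotation, not part of the original source): the meters above
# reserve one slot per video, i.e.
#     len(test_loader.dataset) // (TEST.NUM_ENSEMBLE_VIEWS * TEST.NUM_SPATIAL_CROPS)
# slots, and ensemble the scores of the NUM_ENSEMBLE_VIEWS x NUM_SPATIAL_CROPS
# clips per video (e.g. 10 temporal views x 3 spatial crops = 30 clips).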
234 | test_meter = perform_test(test_loader, model, test_meter, cfg, writer) 235 | if writer is not None: 236 | writer.close() 237 | --------------------------------------------------------------------------------