├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── configs ├── Kinetics │ ├── SLOWFAST_4x16_R50.yaml │ ├── SLOWFAST_8x8_R101.yaml │ ├── SLOWFAST_8x8_R50.yaml │ ├── TimeSformer_divST_16x16_448.yaml │ ├── TimeSformer_divST_8x32_224.yaml │ ├── TimeSformer_divST_8x32_224_4gpus.yaml │ ├── TimeSformer_divST_8x32_224_TEST.yaml │ ├── TimeSformer_divST_96x4_224.yaml │ ├── TimeSformer_jointST_8x32_224.yaml │ └── TimeSformer_spaceOnly_8x32_224.yaml └── SSv2 │ ├── SLOWFAST_16x8_R50.yaml │ ├── TimeSformer_divST_16_448.yaml │ ├── TimeSformer_divST_64_224.yaml │ └── TimeSformer_divST_8_224.yaml ├── environment.yml ├── example.ipynb ├── setup.cfg ├── setup.py ├── slurm_scripts ├── run_multi_node_job.sh └── run_single_node_job.sh ├── timesformer ├── __init__.py ├── config │ ├── __init__.py │ └── defaults.py ├── datasets │ ├── DATASET.md │ ├── __init__.py │ ├── build.py │ ├── cv2_transform.py │ ├── decoder.py │ ├── kinetics.py │ ├── loader.py │ ├── multigrid_helper.py │ ├── ssv2.py │ ├── transform.py │ ├── utils.py │ └── video_container.py ├── models │ ├── __init__.py │ ├── batchnorm_helper.py │ ├── build.py │ ├── conv2d_same.py │ ├── custom_video_model_builder.py │ ├── features.py │ ├── head_helper.py │ ├── helpers.py │ ├── linear.py │ ├── losses.py │ ├── nonlocal_helper.py │ ├── operators.py │ ├── optimizer.py │ ├── resnet_helper.py │ ├── stem_helper.py │ ├── video_model_builder.py │ ├── vit.py │ └── vit_utils.py ├── utils │ ├── __init__.py │ ├── ava_eval_helper.py │ ├── ava_evaluation │ │ ├── README.md │ │ ├── __init__.py │ │ ├── ava_action_list_v2.1_for_activitynet_2018.pbtxt.txt │ │ ├── label_map_util.py │ │ ├── metrics.py │ │ ├── np_box_list.py │ │ ├── np_box_list_ops.py │ │ ├── np_box_mask_list.py │ │ ├── np_box_mask_list_ops.py │ │ ├── np_box_ops.py │ │ ├── np_mask_ops.py │ │ ├── object_detection_evaluation.py │ │ ├── per_image_evaluation.py │ │ └── standard_fields.py │ ├── benchmark.py │ ├── bn_helper.py │ ├── c2_model_loading.py │ ├── checkpoint.py │ ├── distributed.py │ ├── env.py │ ├── logging.py │ ├── lr_policy.py │ ├── meters.py │ ├── metrics.py │ ├── misc.py │ ├── multigrid.py │ ├── multiprocessing.py │ ├── parser.py │ └── weight_init_helper.py └── visualization │ ├── __init__.py │ ├── tensorboard_vis.py │ └── utils.py └── tools ├── benchmark.py ├── run_net.py ├── submit.py ├── test_net.py ├── train_net.py └── visualization.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # Docker file from Python is inspired from here : 6 | # https://github.com/github/gitignore/blob/master/Python.gitignore 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | tests/report/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # pytype static type analyzer 139 | .pytype/ 140 | 141 | 142 | # Cython debug symbols 143 | cython_debug/ 144 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to TimeSformer 2 | 3 | ## Pull Requests 4 | We actively welcome your pull requests. 5 | 6 | 1. Fork the repo and create your branch from `master`. 7 | 2. If you've added code that should be tested, add tests. 8 | 3. If you've changed APIs, update the documentation. 9 | 4. Ensure the test suite passes. 10 | 5. Make sure your code lints. 11 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 12 | 13 | ## Contributor License Agreement ("CLA") 14 | In order to accept your pull request, we need you to submit a CLA. You only need 15 | to do this once to work on any of Facebook's open source projects. 16 | 17 | Complete your CLA here: 18 | 19 | ## Issues 20 | We use GitHub issues to track public bugs. 
Please ensure your description is 21 | clear and has sufficient instructions to be able to reproduce the issue. 22 | 23 | ## License 24 | By contributing to TimeSformer, you agree that your contributions will be licensed 25 | under the [LICENSE.md](LICENSE.md) file in the root directory of this source tree. 26 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOWFAST_4x16_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 32 11 | SAMPLING_RATE: 2 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 256 15 | INPUT_CHANNEL_NUM: [3, 3] 16 | SLOWFAST: 17 | ALPHA: 8 18 | BETA_INV: 8 19 | FUSION_CONV_CHANNEL_RATIO: 2 20 | FUSION_KERNEL_SZ: 5 21 | RESNET: 22 | ZERO_INIT_FINAL_BN: True 23 | WIDTH_PER_GROUP: 64 24 | NUM_GROUPS: 1 25 | DEPTH: 50 26 | TRANS_FUNC: bottleneck_transform 27 | STRIDE_1X1: False 28 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 29 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 30 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 31 | NONLOCAL: 32 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 33 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 34 | INSTANTIATION: dot_product 35 | BN: 36 | USE_PRECISE_STATS: True 37 | NUM_BATCHES_PRECISE: 200 38 | SOLVER: 39 | BASE_LR: 0.8 40 | LR_POLICY: cosine 41 | MAX_EPOCH: 196 42 | MOMENTUM: 0.9 43 | WEIGHT_DECAY: 1e-4 44 | WARMUP_EPOCHS: 34.0 45 | WARMUP_START_LR: 0.01 46 | OPTIMIZING_METHOD: sgd 47 | MODEL: 48 | NUM_CLASSES: 400 49 | ARCH: slowfast 50 | MODEL_NAME: SlowFast 51 | LOSS_FUNC: cross_entropy 52 | DROPOUT_RATE: 0.5 53 | TEST: 54 | ENABLE: True 55 | DATASET: kinetics 56 | BATCH_SIZE: 64 57 | DATA_LOADER: 58 | NUM_WORKERS: 8 59 | PIN_MEMORY: True 60 | NUM_GPUS: 8 61 | NUM_SHARDS: 1 62 | RNG_SEED: 0 63 | OUTPUT_DIR: . 
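Note: every YAML under `configs/` is consumed by `tools/run_net.py`, and any key in the file can be overridden by appending `KEY VALUE` pairs after `--cfg`, the same mechanism used in the SLURM scripts later in this listing. A minimal single-node launch sketch for the config above, with the dataset path left as a placeholder to adapt:

```bash
# Single-node launch sketch for the SlowFast 4x16 R50 config above.
# KEY VALUE pairs after --cfg override the corresponding YAML entries
# (same mechanism as slurm_scripts/run_single_node_job.sh).
python tools/run_net.py \
  --cfg configs/Kinetics/SLOWFAST_4x16_R50.yaml \
  DATA.PATH_TO_DATA_DIR /path/to/kinetics \
  NUM_GPUS 8 \
  TRAIN.BATCH_SIZE 64
```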
64 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOWFAST_8x8_R101.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 32 11 | SAMPLING_RATE: 2 12 | TRAIN_JITTER_SCALES: [256, 340] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 256 15 | INPUT_CHANNEL_NUM: [3, 3] 16 | SLOWFAST: 17 | ALPHA: 4 18 | BETA_INV: 8 19 | FUSION_CONV_CHANNEL_RATIO: 2 20 | FUSION_KERNEL_SZ: 5 21 | RESNET: 22 | ZERO_INIT_FINAL_BN: True 23 | WIDTH_PER_GROUP: 64 24 | NUM_GROUPS: 1 25 | DEPTH: 101 26 | TRANS_FUNC: bottleneck_transform 27 | STRIDE_1X1: False 28 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 29 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 30 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 31 | NONLOCAL: 32 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 33 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 34 | INSTANTIATION: dot_product 35 | BN: 36 | USE_PRECISE_STATS: True 37 | NUM_BATCHES_PRECISE: 200 38 | SOLVER: 39 | BASE_LR: 0.8 ## 8 nodes 40 | LR_POLICY: cosine 41 | MAX_EPOCH: 196 42 | MOMENTUM: 0.9 43 | WEIGHT_DECAY: 1e-4 44 | WARMUP_EPOCHS: 34.0 45 | WARMUP_START_LR: 0.01 46 | OPTIMIZING_METHOD: sgd 47 | MODEL: 48 | NUM_CLASSES: 400 49 | ARCH: slowfast 50 | MODEL_NAME: SlowFast 51 | LOSS_FUNC: cross_entropy 52 | DROPOUT_RATE: 0.5 53 | TEST: 54 | ENABLE: True 55 | DATASET: kinetics 56 | BATCH_SIZE: 64 57 | DATA_LOADER: 58 | NUM_WORKERS: 8 59 | PIN_MEMORY: True 60 | NUM_GPUS: 8 61 | NUM_SHARDS: 1 62 | RNG_SEED: 0 63 | OUTPUT_DIR: . 64 | -------------------------------------------------------------------------------- /configs/Kinetics/SLOWFAST_8x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 64 5 | EVAL_PERIOD: 10 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 32 11 | SAMPLING_RATE: 2 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 256 15 | INPUT_CHANNEL_NUM: [3, 3] 16 | SLOWFAST: 17 | ALPHA: 4 18 | BETA_INV: 8 19 | FUSION_CONV_CHANNEL_RATIO: 2 20 | FUSION_KERNEL_SZ: 7 21 | RESNET: 22 | ZERO_INIT_FINAL_BN: True 23 | WIDTH_PER_GROUP: 64 24 | NUM_GROUPS: 1 25 | DEPTH: 50 26 | TRANS_FUNC: bottleneck_transform 27 | STRIDE_1X1: False 28 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 29 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 30 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 31 | NONLOCAL: 32 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 33 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 34 | INSTANTIATION: dot_product 35 | BN: 36 | USE_PRECISE_STATS: True 37 | NUM_BATCHES_PRECISE: 200 38 | SOLVER: 39 | BASE_LR: 0.8 40 | LR_POLICY: cosine 41 | MAX_EPOCH: 196 42 | MOMENTUM: 0.9 43 | WEIGHT_DECAY: 1e-4 44 | WARMUP_EPOCHS: 34.0 45 | WARMUP_START_LR: 0.01 46 | OPTIMIZING_METHOD: sgd 47 | MODEL: 48 | NUM_CLASSES: 400 49 | ARCH: slowfast 50 | MODEL_NAME: SlowFast 51 | LOSS_FUNC: cross_entropy 52 | DROPOUT_RATE: 0.5 53 | TEST: 54 | ENABLE: True 55 | DATASET: kinetics 56 | BATCH_SIZE: 64 57 | DATA_LOADER: 58 | NUM_WORKERS: 8 59 | PIN_MEMORY: True 60 | NUM_GPUS: 8 61 | NUM_SHARDS: 1 62 | RNG_SEED: 0 63 | OUTPUT_DIR: . 
64 | -------------------------------------------------------------------------------- /configs/Kinetics/TimeSformer_divST_16x16_448.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 16 11 | SAMPLING_RATE: 16 12 | TRAIN_JITTER_SCALES: [448, 512] 13 | TRAIN_CROP_SIZE: 448 14 | TEST_CROP_SIZE: 448 15 | INPUT_CHANNEL_NUM: [3] 16 | TIMESFORMER: 17 | ATTENTION_TYPE: 'divided_space_time' 18 | SOLVER: 19 | BASE_LR: 0.005 20 | LR_POLICY: steps_with_relative_lrs 21 | STEPS: [0, 11, 14] 22 | LRS: [1, 0.1, 0.01] 23 | MAX_EPOCH: 15 24 | MOMENTUM: 0.9 25 | WEIGHT_DECAY: 1e-4 26 | OPTIMIZING_METHOD: sgd 27 | MODEL: 28 | MODEL_NAME: vit_base_patch16_224 29 | NUM_CLASSES: 400 30 | ARCH: vit 31 | LOSS_FUNC: cross_entropy 32 | DROPOUT_RATE: 0.5 33 | TEST: 34 | ENABLE: True 35 | DATASET: kinetics 36 | BATCH_SIZE: 8 37 | NUM_ENSEMBLE_VIEWS: 1 38 | NUM_SPATIAL_CROPS: 3 39 | DATA_LOADER: 40 | NUM_WORKERS: 8 41 | PIN_MEMORY: True 42 | NUM_GPUS: 8 43 | NUM_SHARDS: 1 44 | RNG_SEED: 0 45 | OUTPUT_DIR: . 46 | -------------------------------------------------------------------------------- /configs/Kinetics/TimeSformer_divST_8x32_224.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 8 11 | SAMPLING_RATE: 32 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 224 15 | INPUT_CHANNEL_NUM: [3] 16 | TIMESFORMER: 17 | ATTENTION_TYPE: 'divided_space_time' 18 | SOLVER: 19 | BASE_LR: 0.005 20 | LR_POLICY: steps_with_relative_lrs 21 | STEPS: [0, 11, 14] 22 | LRS: [1, 0.1, 0.01] 23 | MAX_EPOCH: 15 24 | MOMENTUM: 0.9 25 | WEIGHT_DECAY: 1e-4 26 | OPTIMIZING_METHOD: sgd 27 | MODEL: 28 | MODEL_NAME: vit_base_patch16_224 29 | NUM_CLASSES: 400 30 | ARCH: vit 31 | LOSS_FUNC: cross_entropy 32 | DROPOUT_RATE: 0.5 33 | TEST: 34 | ENABLE: True 35 | DATASET: kinetics 36 | BATCH_SIZE: 8 37 | NUM_ENSEMBLE_VIEWS: 1 38 | NUM_SPATIAL_CROPS: 3 39 | DATA_LOADER: 40 | NUM_WORKERS: 8 41 | PIN_MEMORY: True 42 | NUM_GPUS: 8 43 | NUM_SHARDS: 1 44 | RNG_SEED: 0 45 | OUTPUT_DIR: . 
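For multi-node training of the `TimeSformer_divST_8x32_224` config above, `slurm_scripts/run_multi_node_job.sh` (included later in this listing) submits the job through `tools/submit.py`; the sketch below mirrors that script, with the job directory and partition as placeholders for your cluster and `--num_shards` giving the number of nodes:

```bash
# Multi-node SLURM submission sketch, mirroring slurm_scripts/run_multi_node_job.sh;
# /your/job/dir and the partition name are placeholders.
JOB_NAME=TimeSformer_divST_8x32_224
python tools/submit.py \
  --cfg configs/Kinetics/TimeSformer_divST_8x32_224.yaml \
  --job_dir /your/job/dir/${JOB_NAME}/ \
  --num_shards 4 \
  --partition dev \
  --comment "" \
  --name ${JOB_NAME} \
  --use_volta32
```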
46 | -------------------------------------------------------------------------------- /configs/Kinetics/TimeSformer_divST_8x32_224_4gpus.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 4 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 8 11 | SAMPLING_RATE: 32 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 224 15 | INPUT_CHANNEL_NUM: [3] 16 | TIMESFORMER: 17 | ATTENTION_TYPE: 'divided_space_time' 18 | SOLVER: 19 | BASE_LR: 0.005 20 | LR_POLICY: steps_with_relative_lrs 21 | STEPS: [0, 11, 14] 22 | LRS: [1, 0.1, 0.01] 23 | MAX_EPOCH: 15 24 | MOMENTUM: 0.9 25 | WEIGHT_DECAY: 1e-4 26 | OPTIMIZING_METHOD: sgd 27 | MODEL: 28 | MODEL_NAME: vit_base_patch16_224 29 | NUM_CLASSES: 400 30 | ARCH: vit 31 | LOSS_FUNC: cross_entropy 32 | DROPOUT_RATE: 0.5 33 | TEST: 34 | ENABLE: True 35 | DATASET: kinetics 36 | BATCH_SIZE: 4 37 | NUM_ENSEMBLE_VIEWS: 1 38 | NUM_SPATIAL_CROPS: 3 39 | DATA_LOADER: 40 | NUM_WORKERS: 4 41 | PIN_MEMORY: True 42 | NUM_GPUS: 4 43 | NUM_SHARDS: 1 44 | RNG_SEED: 0 45 | OUTPUT_DIR: . 46 | -------------------------------------------------------------------------------- /configs/Kinetics/TimeSformer_divST_8x32_224_TEST.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: False 3 | DATASET: kinetics 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 8 11 | SAMPLING_RATE: 32 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 224 15 | INPUT_CHANNEL_NUM: [3] 16 | TIMESFORMER: 17 | ATTENTION_TYPE: 'divided_space_time' 18 | SOLVER: 19 | BASE_LR: 0.005 20 | LR_POLICY: steps_with_relative_lrs 21 | STEPS: [0, 11, 14] 22 | LRS: [1, 0.1, 0.01] 23 | MAX_EPOCH: 15 24 | MOMENTUM: 0.9 25 | WEIGHT_DECAY: 1e-4 26 | OPTIMIZING_METHOD: sgd 27 | MODEL: 28 | MODEL_NAME: vit_base_patch16_224 29 | NUM_CLASSES: 400 30 | ARCH: vit 31 | LOSS_FUNC: cross_entropy 32 | DROPOUT_RATE: 0.5 33 | TEST: 34 | ENABLE: True 35 | DATASET: kinetics 36 | BATCH_SIZE: 8 37 | NUM_ENSEMBLE_VIEWS: 1 38 | NUM_SPATIAL_CROPS: 3 39 | CHECKPOINT_FILE_PATH: '/checkpoint/gedas/jobs/timesformer/kinetics_400/TimeSformer_divST_8x32_224/checkpoints/checkpoint_epoch_00025.pyth' 40 | DATA_LOADER: 41 | NUM_WORKERS: 8 42 | PIN_MEMORY: True 43 | NUM_GPUS: 8 44 | NUM_SHARDS: 1 45 | RNG_SEED: 0 46 | OUTPUT_DIR: . 
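The `_TEST` variant above disables training (`TRAIN.ENABLE: False`) and hard-codes a cluster-specific checkpoint path, so evaluating your own model typically means overriding `TEST.CHECKPOINT_FILE_PATH` on the command line. A hedged sketch, assuming the same `KEY VALUE` override mechanism and a placeholder checkpoint path:

```bash
# Evaluation-only sketch: training is already disabled in this config, so only
# the test phase runs; the checkpoint path below is a placeholder.
python tools/run_net.py \
  --cfg configs/Kinetics/TimeSformer_divST_8x32_224_TEST.yaml \
  TEST.CHECKPOINT_FILE_PATH /path/to/your/checkpoint.pyth \
  NUM_GPUS 8
```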
47 | -------------------------------------------------------------------------------- /configs/Kinetics/TimeSformer_divST_96x4_224.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 96 11 | SAMPLING_RATE: 4 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 224 15 | INPUT_CHANNEL_NUM: [3] 16 | TIMESFORMER: 17 | ATTENTION_TYPE: 'divided_space_time' 18 | SOLVER: 19 | BASE_LR: 0.005 20 | LR_POLICY: steps_with_relative_lrs 21 | STEPS: [0, 11, 14] 22 | LRS: [1, 0.1, 0.01] 23 | MAX_EPOCH: 15 24 | MOMENTUM: 0.9 25 | WEIGHT_DECAY: 1e-4 26 | OPTIMIZING_METHOD: sgd 27 | MODEL: 28 | MODEL_NAME: vit_base_patch16_224 29 | NUM_CLASSES: 400 30 | ARCH: vit 31 | LOSS_FUNC: cross_entropy 32 | DROPOUT_RATE: 0.5 33 | TEST: 34 | ENABLE: True 35 | DATASET: kinetics 36 | BATCH_SIZE: 8 37 | NUM_ENSEMBLE_VIEWS: 1 38 | NUM_SPATIAL_CROPS: 3 39 | DATA_LOADER: 40 | NUM_WORKERS: 8 41 | PIN_MEMORY: True 42 | NUM_GPUS: 8 43 | NUM_SHARDS: 1 44 | RNG_SEED: 0 45 | OUTPUT_DIR: . 46 | -------------------------------------------------------------------------------- /configs/Kinetics/TimeSformer_jointST_8x32_224.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 8 11 | SAMPLING_RATE: 32 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 224 15 | INPUT_CHANNEL_NUM: [3] 16 | TIMESFORMER: 17 | ATTENTION_TYPE: 'joint_space_time' 18 | SOLVER: 19 | BASE_LR: 0.005 20 | LR_POLICY: steps_with_relative_lrs 21 | STEPS: [0, 11, 14] 22 | LRS: [1, 0.1, 0.01] 23 | MAX_EPOCH: 15 24 | MOMENTUM: 0.9 25 | WEIGHT_DECAY: 1e-4 26 | OPTIMIZING_METHOD: sgd 27 | MODEL: 28 | MODEL_NAME: vit_base_patch16_224 29 | NUM_CLASSES: 400 30 | ARCH: vit 31 | LOSS_FUNC: cross_entropy 32 | DROPOUT_RATE: 0.5 33 | TEST: 34 | ENABLE: True 35 | DATASET: kinetics 36 | BATCH_SIZE: 8 37 | NUM_ENSEMBLE_VIEWS: 1 38 | NUM_SPATIAL_CROPS: 3 39 | DATA_LOADER: 40 | NUM_WORKERS: 8 41 | PIN_MEMORY: True 42 | NUM_GPUS: 8 43 | NUM_SHARDS: 1 44 | RNG_SEED: 0 45 | OUTPUT_DIR: . 
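The divided, joint, and space-only Kinetics TimeSformer configs (the last one follows below) are otherwise identical and differ only in `TIMESFORMER.ATTENTION_TYPE`, so the attention variant can also be switched from the command line rather than kept in separate files; a sketch, assuming the usual override mechanism:

```bash
# Switch the attention variant by overriding a single key (sketch).
# Values used across these configs: divided_space_time, joint_space_time, space_only.
python tools/run_net.py \
  --cfg configs/Kinetics/TimeSformer_divST_8x32_224.yaml \
  TIMESFORMER.ATTENTION_TYPE joint_space_time
```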
46 | -------------------------------------------------------------------------------- /configs/Kinetics/TimeSformer_spaceOnly_8x32_224.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: kinetics 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: /path/to/kinetics/ 10 | NUM_FRAMES: 8 11 | SAMPLING_RATE: 32 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 224 15 | INPUT_CHANNEL_NUM: [3] 16 | TIMESFORMER: 17 | ATTENTION_TYPE: 'space_only' 18 | SOLVER: 19 | BASE_LR: 0.005 20 | LR_POLICY: steps_with_relative_lrs 21 | STEPS: [0, 11, 14] 22 | LRS: [1, 0.1, 0.01] 23 | MAX_EPOCH: 15 24 | MOMENTUM: 0.9 25 | WEIGHT_DECAY: 1e-4 26 | OPTIMIZING_METHOD: sgd 27 | MODEL: 28 | MODEL_NAME: vit_base_patch16_224 29 | NUM_CLASSES: 400 30 | ARCH: vit 31 | LOSS_FUNC: cross_entropy 32 | DROPOUT_RATE: 0.5 33 | TEST: 34 | ENABLE: True 35 | DATASET: kinetics 36 | BATCH_SIZE: 8 37 | NUM_ENSEMBLE_VIEWS: 1 38 | NUM_SPATIAL_CROPS: 3 39 | DATA_LOADER: 40 | NUM_WORKERS: 8 41 | PIN_MEMORY: True 42 | NUM_GPUS: 8 43 | NUM_SHARDS: 1 44 | RNG_SEED: 0 45 | OUTPUT_DIR: . 46 | -------------------------------------------------------------------------------- /configs/SSv2/SLOWFAST_16x8_R50.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: ssv2 4 | BATCH_SIZE: 16 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: " /path/to/ssv2/annotations/" 10 | PATH_PREFIX: "/path/to/ssv2/frames/" 11 | NUM_FRAMES: 64 12 | SAMPLING_RATE: 2 13 | TRAIN_JITTER_SCALES: [256, 320] 14 | TRAIN_CROP_SIZE: 224 15 | TEST_CROP_SIZE: 256 16 | INPUT_CHANNEL_NUM: [3, 3] 17 | INV_UNIFORM_SAMPLE: True 18 | RANDOM_FLIP: False 19 | REVERSE_INPUT_CHANNEL: True 20 | SLOWFAST: 21 | ALPHA: 4 22 | BETA_INV: 8 23 | FUSION_CONV_CHANNEL_RATIO: 2 24 | FUSION_KERNEL_SZ: 7 25 | RESNET: 26 | SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]] 27 | SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]] 28 | ZERO_INIT_FINAL_BN: True 29 | WIDTH_PER_GROUP: 64 30 | NUM_GROUPS: 1 31 | DEPTH: 50 32 | TRANS_FUNC: bottleneck_transform 33 | STRIDE_1X1: False 34 | NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]] 35 | NONLOCAL: 36 | LOCATION: [[[], []], [[], []], [[], []], [[], []]] 37 | GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]] 38 | INSTANTIATION: dot_product 39 | BN: 40 | USE_PRECISE_STATS: True 41 | NUM_BATCHES_PRECISE: 200 42 | NORM_TYPE: sync_batchnorm 43 | NUM_SYNC_DEVICES: 4 44 | SOLVER: 45 | BASE_LR: 0.2 #8 nodes 46 | LR_POLICY: cosine 47 | MAX_EPOCH: 200 48 | MOMENTUM: 0.9 49 | WEIGHT_DECAY: 1e-4 50 | WARMUP_EPOCHS: 34.0 51 | WARMUP_START_LR: 0.01 52 | OPTIMIZING_METHOD: sgd 53 | #SOLVER: 54 | # BASE_LR: 0.03 55 | # LR_POLICY: steps_with_relative_lrs 56 | # LRS: [1, 0.1, 0.01, 0.001, 0.0001, 0.00001] 57 | # STEPS: [0, 14, 18] 58 | # MAX_EPOCH: 22 59 | # MOMENTUM: 0.9 60 | # WEIGHT_DECAY: 1e-6 61 | # WARMUP_EPOCHS: 0.19 62 | # WARMUP_START_LR: 0.0001 63 | # OPTIMIZING_METHOD: sgd 64 | MODEL: 65 | NUM_CLASSES: 174 66 | ARCH: slowfast 67 | LOSS_FUNC: cross_entropy 68 | DROPOUT_RATE: 0.5 69 | TEST: 70 | ENABLE: True 71 | DATASET: ssv2 72 | BATCH_SIZE: 16 73 | NUM_ENSEMBLE_VIEWS: 1 74 | NUM_SPATIAL_CROPS: 1 75 | DATA_LOADER: 76 | NUM_WORKERS: 4 77 | PIN_MEMORY: True 78 | NUM_GPUS: 8 79 | NUM_SHARDS: 1 80 | RNG_SEED: 0 81 | OUTPUT_DIR: . 
82 | #LOG_MODEL_INFO: False 83 | LOG_MODEL_INFO: True 84 | -------------------------------------------------------------------------------- /configs/SSv2/TimeSformer_divST_16_448.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: ssv2 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: " /path/to/ssv2/annotations/" 10 | PATH_PREFIX: "/path/to/ssv2/frames/" 11 | NUM_FRAMES: 16 12 | TRAIN_JITTER_SCALES: [448, 512] 13 | TRAIN_CROP_SIZE: 448 14 | TEST_CROP_SIZE: 448 15 | INPUT_CHANNEL_NUM: [3] 16 | INV_UNIFORM_SAMPLE: True 17 | RANDOM_FLIP: False 18 | REVERSE_INPUT_CHANNEL: True 19 | TIMESFORMER: 20 | ATTENTION_TYPE: 'divided_space_time' 21 | SOLVER: 22 | BASE_LR: 0.005 23 | LR_POLICY: steps_with_relative_lrs 24 | STEPS: [0, 11, 14] 25 | LRS: [1, 0.1, 0.01] 26 | MAX_EPOCH: 15 27 | MOMENTUM: 0.9 28 | WEIGHT_DECAY: 1e-4 29 | OPTIMIZING_METHOD: sgd 30 | MODEL: 31 | MODEL_NAME: vit_base_patch16_224 32 | NUM_CLASSES: 174 33 | ARCH: vit 34 | LOSS_FUNC: cross_entropy 35 | DROPOUT_RATE: 0.5 36 | TEST: 37 | ENABLE: True 38 | DATASET: ssv2 39 | BATCH_SIZE: 8 40 | NUM_ENSEMBLE_VIEWS: 1 41 | NUM_SPATIAL_CROPS: 3 42 | DATA_LOADER: 43 | NUM_WORKERS: 4 44 | PIN_MEMORY: True 45 | NUM_GPUS: 8 46 | NUM_SHARDS: 1 47 | RNG_SEED: 0 48 | OUTPUT_DIR: . 49 | -------------------------------------------------------------------------------- /configs/SSv2/TimeSformer_divST_64_224.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: ssv2 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: " /path/to/ssv2/annotations/" 10 | PATH_PREFIX: "/path/to/ssv2/frames/" 11 | NUM_FRAMES: 64 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 224 15 | INPUT_CHANNEL_NUM: [3] 16 | INV_UNIFORM_SAMPLE: True 17 | RANDOM_FLIP: False 18 | REVERSE_INPUT_CHANNEL: True 19 | TIMESFORMER: 20 | ATTENTION_TYPE: 'divided_space_time' 21 | SOLVER: 22 | BASE_LR: 0.005 23 | LR_POLICY: steps_with_relative_lrs 24 | STEPS: [0, 11, 14] 25 | LRS: [1, 0.1, 0.01] 26 | MAX_EPOCH: 15 27 | MOMENTUM: 0.9 28 | WEIGHT_DECAY: 1e-4 29 | OPTIMIZING_METHOD: sgd 30 | MODEL: 31 | MODEL_NAME: vit_base_patch16_224 32 | NUM_CLASSES: 174 33 | ARCH: vit 34 | LOSS_FUNC: cross_entropy 35 | DROPOUT_RATE: 0.5 36 | TEST: 37 | ENABLE: True 38 | DATASET: ssv2 39 | BATCH_SIZE: 8 40 | NUM_ENSEMBLE_VIEWS: 1 41 | NUM_SPATIAL_CROPS: 3 42 | DATA_LOADER: 43 | NUM_WORKERS: 4 44 | PIN_MEMORY: True 45 | NUM_GPUS: 8 46 | NUM_SHARDS: 1 47 | RNG_SEED: 0 48 | OUTPUT_DIR: . 
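Unlike the Kinetics configs, the SSv2 configs read pre-extracted frames, so two paths must be set: `DATA.PATH_TO_DATA_DIR` (the folder holding the annotation JSONs and the downloaded frame lists) and `DATA.PATH_PREFIX` (the folder holding the frames extracted at 30 FPS), as described in `timesformer/datasets/DATASET.md` later in this listing. A launch sketch with placeholder paths:

```bash
# SSv2 launch sketch; both paths are placeholders.
# PATH_TO_DATA_DIR: annotation JSONs + frame lists, PATH_PREFIX: extracted frames.
python tools/run_net.py \
  --cfg configs/SSv2/TimeSformer_divST_64_224.yaml \
  DATA.PATH_TO_DATA_DIR /path/to/ssv2/annotations \
  DATA.PATH_PREFIX /path/to/ssv2/frames
```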
49 | -------------------------------------------------------------------------------- /configs/SSv2/TimeSformer_divST_8_224.yaml: -------------------------------------------------------------------------------- 1 | TRAIN: 2 | ENABLE: True 3 | DATASET: ssv2 4 | BATCH_SIZE: 8 5 | EVAL_PERIOD: 5 6 | CHECKPOINT_PERIOD: 5 7 | AUTO_RESUME: True 8 | DATA: 9 | PATH_TO_DATA_DIR: " /path/to/ssv2/annotations/" 10 | PATH_PREFIX: "/path/to/ssv2/frames/" 11 | NUM_FRAMES: 8 12 | TRAIN_JITTER_SCALES: [256, 320] 13 | TRAIN_CROP_SIZE: 224 14 | TEST_CROP_SIZE: 224 15 | INPUT_CHANNEL_NUM: [3] 16 | INV_UNIFORM_SAMPLE: True 17 | RANDOM_FLIP: False 18 | REVERSE_INPUT_CHANNEL: True 19 | TIMESFORMER: 20 | ATTENTION_TYPE: 'divided_space_time' 21 | SOLVER: 22 | BASE_LR: 0.005 23 | LR_POLICY: steps_with_relative_lrs 24 | STEPS: [0, 11, 14] 25 | LRS: [1, 0.1, 0.01] 26 | MAX_EPOCH: 15 27 | MOMENTUM: 0.9 28 | WEIGHT_DECAY: 1e-4 29 | OPTIMIZING_METHOD: sgd 30 | MODEL: 31 | MODEL_NAME: vit_base_patch16_224 32 | NUM_CLASSES: 174 33 | ARCH: vit 34 | LOSS_FUNC: cross_entropy 35 | DROPOUT_RATE: 0.5 36 | TEST: 37 | ENABLE: True 38 | DATASET: ssv2 39 | BATCH_SIZE: 8 40 | NUM_ENSEMBLE_VIEWS: 1 41 | NUM_SPATIAL_CROPS: 3 42 | DATA_LOADER: 43 | NUM_WORKERS: 4 44 | PIN_MEMORY: True 45 | NUM_GPUS: 8 46 | NUM_SHARDS: 1 47 | RNG_SEED: 0 48 | OUTPUT_DIR: . 49 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: timesformer 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - python>3.7 8 | - jupyterlab 9 | - pandas>=1.2 10 | - numpy>1.19 11 | - pytorch>=1.6 12 | - torchvision>=0.7 13 | - scikit-learn>=0.22 14 | - opencv>=4.2 15 | - pyyaml>=5.1 16 | - yacs>=0.1.6 17 | - einops>=0.3 18 | - tensorboard 19 | - psutil 20 | - tqdm 21 | - matplotlib 22 | - simplejson 23 | - pip 24 | - pip: 25 | - fvcore 26 | - av -------------------------------------------------------------------------------- /example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "08fe0c59", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pathlib import Path\n", 11 | "\n", 12 | "import torch\n", 13 | "from timesformer.models.vit import TimeSformer" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "10239d32", 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/plain": [ 25 | "True" 26 | ] 27 | }, 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "output_type": "execute_result" 31 | } 32 | ], 33 | "source": [ 34 | "model_file = Path.home()/'TimeSformer/models/TimeSformer_divST_8x32_224_K600.pyth'\n", 35 | "model_file.exists()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "id": "652fb03e", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "model = TimeSformer(img_size=224, num_classes=600, num_frames=8, attention_type='divided_space_time', pretrained_model=str(model_file))\n", 46 | "\n", 47 | "dummy_video = torch.randn(2, 3, 8, 224, 224) # (batch x channels x frames x height x width)\n", 48 | "\n", 49 | "pred = model(dummy_video,) # (2, 600)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 6, 55 | "id": "83de13c5-791c-4db7-aba4-6d29ce88584e", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "assert pred.shape == (2,600)" 60 
| ] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "Python 3", 66 | "language": "python", 67 | "name": "python3" 68 | }, 69 | "language_info": { 70 | "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 3 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython3", 79 | "version": "3.9.4" 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 5 84 | } 85 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=4 4 | known_standard_library=numpy,setuptools 5 | known_myself=timesformer 6 | known_third_party=fvcore,av,torch,pycocotools,yacs,termcolor,scipy,simplejson,matplotlib,torchvision,yaml,tqdm,psutil,opencv-python,pandas,tensorboard,moviepy,sklearn,cv2 7 | no_lines_before=STDLIB,THIRDPARTY 8 | sections=FUTURE,STDLIB,THIRDPARTY,myself,FIRSTPARTY,LOCALFOLDER 9 | default_section=FIRSTPARTY 10 | 11 | [mypy] 12 | python_version=3.6 13 | ignore_missing_imports = True 14 | warn_unused_configs = True 15 | disallow_untyped_defs = True 16 | check_untyped_defs = True 17 | warn_unused_ignores = True 18 | warn_redundant_casts = True 19 | show_column_numbers = True 20 | follow_imports = silent 21 | allow_redefinition = True 22 | ; Require all functions to be annotated 23 | disallow_incomplete_defs = True 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from setuptools import find_packages, setup 4 | 5 | setup( 6 | name="timesformer", 7 | version="1.0", 8 | author="FBAI", 9 | url="unknown", 10 | description="TimeSformer", 11 | keywords = [ 12 | 'artificial intelligence', 13 | 'attention mechanism', 14 | 'transformers', 15 | 'video classification', 16 | ], 17 | install_requires=[ 18 | 'einops>=0.3', 19 | 'torch>=1.6' 20 | ], 21 | extras_require={"tensorboard_video_visualization": ["moviepy"]}, 22 | packages=find_packages(exclude=("configs", "tests")), 23 | ) 24 | -------------------------------------------------------------------------------- /slurm_scripts/run_multi_node_job.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | # A script with a list of commands for submitting SLURM jobs 3 | 4 | #### Kinetics training 5 | JOB_NAME=TimeSformer_divST_8x32_224 6 | python tools/submit.py --cfg configs/Kinetics/TimeSformer_divST_8x32_224.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition dev --comment "" --name ${JOB_NAME} --use_volta32 7 | 8 | #JOB_NAME=TimeSformer_jointST_8x32_224 9 | #python tools/submit.py --cfg configs/Kinetics/TimeSformer_jointST_8x32_224.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition learnfair --comment "" --name ${JOB_NAME} --use_volta32 10 | 11 | #JOB_NAME=TimeSformer_spaceOnly_8x32_224 12 | #python tools/submit.py --cfg configs/Kinetics/TimeSformer_spaceOnly_8x32_224.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition learnfair --comment "" --name ${JOB_NAME} --use_volta32 13 | 14 | #### Kinetics inference 15 | #JOB_NAME=TimeSformer_divST_8x32_224_TEST_3clips 16 | #python tools/submit.py --cfg configs/Kinetics/TimeSformer_divST_8x32_224_TEST.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition dev --comment "" --name ${JOB_NAME} --use_volta32 17 | 18 | 19 | ##### SSv2 training 20 | #JOB_NAME=TimeSformer_divST_8_224 21 | #python tools/submit.py --cfg configs/SSv2/TimeSformer_divST_8_224.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition learnfair --comment "" --name ${JOB_NAME} --use_volta32 22 | 23 | ##### Sth-Sth_v2 inference 24 | #JOB_NAME=TimeSformer_divST_8_224_TEST_3clips 25 | #python tools/submit.py --cfg configs/SSv2/TimeSformer_divST_8_224_TEST.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition learnfair --comment "" --name ${JOB_NAME} --use_volta32 26 | -------------------------------------------------------------------------------- /slurm_scripts/run_single_node_job.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # A script with a list of commands for submitting SLURM jobs 3 | 4 | #SBATCH --job-name=timesformer 5 | #SBATCH --mail-type=END,FAIL,REQUEUE 6 | #SBATCH --mail-user=name@domain.com 7 | 8 | ## %j is the job id, %u is the user id 9 | #SBATCH --output=/path/to/output/logs/slog-%A-%a.out 10 | 11 | ## filename for job standard error output (stderr) 12 | #SBATCH --error=/path/to/error/logs/slog-%A-%a.err 13 | 14 | #SBATCH --array=1 15 | #SBATCH --partition=partition_of_your_choice 16 | #SBATCH --nodes=1 -C volta32gb 17 | #SBATCH --ntasks-per-node=1 18 | #SBATCH --gpus-per-node=8 19 | #SBATCH --cpus-per-task=80 20 | #SBATCH --mem=480GB 21 | #SBATCH --signal=USR1@600 22 | #SBATCH --time=72:00:00 23 | #SBATCH --open-mode=append 24 | 25 | module purge 26 | module load cuda/10.0 27 | module load NCCL/2.4.7-1-cuda.10.0 28 | module load cudnn/v7.4-cuda.10.0 29 | source activate timesformer 30 | 31 | WORKINGDIR=/path/to/TimeSformer 32 | CURPYTHON=/path/to/python 33 | 34 | srun --label ${CURPYTHON} ${WORKINGDIR}/tools/run_net.py --cfg ${WORKINGDIR}/configs/Kinetics/TimeSformer_divST_8x32_224.yaml NUM_GPUS 8 TRAIN.BATCH_SIZE 8 35 | 36 | -------------------------------------------------------------------------------- /timesformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | 3 | from timesformer.utils.env import setup_environment 4 | 5 | setup_environment() 6 | -------------------------------------------------------------------------------- /timesformer/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /timesformer/datasets/DATASET.md: -------------------------------------------------------------------------------- 1 | # Dataset Preparation 2 | 3 | ## Kinetics 4 | 5 | The Kinetics Dataset could be downloaded from the following [link](https://github.com/cvdfoundation/kinetics-dataset): 6 | 7 | After all the videos were downloaded, resize the video to the short edge size of 256, then prepare the csv files for training, validation, and testing set as `train.csv`, `val.csv`, `test.csv`. The format of the csv file is: 8 | 9 | ``` 10 | path_to_video_1 label_1 11 | path_to_video_2 label_2 12 | path_to_video_3 label_3 13 | ... 14 | path_to_video_N label_N 15 | ``` 16 | 17 | ## Something-Something V2 18 | 1. Please download the dataset and annotations from [dataset provider](https://20bn.com/datasets/something-something). 19 | 20 | 2. Download the *frame list* from the following links: ([train](https://dl.fbaipublicfiles.com/pyslowfast/dataset/ssv2/frame_lists/train.csv), [val](https://dl.fbaipublicfiles.com/pyslowfast/dataset/ssv2/frame_lists/val.csv)). 21 | 22 | 3. Extract the frames at 30 FPS. (We used ffmpeg-4.1.3 with command 23 | `ffmpeg -i "${video}" -r 30 -q:v 1 "${out_name}"` 24 | in experiments.) Please put the frames in a structure consistent with the frame lists. 25 | 26 | Please put all annotation json files and the frame lists in the same folder, and set `DATA.PATH_TO_DATA_DIR` to the path. Set `DATA.PATH_PREFIX` to be the path to the folder containing extracted frames. 27 | -------------------------------------------------------------------------------- /timesformer/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from .build import DATASET_REGISTRY, build_dataset # noqa 4 | from .kinetics import Kinetics # noqa 5 | from .ssv2 import Ssv2 # noqa 6 | -------------------------------------------------------------------------------- /timesformer/datasets/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | from fvcore.common.registry import Registry 4 | 5 | DATASET_REGISTRY = Registry("DATASET") 6 | DATASET_REGISTRY.__doc__ = """ 7 | Registry for dataset. 8 | 9 | The registered object will be called with `obj(cfg, split)`. 10 | The call should return a `torch.utils.data.Dataset` object. 11 | """ 12 | 13 | 14 | def build_dataset(dataset_name, cfg, split): 15 | """ 16 | Build a dataset, defined by `dataset_name`. 17 | Args: 18 | dataset_name (str): the name of the dataset to be constructed. 19 | cfg (CfgNode): configs. Details can be found in 20 | slowfast/config/defaults.py 21 | split (str): the split of the data loader. Options include `train`, 22 | `val`, and `test`. 23 | Returns: 24 | Dataset: a constructed dataset specified by dataset_name. 
25 | """ 26 | # Capitalize the the first letter of the dataset_name since the dataset_name 27 | # in configs may be in lowercase but the name of dataset class should always 28 | # start with an uppercase letter. 29 | name = dataset_name.capitalize() 30 | return DATASET_REGISTRY.get(name)(cfg, split) 31 | -------------------------------------------------------------------------------- /timesformer/datasets/loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Data loader.""" 4 | 5 | import itertools 6 | import numpy as np 7 | import torch 8 | from torch.utils.data._utils.collate import default_collate 9 | from torch.utils.data.distributed import DistributedSampler 10 | from torch.utils.data.sampler import RandomSampler 11 | 12 | from timesformer.datasets.multigrid_helper import ShortCycleBatchSampler 13 | 14 | from . import utils as utils 15 | from .build import build_dataset 16 | 17 | 18 | def detection_collate(batch): 19 | """ 20 | Collate function for detection task. Concatanate bboxes, labels and 21 | metadata from different samples in the first dimension instead of 22 | stacking them to have a batch-size dimension. 23 | Args: 24 | batch (tuple or list): data batch to collate. 25 | Returns: 26 | (tuple): collated detection data batch. 27 | """ 28 | inputs, labels, video_idx, extra_data = zip(*batch) 29 | inputs, video_idx = default_collate(inputs), default_collate(video_idx) 30 | labels = torch.tensor(np.concatenate(labels, axis=0)).float() 31 | 32 | collated_extra_data = {} 33 | for key in extra_data[0].keys(): 34 | data = [d[key] for d in extra_data] 35 | if key == "boxes" or key == "ori_boxes": 36 | # Append idx info to the bboxes before concatenating them. 37 | bboxes = [ 38 | np.concatenate( 39 | [np.full((data[i].shape[0], 1), float(i)), data[i]], axis=1 40 | ) 41 | for i in range(len(data)) 42 | ] 43 | bboxes = np.concatenate(bboxes, axis=0) 44 | collated_extra_data[key] = torch.tensor(bboxes).float() 45 | elif key == "metadata": 46 | collated_extra_data[key] = torch.tensor( 47 | list(itertools.chain(*data)) 48 | ).view(-1, 2) 49 | else: 50 | collated_extra_data[key] = default_collate(data) 51 | 52 | return inputs, labels, video_idx, collated_extra_data 53 | 54 | 55 | def construct_loader(cfg, split, is_precise_bn=False): 56 | """ 57 | Constructs the data loader for the given dataset. 58 | Args: 59 | cfg (CfgNode): configs. Details can be found in 60 | slowfast/config/defaults.py 61 | split (str): the split of the data loader. Options include `train`, 62 | `val`, and `test`. 
63 | """ 64 | assert split in ["train", "val", "test"] 65 | if split in ["train"]: 66 | dataset_name = cfg.TRAIN.DATASET 67 | batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 68 | shuffle = True 69 | drop_last = True 70 | elif split in ["val"]: 71 | dataset_name = cfg.TRAIN.DATASET 72 | batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 73 | shuffle = False 74 | drop_last = False 75 | elif split in ["test"]: 76 | dataset_name = cfg.TEST.DATASET 77 | batch_size = int(cfg.TEST.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 78 | shuffle = False 79 | drop_last = False 80 | 81 | # Construct the dataset 82 | dataset = build_dataset(dataset_name, cfg, split) 83 | 84 | if cfg.MULTIGRID.SHORT_CYCLE and split in ["train"] and not is_precise_bn: 85 | # Create a sampler for multi-process training 86 | sampler = utils.create_sampler(dataset, shuffle, cfg) 87 | batch_sampler = ShortCycleBatchSampler( 88 | sampler, batch_size=batch_size, drop_last=drop_last, cfg=cfg 89 | ) 90 | # Create a loader 91 | loader = torch.utils.data.DataLoader( 92 | dataset, 93 | batch_sampler=batch_sampler, 94 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 95 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 96 | worker_init_fn=utils.loader_worker_init_fn(dataset), 97 | ) 98 | else: 99 | # Create a sampler for multi-process training 100 | sampler = utils.create_sampler(dataset, shuffle, cfg) 101 | # Create a loader 102 | loader = torch.utils.data.DataLoader( 103 | dataset, 104 | batch_size=batch_size, 105 | shuffle=(False if sampler else shuffle), 106 | sampler=sampler, 107 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 108 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 109 | drop_last=drop_last, 110 | collate_fn=detection_collate if cfg.DETECTION.ENABLE else None, 111 | worker_init_fn=utils.loader_worker_init_fn(dataset), 112 | ) 113 | return loader 114 | 115 | 116 | def shuffle_dataset(loader, cur_epoch): 117 | """ " 118 | Shuffles the data. 119 | Args: 120 | loader (loader): data loader to perform shuffle. 121 | cur_epoch (int): number of the current epoch. 122 | """ 123 | sampler = ( 124 | loader.batch_sampler.sampler 125 | if isinstance(loader.batch_sampler, ShortCycleBatchSampler) 126 | else loader.sampler 127 | ) 128 | assert isinstance( 129 | sampler, (RandomSampler, DistributedSampler) 130 | ), "Sampler type '{}' not supported".format(type(sampler)) 131 | # RandomSampler handles shuffling automatically 132 | if isinstance(sampler, DistributedSampler): 133 | # DistributedSampler shuffles data based on epoch 134 | sampler.set_epoch(cur_epoch) 135 | -------------------------------------------------------------------------------- /timesformer/datasets/multigrid_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Helper functions for multigrid training.""" 4 | 5 | import numpy as np 6 | from torch._six import int_classes as _int_classes 7 | from torch.utils.data.sampler import Sampler 8 | 9 | 10 | class ShortCycleBatchSampler(Sampler): 11 | """ 12 | Extend Sampler to support "short cycle" sampling. 13 | See paper "A Multigrid Method for Efficiently Training Video Models", 14 | Wu et al., 2019 (https://arxiv.org/abs/1912.00998) for details. 
15 | """ 16 | 17 | def __init__(self, sampler, batch_size, drop_last, cfg): 18 | if not isinstance(sampler, Sampler): 19 | raise ValueError( 20 | "sampler should be an instance of " 21 | "torch.utils.data.Sampler, but got sampler={}".format(sampler) 22 | ) 23 | if ( 24 | not isinstance(batch_size, _int_classes) 25 | or isinstance(batch_size, bool) 26 | or batch_size <= 0 27 | ): 28 | raise ValueError( 29 | "batch_size should be a positive integer value, " 30 | "but got batch_size={}".format(batch_size) 31 | ) 32 | if not isinstance(drop_last, bool): 33 | raise ValueError( 34 | "drop_last should be a boolean value, but got " 35 | "drop_last={}".format(drop_last) 36 | ) 37 | self.sampler = sampler 38 | self.drop_last = drop_last 39 | 40 | bs_factor = [ 41 | int( 42 | round( 43 | ( 44 | float(cfg.DATA.TRAIN_CROP_SIZE) 45 | / (s * cfg.MULTIGRID.DEFAULT_S) 46 | ) 47 | ** 2 48 | ) 49 | ) 50 | for s in cfg.MULTIGRID.SHORT_CYCLE_FACTORS 51 | ] 52 | 53 | self.batch_sizes = [ 54 | batch_size * bs_factor[0], 55 | batch_size * bs_factor[1], 56 | batch_size, 57 | ] 58 | 59 | def __iter__(self): 60 | counter = 0 61 | batch_size = self.batch_sizes[0] 62 | batch = [] 63 | for idx in self.sampler: 64 | batch.append((idx, counter % 3)) 65 | if len(batch) == batch_size: 66 | yield batch 67 | counter += 1 68 | batch_size = self.batch_sizes[counter % 3] 69 | batch = [] 70 | if len(batch) > 0 and not self.drop_last: 71 | yield batch 72 | 73 | def __len__(self): 74 | avg_batch_size = sum(self.batch_sizes) / 3.0 75 | if self.drop_last: 76 | return int(np.floor(len(self.sampler) / avg_batch_size)) 77 | else: 78 | return int(np.ceil(len(self.sampler) / avg_batch_size)) 79 | -------------------------------------------------------------------------------- /timesformer/datasets/video_container.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | import av 4 | 5 | 6 | def get_video_container(path_to_vid, multi_thread_decode=False, backend="pyav"): 7 | """ 8 | Given the path to the video, return the pyav video container. 9 | Args: 10 | path_to_vid (str): path to the video. 11 | multi_thread_decode (bool): if True, perform multi-thread decoding. 12 | backend (str): decoder backend, options include `pyav` and 13 | `torchvision`, default is `pyav`. 14 | Returns: 15 | container (container): video container. 16 | """ 17 | if backend == "torchvision": 18 | with open(path_to_vid, "rb") as fp: 19 | container = fp.read() 20 | return container 21 | elif backend == "pyav": 22 | #try: 23 | container = av.open(path_to_vid) 24 | if multi_thread_decode: 25 | # Enable multiple threads for decoding. 26 | container.streams.video[0].thread_type = "AUTO" 27 | #except: 28 | # container = None 29 | return container 30 | else: 31 | raise NotImplementedError("Unknown backend {}".format(backend)) 32 | -------------------------------------------------------------------------------- /timesformer/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | 3 | from .build import MODEL_REGISTRY, build_model # noqa 4 | from .custom_video_model_builder import * # noqa 5 | from .video_model_builder import ResNet, SlowFast # noqa 6 | -------------------------------------------------------------------------------- /timesformer/models/batchnorm_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """BatchNorm (BN) utility functions and custom batch-size BN implementations""" 4 | 5 | from functools import partial 6 | import torch 7 | import torch.distributed as dist 8 | import torch.nn as nn 9 | from torch.autograd.function import Function 10 | 11 | import timesformer.utils.distributed as du 12 | 13 | 14 | def get_norm(cfg): 15 | """ 16 | Args: 17 | cfg (CfgNode): model building configs, details are in the comments of 18 | the config file. 19 | Returns: 20 | nn.Module: the normalization layer. 21 | """ 22 | if cfg.BN.NORM_TYPE == "batchnorm": 23 | return nn.BatchNorm3d 24 | elif cfg.BN.NORM_TYPE == "sub_batchnorm": 25 | return partial(SubBatchNorm3d, num_splits=cfg.BN.NUM_SPLITS) 26 | elif cfg.BN.NORM_TYPE == "sync_batchnorm": 27 | return partial( 28 | NaiveSyncBatchNorm3d, num_sync_devices=cfg.BN.NUM_SYNC_DEVICES 29 | ) 30 | else: 31 | raise NotImplementedError( 32 | "Norm type {} is not supported".format(cfg.BN.NORM_TYPE) 33 | ) 34 | 35 | 36 | class SubBatchNorm3d(nn.Module): 37 | """ 38 | The standard BN layer computes stats across all examples in a GPU. In some 39 | cases it is desirable to compute stats across only a subset of examples 40 | (e.g., in multigrid training https://arxiv.org/abs/1912.00998). 41 | SubBatchNorm3d splits the batch dimension into N splits, and run BN on 42 | each of them separately (so that the stats are computed on each subset of 43 | examples (1/N of batch) independently. During evaluation, it aggregates 44 | the stats from all splits into one BN. 45 | """ 46 | 47 | def __init__(self, num_splits, **args): 48 | """ 49 | Args: 50 | num_splits (int): number of splits. 51 | args (list): other arguments. 52 | """ 53 | super(SubBatchNorm3d, self).__init__() 54 | self.num_splits = num_splits 55 | num_features = args["num_features"] 56 | # Keep only one set of weight and bias. 57 | if args.get("affine", True): 58 | self.affine = True 59 | args["affine"] = False 60 | self.weight = torch.nn.Parameter(torch.ones(num_features)) 61 | self.bias = torch.nn.Parameter(torch.zeros(num_features)) 62 | else: 63 | self.affine = False 64 | self.bn = nn.BatchNorm3d(**args) 65 | args["num_features"] = num_features * num_splits 66 | self.split_bn = nn.BatchNorm3d(**args) 67 | 68 | def _get_aggregated_mean_std(self, means, stds, n): 69 | """ 70 | Calculate the aggregated mean and stds. 71 | Args: 72 | means (tensor): mean values. 73 | stds (tensor): standard deviations. 74 | n (int): number of sets of means and stds. 75 | """ 76 | mean = means.view(n, -1).sum(0) / n 77 | std = ( 78 | stds.view(n, -1).sum(0) / n 79 | + ((means.view(n, -1) - mean) ** 2).view(n, -1).sum(0) / n 80 | ) 81 | return mean.detach(), std.detach() 82 | 83 | def aggregate_stats(self): 84 | """ 85 | Synchronize running_mean, and running_var. Call this before eval. 
86 | """ 87 | if self.split_bn.track_running_stats: 88 | ( 89 | self.bn.running_mean.data, 90 | self.bn.running_var.data, 91 | ) = self._get_aggregated_mean_std( 92 | self.split_bn.running_mean, 93 | self.split_bn.running_var, 94 | self.num_splits, 95 | ) 96 | 97 | def forward(self, x): 98 | if self.training: 99 | n, c, t, h, w = x.shape 100 | x = x.view(n // self.num_splits, c * self.num_splits, t, h, w) 101 | x = self.split_bn(x) 102 | x = x.view(n, c, t, h, w) 103 | else: 104 | x = self.bn(x) 105 | if self.affine: 106 | x = x * self.weight.view((-1, 1, 1, 1)) 107 | x = x + self.bias.view((-1, 1, 1, 1)) 108 | return x 109 | 110 | 111 | class GroupGather(Function): 112 | """ 113 | GroupGather performs all gather on each of the local process/ GPU groups. 114 | """ 115 | 116 | @staticmethod 117 | def forward(ctx, input, num_sync_devices, num_groups): 118 | """ 119 | Perform forwarding, gathering the stats across different process/ GPU 120 | group. 121 | """ 122 | ctx.num_sync_devices = num_sync_devices 123 | ctx.num_groups = num_groups 124 | 125 | input_list = [ 126 | torch.zeros_like(input) for k in range(du.get_local_size()) 127 | ] 128 | dist.all_gather( 129 | input_list, input, async_op=False, group=du._LOCAL_PROCESS_GROUP 130 | ) 131 | 132 | inputs = torch.stack(input_list, dim=0) 133 | if num_groups > 1: 134 | rank = du.get_local_rank() 135 | group_idx = rank // num_sync_devices 136 | inputs = inputs[ 137 | group_idx 138 | * num_sync_devices : (group_idx + 1) 139 | * num_sync_devices 140 | ] 141 | inputs = torch.sum(inputs, dim=0) 142 | return inputs 143 | 144 | @staticmethod 145 | def backward(ctx, grad_output): 146 | """ 147 | Perform backwarding, gathering the gradients across different process/ GPU 148 | group. 149 | """ 150 | grad_output_list = [ 151 | torch.zeros_like(grad_output) for k in range(du.get_local_size()) 152 | ] 153 | dist.all_gather( 154 | grad_output_list, 155 | grad_output, 156 | async_op=False, 157 | group=du._LOCAL_PROCESS_GROUP, 158 | ) 159 | 160 | grads = torch.stack(grad_output_list, dim=0) 161 | if ctx.num_groups > 1: 162 | rank = du.get_local_rank() 163 | group_idx = rank // ctx.num_sync_devices 164 | grads = grads[ 165 | group_idx 166 | * ctx.num_sync_devices : (group_idx + 1) 167 | * ctx.num_sync_devices 168 | ] 169 | grads = torch.sum(grads, dim=0) 170 | return grads, None, None 171 | 172 | 173 | class NaiveSyncBatchNorm3d(nn.BatchNorm3d): 174 | def __init__(self, num_sync_devices, **args): 175 | """ 176 | Naive version of Synchronized 3D BatchNorm. 177 | Args: 178 | num_sync_devices (int): number of device to sync. 179 | args (list): other arguments. 
180 | """ 181 | self.num_sync_devices = num_sync_devices 182 | if self.num_sync_devices > 0: 183 | assert du.get_local_size() % self.num_sync_devices == 0, ( 184 | du.get_local_size(), 185 | self.num_sync_devices, 186 | ) 187 | self.num_groups = du.get_local_size() // self.num_sync_devices 188 | else: 189 | self.num_sync_devices = du.get_local_size() 190 | self.num_groups = 1 191 | super(NaiveSyncBatchNorm3d, self).__init__(**args) 192 | 193 | def forward(self, input): 194 | if du.get_local_size() == 1 or not self.training: 195 | return super().forward(input) 196 | 197 | assert input.shape[0] > 0, "SyncBatchNorm does not support empty inputs" 198 | C = input.shape[1] 199 | mean = torch.mean(input, dim=[0, 2, 3, 4]) 200 | meansqr = torch.mean(input * input, dim=[0, 2, 3, 4]) 201 | 202 | vec = torch.cat([mean, meansqr], dim=0) 203 | vec = GroupGather.apply(vec, self.num_sync_devices, self.num_groups) * ( 204 | 1.0 / self.num_sync_devices 205 | ) 206 | 207 | mean, meansqr = torch.split(vec, C) 208 | var = meansqr - mean * mean 209 | self.running_mean += self.momentum * (mean.detach() - self.running_mean) 210 | self.running_var += self.momentum * (var.detach() - self.running_var) 211 | 212 | invstd = torch.rsqrt(var + self.eps) 213 | scale = self.weight * invstd 214 | bias = self.bias - mean * scale 215 | scale = scale.reshape(1, -1, 1, 1, 1) 216 | bias = bias.reshape(1, -1, 1, 1, 1) 217 | return input * scale + bias 218 | -------------------------------------------------------------------------------- /timesformer/models/build.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Model construction functions.""" 4 | 5 | import torch 6 | from fvcore.common.registry import Registry 7 | 8 | MODEL_REGISTRY = Registry("MODEL") 9 | MODEL_REGISTRY.__doc__ = """ 10 | Registry for video model. 11 | 12 | The registered object will be called with `obj(cfg)`. 13 | The call should return a `torch.nn.Module` object. 14 | """ 15 | 16 | 17 | def build_model(cfg, gpu_id=None): 18 | """ 19 | Builds the video model. 20 | Args: 21 | cfg (configs): configs that contains the hyper-parameters to build the 22 | backbone. Details can be seen in slowfast/config/defaults.py. 23 | gpu_id (Optional[int]): specify the gpu index to build model. 24 | """ 25 | if torch.cuda.is_available(): 26 | assert ( 27 | cfg.NUM_GPUS <= torch.cuda.device_count() 28 | ), "Cannot use more GPU devices than available" 29 | else: 30 | assert ( 31 | cfg.NUM_GPUS == 0 32 | ), "Cuda is not available. Please set `NUM_GPUS: 0 for running on CPUs." 
33 | 34 | # Construct the model 35 | name = cfg.MODEL.MODEL_NAME 36 | model = MODEL_REGISTRY.get(name)(cfg) 37 | 38 | if cfg.NUM_GPUS: 39 | if gpu_id is None: 40 | # Determine the GPU used by the current process 41 | cur_device = torch.cuda.current_device() 42 | else: 43 | cur_device = gpu_id 44 | # Transfer the model to the current GPU device 45 | model = model.cuda(device=cur_device) 46 | 47 | 48 | # Use multi-process data parallel model in the multi-gpu setting 49 | if cfg.NUM_GPUS > 1: 50 | # Make model replica operate on the current device 51 | model = torch.nn.parallel.DistributedDataParallel( 52 | module=model, device_ids=[cur_device], output_device=cur_device 53 | ) 54 | return model 55 | -------------------------------------------------------------------------------- /timesformer/models/conv2d_same.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Ross Wightman 2 | # Conv2d w/ Same Padding 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from typing import Tuple, Optional 8 | 9 | import math 10 | from typing import List, Tuple 11 | #from .padding import pad_same, get_padding_value 12 | 13 | # Dynamically pad input x with 'SAME' padding for conv with specified args 14 | def pad_same(x, k: List[int], s: List[int], d: List[int] = (1, 1), value: float = 0): 15 | ih, iw = x.size()[-2:] 16 | pad_h, pad_w = get_same_padding(ih, k[0], s[0], d[0]), get_same_padding(iw, k[1], s[1], d[1]) 17 | if pad_h > 0 or pad_w > 0: 18 | x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2], value=value) 19 | return x 20 | 21 | # Calculate asymmetric TensorFlow-like 'SAME' padding for a convolution 22 | def get_same_padding(x: int, k: int, s: int, d: int): 23 | return max((math.ceil(x / s) - 1) * s + (k - 1) * d + 1 - x, 0) 24 | 25 | def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]: 26 | dynamic = False 27 | if isinstance(padding, str): 28 | # for any string padding, the padding will be calculated for you, one of three ways 29 | padding = padding.lower() 30 | if padding == 'same': 31 | # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact 32 | if is_static_pad(kernel_size, **kwargs): 33 | # static case, no extra overhead 34 | padding = get_padding(kernel_size, **kwargs) 35 | else: 36 | # dynamic 'SAME' padding, has runtime/GPU memory overhead 37 | padding = 0 38 | dynamic = True 39 | elif padding == 'valid': 40 | # 'VALID' padding, same as padding=0 41 | padding = 0 42 | else: 43 | # Default to PyTorch style 'same'-ish symmetric padding 44 | padding = get_padding(kernel_size, **kwargs) 45 | return padding, dynamic 46 | 47 | def conv2d_same( 48 | x, weight: torch.Tensor, bias: Optional[torch.Tensor] = None, stride: Tuple[int, int] = (1, 1), 49 | padding: Tuple[int, int] = (0, 0), dilation: Tuple[int, int] = (1, 1), groups: int = 1): 50 | x = pad_same(x, weight.shape[-2:], stride, dilation) 51 | return F.conv2d(x, weight, bias, stride, (0, 0), dilation, groups) 52 | 53 | 54 | class Conv2dSame(nn.Conv2d): 55 | """ Tensorflow like 'SAME' convolution wrapper for 2D convolutions 56 | """ 57 | 58 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, 59 | padding=0, dilation=1, groups=1, bias=True): 60 | super(Conv2dSame, self).__init__( 61 | in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) 62 | 63 | def forward(self, x): 64 | return conv2d_same(x, self.weight, self.bias, self.stride, self.padding, 
self.dilation, self.groups) 65 | 66 | 67 | def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs): 68 | padding = kwargs.pop('padding', '') 69 | kwargs.setdefault('bias', False) 70 | padding, is_dynamic = get_padding_value(padding, kernel_size, **kwargs) 71 | if is_dynamic: 72 | return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs) 73 | else: 74 | return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs) 75 | -------------------------------------------------------------------------------- /timesformer/models/custom_video_model_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | 4 | """A More Flexible Video models.""" 5 | -------------------------------------------------------------------------------- /timesformer/models/head_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """ResNe(X)t Head helper.""" 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | class ResNetBasicHead(nn.Module): 9 | """ 10 | ResNe(X)t 3D head. 11 | This layer performs a fully-connected projection during training, when the 12 | input size is 1x1x1. It performs a convolutional projection during testing 13 | when the input size is larger than 1x1x1. If the inputs are from multiple 14 | different pathways, the inputs will be concatenated after pooling. 15 | """ 16 | 17 | def __init__( 18 | self, 19 | dim_in, 20 | num_classes, 21 | pool_size, 22 | dropout_rate=0.0, 23 | act_func="softmax", 24 | ): 25 | """ 26 | The `__init__` method of any subclass should also contain these 27 | arguments. 28 | ResNetBasicHead takes p pathways as input where p in [1, infty]. 29 | 30 | Args: 31 | dim_in (list): the list of channel dimensions of the p inputs to the 32 | ResNetHead. 33 | num_classes (int): the channel dimensions of the p outputs to the 34 | ResNetHead. 35 | pool_size (list): the list of kernel sizes of p spatial temporal 36 | poolings, temporal pool kernel size, spatial pool kernel size, 37 | spatial pool kernel size in order. 38 | dropout_rate (float): dropout rate. If equal to 0.0, perform no 39 | dropout. 40 | act_func (string): activation function to use. 'softmax': applies 41 | softmax on the output. 'sigmoid': applies sigmoid on the output. 42 | """ 43 | super(ResNetBasicHead, self).__init__() 44 | assert ( 45 | len({len(pool_size), len(dim_in)}) == 1 46 | ), "pathway dimensions are not consistent." 47 | self.num_pathways = len(pool_size) 48 | 49 | for pathway in range(self.num_pathways): 50 | if pool_size[pathway] is None: 51 | avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) 52 | else: 53 | avg_pool = nn.AvgPool3d(pool_size[pathway], stride=1) 54 | self.add_module("pathway{}_avgpool".format(pathway), avg_pool) 55 | 56 | if dropout_rate > 0.0: 57 | self.dropout = nn.Dropout(dropout_rate) 58 | # Perform FC in a fully convolutional manner. The FC layer will be 59 | # initialized with a different std comparing to convolutional layers. 60 | self.projection = nn.Linear(sum(dim_in), num_classes, bias=True) 61 | 62 | # Softmax for evaluation and testing. 
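        # Note: self.act is only applied at inference time (see forward(), which
        # calls it under `if not self.training`). Softmax over dim=4 targets the
        # class dimension because forward() permutes the features from
        # (N, C, T, H, W) to (N, T, H, W, C) before the projection.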
63 | if act_func == "softmax": 64 | self.act = nn.Softmax(dim=4) 65 | elif act_func == "sigmoid": 66 | self.act = nn.Sigmoid() 67 | else: 68 | raise NotImplementedError( 69 | "{} is not supported as an activation" 70 | "function.".format(act_func) 71 | ) 72 | 73 | def forward(self, inputs): 74 | assert ( 75 | len(inputs) == self.num_pathways 76 | ), "Input tensor does not contain {} pathway".format(self.num_pathways) 77 | pool_out = [] 78 | for pathway in range(self.num_pathways): 79 | m = getattr(self, "pathway{}_avgpool".format(pathway)) 80 | pool_out.append(m(inputs[pathway])) 81 | x = torch.cat(pool_out, 1) 82 | # (N, C, T, H, W) -> (N, T, H, W, C). 83 | x = x.permute((0, 2, 3, 4, 1)) 84 | # Perform dropout. 85 | if hasattr(self, "dropout"): 86 | x = self.dropout(x) 87 | x = self.projection(x) 88 | 89 | # Performs fully convlutional inference. 90 | if not self.training: 91 | x = self.act(x) 92 | x = x.mean([1, 2, 3]) 93 | 94 | x = x.view(x.shape[0], -1) 95 | return x 96 | 97 | 98 | class X3DHead(nn.Module): 99 | """ 100 | X3D head. 101 | This layer performs a fully-connected projection during training, when the 102 | input size is 1x1x1. It performs a convolutional projection during testing 103 | when the input size is larger than 1x1x1. If the inputs are from multiple 104 | different pathways, the inputs will be concatenated after pooling. 105 | """ 106 | 107 | def __init__( 108 | self, 109 | dim_in, 110 | dim_inner, 111 | dim_out, 112 | num_classes, 113 | pool_size, 114 | dropout_rate=0.0, 115 | act_func="softmax", 116 | inplace_relu=True, 117 | eps=1e-5, 118 | bn_mmt=0.1, 119 | norm_module=nn.BatchNorm3d, 120 | bn_lin5_on=False, 121 | ): 122 | """ 123 | The `__init__` method of any subclass should also contain these 124 | arguments. 125 | X3DHead takes a 5-dim feature tensor (BxCxTxHxW) as input. 126 | 127 | Args: 128 | dim_in (float): the channel dimension C of the input. 129 | num_classes (int): the channel dimensions of the output. 130 | pool_size (float): a single entry list of kernel size for 131 | spatiotemporal pooling for the TxHxW dimensions. 132 | dropout_rate (float): dropout rate. If equal to 0.0, perform no 133 | dropout. 134 | act_func (string): activation function to use. 'softmax': applies 135 | softmax on the output. 'sigmoid': applies sigmoid on the output. 136 | inplace_relu (bool): if True, calculate the relu on the original 137 | input without allocating new memory. 138 | eps (float): epsilon for batch norm. 139 | bn_mmt (float): momentum for batch norm. Noted that BN momentum in 140 | PyTorch = 1 - BN momentum in Caffe2. 141 | norm_module (nn.Module): nn.Module for the normalization layer. The 142 | default is nn.BatchNorm3d. 143 | bn_lin5_on (bool): if True, perform normalization on the features 144 | before the classifier. 
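            dim_inner (int): the channel dimension of the intermediate 1x1x1
                convolution (conv_5) in the head.
            dim_out (int): the channel dimension produced by lin_5 and fed to
                the final linear projection.

        Example (illustrative channel sizes, not taken from a provided config):
            >>> import torch
            >>> head = X3DHead(dim_in=192, dim_inner=432, dim_out=2048,
            ...                num_classes=400, pool_size=None)
            >>> head([torch.rand(2, 192, 4, 7, 7)]).shape
            torch.Size([2, 400])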
145 | """ 146 | super(X3DHead, self).__init__() 147 | self.pool_size = pool_size 148 | self.dropout_rate = dropout_rate 149 | self.num_classes = num_classes 150 | self.act_func = act_func 151 | self.eps = eps 152 | self.bn_mmt = bn_mmt 153 | self.inplace_relu = inplace_relu 154 | self.bn_lin5_on = bn_lin5_on 155 | self._construct_head(dim_in, dim_inner, dim_out, norm_module) 156 | 157 | def _construct_head(self, dim_in, dim_inner, dim_out, norm_module): 158 | 159 | self.conv_5 = nn.Conv3d( 160 | dim_in, 161 | dim_inner, 162 | kernel_size=(1, 1, 1), 163 | stride=(1, 1, 1), 164 | padding=(0, 0, 0), 165 | bias=False, 166 | ) 167 | self.conv_5_bn = norm_module( 168 | num_features=dim_inner, eps=self.eps, momentum=self.bn_mmt 169 | ) 170 | self.conv_5_relu = nn.ReLU(self.inplace_relu) 171 | 172 | if self.pool_size is None: 173 | self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) 174 | else: 175 | self.avg_pool = nn.AvgPool3d(self.pool_size, stride=1) 176 | 177 | self.lin_5 = nn.Conv3d( 178 | dim_inner, 179 | dim_out, 180 | kernel_size=(1, 1, 1), 181 | stride=(1, 1, 1), 182 | padding=(0, 0, 0), 183 | bias=False, 184 | ) 185 | if self.bn_lin5_on: 186 | self.lin_5_bn = norm_module( 187 | num_features=dim_out, eps=self.eps, momentum=self.bn_mmt 188 | ) 189 | self.lin_5_relu = nn.ReLU(self.inplace_relu) 190 | 191 | if self.dropout_rate > 0.0: 192 | self.dropout = nn.Dropout(self.dropout_rate) 193 | # Perform FC in a fully convolutional manner. The FC layer will be 194 | # initialized with a different std comparing to convolutional layers. 195 | self.projection = nn.Linear(dim_out, self.num_classes, bias=True) 196 | 197 | # Softmax for evaluation and testing. 198 | if self.act_func == "softmax": 199 | self.act = nn.Softmax(dim=4) 200 | elif self.act_func == "sigmoid": 201 | self.act = nn.Sigmoid() 202 | else: 203 | raise NotImplementedError( 204 | "{} is not supported as an activation" 205 | "function.".format(self.act_func) 206 | ) 207 | 208 | def forward(self, inputs): 209 | # In its current design the X3D head is only useable for a single 210 | # pathway input. 211 | assert len(inputs) == 1, "Input tensor does not contain 1 pathway" 212 | x = self.conv_5(inputs[0]) 213 | x = self.conv_5_bn(x) 214 | x = self.conv_5_relu(x) 215 | x = self.avg_pool(x) 216 | 217 | x = self.lin_5(x) 218 | if self.bn_lin5_on: 219 | x = self.lin_5_bn(x) 220 | x = self.lin_5_relu(x) 221 | 222 | # (N, C, T, H, W) -> (N, T, H, W, C). 223 | x = x.permute((0, 2, 3, 4, 1)) 224 | # Perform dropout. 225 | if hasattr(self, "dropout"): 226 | x = self.dropout(x) 227 | x = self.projection(x) 228 | 229 | # Performs fully convlutional inference. 
230 | if not self.training: 231 | x = self.act(x) 232 | x = x.mean([1, 2, 3]) 233 | 234 | x = x.view(x.shape[0], -1) 235 | return x 236 | -------------------------------------------------------------------------------- /timesformer/models/linear.py: -------------------------------------------------------------------------------- 1 | """ Linear layer (alternate definition) 2 | """ 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import nn as nn 6 | 7 | class Linear(nn.Linear): 8 | def forward(self, input: torch.Tensor) -> torch.Tensor: 9 | if torch.jit.is_scripting(): 10 | bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None 11 | return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias) 12 | else: 13 | return F.linear(input, self.weight, self.bias) 14 | -------------------------------------------------------------------------------- /timesformer/models/losses.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Loss functions.""" 4 | 5 | import torch.nn as nn 6 | 7 | _LOSSES = { 8 | "cross_entropy": nn.CrossEntropyLoss, 9 | "bce": nn.BCELoss, 10 | "bce_logit": nn.BCEWithLogitsLoss, 11 | } 12 | 13 | 14 | def get_loss_func(loss_name): 15 | """ 16 | Retrieve the loss given the loss name. 17 | Args (int): 18 | loss_name: the name of the loss to use. 19 | """ 20 | if loss_name not in _LOSSES.keys(): 21 | raise NotImplementedError("Loss {} is not supported".format(loss_name)) 22 | return _LOSSES[loss_name] 23 | -------------------------------------------------------------------------------- /timesformer/models/nonlocal_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Non-local helper""" 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | 9 | class Nonlocal(nn.Module): 10 | """ 11 | Builds Non-local Neural Networks as a generic family of building 12 | blocks for capturing long-range dependencies. Non-local Network 13 | computes the response at a position as a weighted sum of the 14 | features at all positions. This building block can be plugged into 15 | many computer vision architectures. 16 | More details in the paper: https://arxiv.org/pdf/1711.07971.pdf 17 | """ 18 | 19 | def __init__( 20 | self, 21 | dim, 22 | dim_inner, 23 | pool_size=None, 24 | instantiation="softmax", 25 | zero_init_final_conv=False, 26 | zero_init_final_norm=True, 27 | norm_eps=1e-5, 28 | norm_momentum=0.1, 29 | norm_module=nn.BatchNorm3d, 30 | ): 31 | """ 32 | Args: 33 | dim (int): number of dimension for the input. 34 | dim_inner (int): number of dimension inside of the Non-local block. 35 | pool_size (list): the kernel size of spatial temporal pooling, 36 | temporal pool kernel size, spatial pool kernel size, spatial 37 | pool kernel size in order. By default pool_size is None, 38 | then there would be no pooling used. 39 | instantiation (string): supports two different instantiation method: 40 | "dot_product": normalizing correlation matrix with L2. 41 | "softmax": normalizing correlation matrix with Softmax. 42 | zero_init_final_conv (bool): If true, zero initializing the final 43 | convolution of the Non-local block. 44 | zero_init_final_norm (bool): 45 | If true, zero initializing the final batch norm of the Non-local 46 | block. 47 | norm_module (nn.Module): nn.Module for the normalization layer. 
The 48 | default is nn.BatchNorm3d. 49 | """ 50 | super(Nonlocal, self).__init__() 51 | self.dim = dim 52 | self.dim_inner = dim_inner 53 | self.pool_size = pool_size 54 | self.instantiation = instantiation 55 | self.use_pool = ( 56 | False 57 | if pool_size is None 58 | else any((size > 1 for size in pool_size)) 59 | ) 60 | self.norm_eps = norm_eps 61 | self.norm_momentum = norm_momentum 62 | self._construct_nonlocal( 63 | zero_init_final_conv, zero_init_final_norm, norm_module 64 | ) 65 | 66 | def _construct_nonlocal( 67 | self, zero_init_final_conv, zero_init_final_norm, norm_module 68 | ): 69 | # Three convolution heads: theta, phi, and g. 70 | self.conv_theta = nn.Conv3d( 71 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 72 | ) 73 | self.conv_phi = nn.Conv3d( 74 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 75 | ) 76 | self.conv_g = nn.Conv3d( 77 | self.dim, self.dim_inner, kernel_size=1, stride=1, padding=0 78 | ) 79 | 80 | # Final convolution output. 81 | self.conv_out = nn.Conv3d( 82 | self.dim_inner, self.dim, kernel_size=1, stride=1, padding=0 83 | ) 84 | # Zero initializing the final convolution output. 85 | self.conv_out.zero_init = zero_init_final_conv 86 | 87 | # TODO: change the name to `norm` 88 | self.bn = norm_module( 89 | num_features=self.dim, 90 | eps=self.norm_eps, 91 | momentum=self.norm_momentum, 92 | ) 93 | # Zero initializing the final bn. 94 | self.bn.transform_final_bn = zero_init_final_norm 95 | 96 | # Optional to add the spatial-temporal pooling. 97 | if self.use_pool: 98 | self.pool = nn.MaxPool3d( 99 | kernel_size=self.pool_size, 100 | stride=self.pool_size, 101 | padding=[0, 0, 0], 102 | ) 103 | 104 | def forward(self, x): 105 | x_identity = x 106 | N, C, T, H, W = x.size() 107 | 108 | theta = self.conv_theta(x) 109 | 110 | # Perform temporal-spatial pooling to reduce the computation. 111 | if self.use_pool: 112 | x = self.pool(x) 113 | 114 | phi = self.conv_phi(x) 115 | g = self.conv_g(x) 116 | 117 | theta = theta.view(N, self.dim_inner, -1) 118 | phi = phi.view(N, self.dim_inner, -1) 119 | g = g.view(N, self.dim_inner, -1) 120 | 121 | # (N, C, TxHxW) * (N, C, TxHxW) => (N, TxHxW, TxHxW). 122 | theta_phi = torch.einsum("nct,ncp->ntp", (theta, phi)) 123 | # For original Non-local paper, there are two main ways to normalize 124 | # the affinity tensor: 125 | # 1) Softmax normalization (norm on exp). 126 | # 2) dot_product normalization. 127 | if self.instantiation == "softmax": 128 | # Normalizing the affinity tensor theta_phi before softmax. 129 | theta_phi = theta_phi * (self.dim_inner ** -0.5) 130 | theta_phi = nn.functional.softmax(theta_phi, dim=2) 131 | elif self.instantiation == "dot_product": 132 | spatial_temporal_dim = theta_phi.shape[2] 133 | theta_phi = theta_phi / spatial_temporal_dim 134 | else: 135 | raise NotImplementedError( 136 | "Unknown norm type {}".format(self.instantiation) 137 | ) 138 | 139 | # (N, TxHxW, TxHxW) * (N, C, TxHxW) => (N, C, TxHxW). 140 | theta_phi_g = torch.einsum("ntg,ncg->nct", (theta_phi, g)) 141 | 142 | # (N, C, TxHxW) => (N, C, T, H, W). 143 | theta_phi_g = theta_phi_g.view(N, self.dim_inner, T, H, W) 144 | 145 | p = self.conv_out(theta_phi_g) 146 | p = self.bn(p) 147 | return x_identity + p 148 | -------------------------------------------------------------------------------- /timesformer/models/operators.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
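# This module provides a Swish activation (x * sigmoid(x)), implemented as a
# custom autograd Function that stores only the input and recomputes
# sigmoid(x) in the backward pass, plus a Squeeze-and-Excitation (SE) block
# built on top of it.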
2 | 3 | """Custom operators.""" 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | 9 | class Swish(nn.Module): 10 | """Swish activation function: x * sigmoid(x).""" 11 | 12 | def __init__(self): 13 | super(Swish, self).__init__() 14 | 15 | def forward(self, x): 16 | return SwishEfficient.apply(x) 17 | 18 | 19 | class SwishEfficient(torch.autograd.Function): 20 | """Swish activation function: x * sigmoid(x).""" 21 | 22 | @staticmethod 23 | def forward(ctx, x): 24 | result = x * torch.sigmoid(x) 25 | ctx.save_for_backward(x) 26 | return result 27 | 28 | @staticmethod 29 | def backward(ctx, grad_output): 30 | x = ctx.saved_variables[0] 31 | sigmoid_x = torch.sigmoid(x) 32 | return grad_output * (sigmoid_x * (1 + x * (1 - sigmoid_x))) 33 | 34 | 35 | class SE(nn.Module): 36 | """Squeeze-and-Excitation (SE) block w/ Swish: AvgPool, FC, Swish, FC, Sigmoid.""" 37 | 38 | def _round_width(self, width, multiplier, min_width=8, divisor=8): 39 | """ 40 | Round width of filters based on width multiplier 41 | Args: 42 | width (int): the channel dimensions of the input. 43 | multiplier (float): the multiplication factor. 44 | min_width (int): the minimum width after multiplication. 45 | divisor (int): the new width should be dividable by divisor. 46 | """ 47 | if not multiplier: 48 | return width 49 | 50 | width *= multiplier 51 | min_width = min_width or divisor 52 | width_out = max( 53 | min_width, int(width + divisor / 2) // divisor * divisor 54 | ) 55 | if width_out < 0.9 * width: 56 | width_out += divisor 57 | return int(width_out) 58 | 59 | def __init__(self, dim_in, ratio, relu_act=True): 60 | """ 61 | Args: 62 | dim_in (int): the channel dimensions of the input. 63 | ratio (float): the channel reduction ratio for squeeze. 64 | relu_act (bool): whether to use ReLU activation instead 65 | of Swish (default). 66 | divisor (int): the new width should be dividable by divisor. 67 | """ 68 | super(SE, self).__init__() 69 | self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) 70 | dim_fc = self._round_width(dim_in, ratio) 71 | self.fc1 = nn.Conv3d(dim_in, dim_fc, 1, bias=True) 72 | self.fc1_act = nn.ReLU() if relu_act else Swish() 73 | self.fc2 = nn.Conv3d(dim_fc, dim_in, 1, bias=True) 74 | 75 | self.fc2_sig = nn.Sigmoid() 76 | 77 | def forward(self, x): 78 | x_in = x 79 | for module in self.children(): 80 | x = module(x) 81 | return x_in * x 82 | -------------------------------------------------------------------------------- /timesformer/models/optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Optimizer.""" 4 | 5 | import torch 6 | 7 | import timesformer.utils.lr_policy as lr_policy 8 | 9 | 10 | def construct_optimizer(model, cfg): 11 | """ 12 | Construct a stochastic gradient descent or ADAM optimizer with momentum. 13 | Details can be found in: 14 | Herbert Robbins, and Sutton Monro. "A stochastic approximation method." 15 | and 16 | Diederik P.Kingma, and Jimmy Ba. 17 | "Adam: A Method for Stochastic Optimization." 18 | 19 | Args: 20 | model (model): model to perform stochastic gradient descent 21 | optimization or ADAM optimization. 22 | cfg (config): configs of hyper-parameters of SGD or ADAM, includes base 23 | learning rate, momentum, weight_decay, dampening, and etc. 24 | """ 25 | # Batchnorm parameters. 26 | bn_params = [] 27 | # Non-batchnorm parameters. 
28 | non_bn_parameters = [] 29 | for name, p in model.named_parameters(): 30 | if "bn" in name: 31 | bn_params.append(p) 32 | else: 33 | non_bn_parameters.append(p) 34 | # Apply different weight decay to Batchnorm and non-batchnorm parameters. 35 | # In Caffe2 classification codebase the weight decay for batchnorm is 0.0. 36 | # Having a different weight decay on batchnorm might cause a performance 37 | # drop. 38 | optim_params = [ 39 | {"params": bn_params, "weight_decay": cfg.BN.WEIGHT_DECAY}, 40 | {"params": non_bn_parameters, "weight_decay": cfg.SOLVER.WEIGHT_DECAY}, 41 | ] 42 | # Check all parameters will be passed into optimizer. 43 | assert len(list(model.parameters())) == len(non_bn_parameters) + len( 44 | bn_params 45 | ), "parameter size does not match: {} + {} != {}".format( 46 | len(non_bn_parameters), len(bn_params), len(list(model.parameters())) 47 | ) 48 | 49 | if cfg.SOLVER.OPTIMIZING_METHOD == "sgd": 50 | return torch.optim.SGD( 51 | optim_params, 52 | lr=cfg.SOLVER.BASE_LR, 53 | momentum=cfg.SOLVER.MOMENTUM, 54 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 55 | dampening=cfg.SOLVER.DAMPENING, 56 | nesterov=cfg.SOLVER.NESTEROV, 57 | ) 58 | elif cfg.SOLVER.OPTIMIZING_METHOD == "adam": 59 | return torch.optim.Adam( 60 | optim_params, 61 | lr=cfg.SOLVER.BASE_LR, 62 | betas=(0.9, 0.999), 63 | eps=1e-08, 64 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 65 | ) 66 | elif cfg.SOLVER.OPTIMIZING_METHOD == "adamw": 67 | return torch.optim.AdamW( 68 | optim_params, 69 | lr=cfg.SOLVER.BASE_LR, 70 | betas=(0.9, 0.999), 71 | eps=1e-08, 72 | weight_decay=cfg.SOLVER.WEIGHT_DECAY, 73 | ) 74 | else: 75 | raise NotImplementedError( 76 | "Does not support {} optimizer".format(cfg.SOLVER.OPTIMIZING_METHOD) 77 | ) 78 | 79 | 80 | def get_epoch_lr(cur_epoch, cfg): 81 | """ 82 | Retrieves the lr for the given epoch (as specified by the lr policy). 83 | Args: 84 | cfg (config): configs of hyper-parameters of ADAM, includes base 85 | learning rate, betas, and weight decays. 86 | cur_epoch (float): the number of epoch of the current training stage. 87 | """ 88 | return lr_policy.get_lr_at_epoch(cfg, cur_epoch) 89 | 90 | 91 | def set_lr(optimizer, new_lr): 92 | """ 93 | Sets the optimizer lr to the specified value. 94 | Args: 95 | optimizer (optim): the optimizer using to optimize the current network. 96 | new_lr (float): the new learning rate to set. 97 | """ 98 | for param_group in optimizer.param_groups: 99 | param_group["lr"] = new_lr 100 | -------------------------------------------------------------------------------- /timesformer/models/stem_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """ResNe(X)t 3D stem helper.""" 4 | 5 | import torch.nn as nn 6 | 7 | 8 | def get_stem_func(name): 9 | """ 10 | Retrieves the stem module by name. 11 | """ 12 | trans_funcs = {"x3d_stem": X3DStem, "basic_stem": ResNetBasicStem} 13 | assert ( 14 | name in trans_funcs.keys() 15 | ), "Transformation function '{}' not supported".format(name) 16 | return trans_funcs[name] 17 | 18 | 19 | class VideoModelStem(nn.Module): 20 | """ 21 | Video 3D stem module. Provides stem operations of Conv, BN, ReLU, MaxPool 22 | on input data tensor for one or multiple pathways. 
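    Example (illustrative single-pathway values, not from a provided config):
        >>> import torch
        >>> stem = VideoModelStem(
        ...     dim_in=[3], dim_out=[64],
        ...     kernel=[[5, 7, 7]], stride=[[1, 2, 2]], padding=[[2, 3, 3]],
        ... )
        >>> out = stem([torch.rand(2, 3, 8, 224, 224)])
        >>> out[0].shape
        torch.Size([2, 64, 8, 56, 56])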
23 | """ 24 | 25 | def __init__( 26 | self, 27 | dim_in, 28 | dim_out, 29 | kernel, 30 | stride, 31 | padding, 32 | inplace_relu=True, 33 | eps=1e-5, 34 | bn_mmt=0.1, 35 | norm_module=nn.BatchNorm3d, 36 | stem_func_name="basic_stem", 37 | ): 38 | """ 39 | The `__init__` method of any subclass should also contain these 40 | arguments. List size of 1 for single pathway models (C2D, I3D, Slow 41 | and etc), list size of 2 for two pathway models (SlowFast). 42 | 43 | Args: 44 | dim_in (list): the list of channel dimensions of the inputs. 45 | dim_out (list): the output dimension of the convolution in the stem 46 | layer. 47 | kernel (list): the kernels' size of the convolutions in the stem 48 | layers. Temporal kernel size, height kernel size, width kernel 49 | size in order. 50 | stride (list): the stride sizes of the convolutions in the stem 51 | layer. Temporal kernel stride, height kernel size, width kernel 52 | size in order. 53 | padding (list): the paddings' sizes of the convolutions in the stem 54 | layer. Temporal padding size, height padding size, width padding 55 | size in order. 56 | inplace_relu (bool): calculate the relu on the original input 57 | without allocating new memory. 58 | eps (float): epsilon for batch norm. 59 | bn_mmt (float): momentum for batch norm. Noted that BN momentum in 60 | PyTorch = 1 - BN momentum in Caffe2. 61 | norm_module (nn.Module): nn.Module for the normalization layer. The 62 | default is nn.BatchNorm3d. 63 | stem_func_name (string): name of the the stem function applied on 64 | input to the network. 65 | """ 66 | super(VideoModelStem, self).__init__() 67 | 68 | assert ( 69 | len( 70 | { 71 | len(dim_in), 72 | len(dim_out), 73 | len(kernel), 74 | len(stride), 75 | len(padding), 76 | } 77 | ) 78 | == 1 79 | ), "Input pathway dimensions are not consistent." 80 | self.num_pathways = len(dim_in) 81 | self.kernel = kernel 82 | self.stride = stride 83 | self.padding = padding 84 | self.inplace_relu = inplace_relu 85 | self.eps = eps 86 | self.bn_mmt = bn_mmt 87 | # Construct the stem layer. 88 | self._construct_stem(dim_in, dim_out, norm_module, stem_func_name) 89 | 90 | def _construct_stem(self, dim_in, dim_out, norm_module, stem_func_name): 91 | trans_func = get_stem_func(stem_func_name) 92 | 93 | for pathway in range(len(dim_in)): 94 | stem = trans_func( 95 | dim_in[pathway], 96 | dim_out[pathway], 97 | self.kernel[pathway], 98 | self.stride[pathway], 99 | self.padding[pathway], 100 | self.inplace_relu, 101 | self.eps, 102 | self.bn_mmt, 103 | norm_module, 104 | ) 105 | self.add_module("pathway{}_stem".format(pathway), stem) 106 | 107 | def forward(self, x): 108 | assert ( 109 | len(x) == self.num_pathways 110 | ), "Input tensor does not contain {} pathway".format(self.num_pathways) 111 | for pathway in range(len(x)): 112 | m = getattr(self, "pathway{}_stem".format(pathway)) 113 | x[pathway] = m(x[pathway]) 114 | return x 115 | 116 | 117 | class ResNetBasicStem(nn.Module): 118 | """ 119 | ResNe(X)t 3D stem module. 120 | Performs spatiotemporal Convolution, BN, and Relu following by a 121 | spatiotemporal pooling. 122 | """ 123 | 124 | def __init__( 125 | self, 126 | dim_in, 127 | dim_out, 128 | kernel, 129 | stride, 130 | padding, 131 | inplace_relu=True, 132 | eps=1e-5, 133 | bn_mmt=0.1, 134 | norm_module=nn.BatchNorm3d, 135 | ): 136 | """ 137 | The `__init__` method of any subclass should also contain these arguments. 138 | 139 | Args: 140 | dim_in (int): the channel dimension of the input. 
Normally 3 is used 141 | for rgb input, and 2 or 3 is used for optical flow input. 142 | dim_out (int): the output dimension of the convolution in the stem 143 | layer. 144 | kernel (list): the kernel size of the convolution in the stem layer. 145 | temporal kernel size, height kernel size, width kernel size in 146 | order. 147 | stride (list): the stride size of the convolution in the stem layer. 148 | temporal kernel stride, height kernel size, width kernel size in 149 | order. 150 | padding (int): the padding size of the convolution in the stem 151 | layer, temporal padding size, height padding size, width 152 | padding size in order. 153 | inplace_relu (bool): calculate the relu on the original input 154 | without allocating new memory. 155 | eps (float): epsilon for batch norm. 156 | bn_mmt (float): momentum for batch norm. Noted that BN momentum in 157 | PyTorch = 1 - BN momentum in Caffe2. 158 | norm_module (nn.Module): nn.Module for the normalization layer. The 159 | default is nn.BatchNorm3d. 160 | """ 161 | super(ResNetBasicStem, self).__init__() 162 | self.kernel = kernel 163 | self.stride = stride 164 | self.padding = padding 165 | self.inplace_relu = inplace_relu 166 | self.eps = eps 167 | self.bn_mmt = bn_mmt 168 | # Construct the stem layer. 169 | self._construct_stem(dim_in, dim_out, norm_module) 170 | 171 | def _construct_stem(self, dim_in, dim_out, norm_module): 172 | self.conv = nn.Conv3d( 173 | dim_in, 174 | dim_out, 175 | self.kernel, 176 | stride=self.stride, 177 | padding=self.padding, 178 | bias=False, 179 | ) 180 | self.bn = norm_module( 181 | num_features=dim_out, eps=self.eps, momentum=self.bn_mmt 182 | ) 183 | self.relu = nn.ReLU(self.inplace_relu) 184 | self.pool_layer = nn.MaxPool3d( 185 | kernel_size=[1, 3, 3], stride=[1, 2, 2], padding=[0, 1, 1] 186 | ) 187 | 188 | def forward(self, x): 189 | x = self.conv(x) 190 | x = self.bn(x) 191 | x = self.relu(x) 192 | x = self.pool_layer(x) 193 | return x 194 | 195 | 196 | class X3DStem(nn.Module): 197 | """ 198 | X3D's 3D stem module. 199 | Performs a spatial followed by a depthwise temporal Convolution, BN, and Relu following by a 200 | spatiotemporal pooling. 201 | """ 202 | 203 | def __init__( 204 | self, 205 | dim_in, 206 | dim_out, 207 | kernel, 208 | stride, 209 | padding, 210 | inplace_relu=True, 211 | eps=1e-5, 212 | bn_mmt=0.1, 213 | norm_module=nn.BatchNorm3d, 214 | ): 215 | """ 216 | The `__init__` method of any subclass should also contain these arguments. 217 | 218 | Args: 219 | dim_in (int): the channel dimension of the input. Normally 3 is used 220 | for rgb input, and 2 or 3 is used for optical flow input. 221 | dim_out (int): the output dimension of the convolution in the stem 222 | layer. 223 | kernel (list): the kernel size of the convolution in the stem layer. 224 | temporal kernel size, height kernel size, width kernel size in 225 | order. 226 | stride (list): the stride size of the convolution in the stem layer. 227 | temporal kernel stride, height kernel size, width kernel size in 228 | order. 229 | padding (int): the padding size of the convolution in the stem 230 | layer, temporal padding size, height padding size, width 231 | padding size in order. 232 | inplace_relu (bool): calculate the relu on the original input 233 | without allocating new memory. 234 | eps (float): epsilon for batch norm. 235 | bn_mmt (float): momentum for batch norm. Noted that BN momentum in 236 | PyTorch = 1 - BN momentum in Caffe2. 237 | norm_module (nn.Module): nn.Module for the normalization layer. 
The 238 | default is nn.BatchNorm3d. 239 | """ 240 | super(X3DStem, self).__init__() 241 | self.kernel = kernel 242 | self.stride = stride 243 | self.padding = padding 244 | self.inplace_relu = inplace_relu 245 | self.eps = eps 246 | self.bn_mmt = bn_mmt 247 | # Construct the stem layer. 248 | self._construct_stem(dim_in, dim_out, norm_module) 249 | 250 | def _construct_stem(self, dim_in, dim_out, norm_module): 251 | self.conv_xy = nn.Conv3d( 252 | dim_in, 253 | dim_out, 254 | kernel_size=(1, self.kernel[1], self.kernel[2]), 255 | stride=(1, self.stride[1], self.stride[2]), 256 | padding=(0, self.padding[1], self.padding[2]), 257 | bias=False, 258 | ) 259 | self.conv = nn.Conv3d( 260 | dim_out, 261 | dim_out, 262 | kernel_size=(self.kernel[0], 1, 1), 263 | stride=(self.stride[0], 1, 1), 264 | padding=(self.padding[0], 0, 0), 265 | bias=False, 266 | groups=dim_out, 267 | ) 268 | 269 | self.bn = norm_module( 270 | num_features=dim_out, eps=self.eps, momentum=self.bn_mmt 271 | ) 272 | self.relu = nn.ReLU(self.inplace_relu) 273 | 274 | def forward(self, x): 275 | x = self.conv_xy(x) 276 | x = self.conv(x) 277 | x = self.bn(x) 278 | x = self.relu(x) 279 | return x 280 | -------------------------------------------------------------------------------- /timesformer/models/vit_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Ross Wightman 2 | # Various utility functions 3 | 4 | import torch 5 | import torch.nn as nn 6 | from functools import partial 7 | import math 8 | import warnings 9 | import torch.nn.functional as F 10 | 11 | from timesformer.models.helpers import load_pretrained 12 | from .build import MODEL_REGISTRY 13 | from itertools import repeat 14 | from torch._six import container_abcs 15 | 16 | DEFAULT_CROP_PCT = 0.875 17 | IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) 18 | IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) 19 | IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) 20 | IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) 21 | IMAGENET_DPN_MEAN = (124 / 255, 117 / 255, 104 / 255) 22 | IMAGENET_DPN_STD = tuple([1 / (.0167 * 255)] * 3) 23 | 24 | def _no_grad_trunc_normal_(tensor, mean, std, a, b): 25 | def norm_cdf(x): 26 | # Computes standard normal cumulative distribution function 27 | return (1. + math.erf(x / math.sqrt(2.))) / 2. 28 | 29 | if (mean < a - 2 * std) or (mean > b + 2 * std): 30 | warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " 31 | "The distribution of values may be incorrect.", 32 | stacklevel=2) 33 | 34 | with torch.no_grad(): 35 | # Values are generated by using a truncated uniform distribution and 36 | # then using the inverse CDF for the normal distribution. 37 | # Get upper and lower cdf values 38 | l = norm_cdf((a - mean) / std) 39 | u = norm_cdf((b - mean) / std) 40 | 41 | # Uniformly fill tensor with values from [l, u], then translate to 42 | # [2l-1, 2u-1]. 43 | tensor.uniform_(2 * l - 1, 2 * u - 1) 44 | 45 | # Use inverse cdf transform for normal distribution to get truncated 46 | # standard normal 47 | tensor.erfinv_() 48 | 49 | # Transform to proper mean, std 50 | tensor.mul_(std * math.sqrt(2.)) 51 | tensor.add_(mean) 52 | 53 | # Clamp to ensure it's in the proper range 54 | tensor.clamp_(min=a, max=b) 55 | return tensor 56 | 57 | def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): 58 | # type: (Tensor, float, float, float, float) -> Tensor 59 | r"""Fills the input Tensor with values drawn from a truncated 60 | normal distribution. 
The values are effectively drawn from the 61 | normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` 62 | with values outside :math:`[a, b]` redrawn until they are within 63 | the bounds. The method used for generating the random values works 64 | best when :math:`a \leq \text{mean} \leq b`. 65 | Args: 66 | tensor: an n-dimensional `torch.Tensor` 67 | mean: the mean of the normal distribution 68 | std: the standard deviation of the normal distribution 69 | a: the minimum cutoff value 70 | b: the maximum cutoff value 71 | Examples: 72 | >>> w = torch.empty(3, 5) 73 | >>> nn.init.trunc_normal_(w) 74 | """ 75 | return _no_grad_trunc_normal_(tensor, mean, std, a, b) 76 | 77 | # From PyTorch internals 78 | def _ntuple(n): 79 | def parse(x): 80 | if isinstance(x, container_abcs.Iterable): 81 | return x 82 | return tuple(repeat(x, n)) 83 | return parse 84 | to_2tuple = _ntuple(2) 85 | 86 | # Calculate symmetric padding for a convolution 87 | def get_padding(kernel_size: int, stride: int = 1, dilation: int = 1, **_) -> int: 88 | padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 89 | return padding 90 | 91 | def get_padding_value(padding, kernel_size, **kwargs): 92 | dynamic = False 93 | if isinstance(padding, str): 94 | # for any string padding, the padding will be calculated for you, one of three ways 95 | padding = padding.lower() 96 | if padding == 'same': 97 | # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact 98 | if is_static_pad(kernel_size, **kwargs): 99 | # static case, no extra overhead 100 | padding = get_padding(kernel_size, **kwargs) 101 | else: 102 | # dynamic 'SAME' padding, has runtime/GPU memory overhead 103 | padding = 0 104 | dynamic = True 105 | elif padding == 'valid': 106 | # 'VALID' padding, same as padding=0 107 | padding = 0 108 | else: 109 | # Default to PyTorch style 'same'-ish symmetric padding 110 | padding = get_padding(kernel_size, **kwargs) 111 | return padding, dynamic 112 | 113 | # Calculate asymmetric TensorFlow-like 'SAME' padding for a convolution 114 | def get_same_padding(x: int, k: int, s: int, d: int): 115 | return max((int(math.ceil(x // s)) - 1) * s + (k - 1) * d + 1 - x, 0) 116 | 117 | 118 | # Can SAME padding for given args be done statically? 119 | def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, **_): 120 | return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0 121 | 122 | 123 | # Dynamically pad input x with 'SAME' padding for conv with specified args 124 | #def pad_same(x, k: List[int], s: List[int], d: List[int] = (1, 1), value: float = 0): 125 | def pad_same(x, k, s, d=(1, 1), value= 0): 126 | ih, iw = x.size()[-2:] 127 | pad_h, pad_w = get_same_padding(ih, k[0], s[0], d[0]), get_same_padding(iw, k[1], s[1], d[1]) 128 | if pad_h > 0 or pad_w > 0: 129 | x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2], value=value) 130 | return x 131 | 132 | def adaptive_pool_feat_mult(pool_type='avg'): 133 | if pool_type == 'catavgmax': 134 | return 2 135 | else: 136 | return 1 137 | 138 | def drop_path(x, drop_prob: float = 0., training: bool = False): 139 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 140 | This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, 141 | the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... 142 | See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... 
I've opted for 143 | changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 144 | 'survival rate' as the argument. 145 | """ 146 | if drop_prob == 0. or not training: 147 | return x 148 | keep_prob = 1 - drop_prob 149 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 150 | random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) 151 | random_tensor.floor_() # binarize 152 | output = x.div(keep_prob) * random_tensor 153 | return output 154 | 155 | class DropPath(nn.Module): 156 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 157 | """ 158 | def __init__(self, drop_prob=None): 159 | super(DropPath, self).__init__() 160 | self.drop_prob = drop_prob 161 | 162 | def forward(self, x): 163 | return drop_path(x, self.drop_prob, self.training) 164 | -------------------------------------------------------------------------------- /timesformer/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /timesformer/utils/ava_eval_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | ############################################################################## 15 | # 16 | # Based on: 17 | # -------------------------------------------------------- 18 | # ActivityNet 19 | # Copyright (c) 2015 ActivityNet 20 | # Licensed under The MIT License 21 | # [see https://github.com/activitynet/ActivityNet/blob/master/LICENSE for details] 22 | # -------------------------------------------------------- 23 | 24 | """Helper functions for AVA evaluation.""" 25 | 26 | from __future__ import ( 27 | absolute_import, 28 | division, 29 | print_function, 30 | unicode_literals, 31 | ) 32 | import csv 33 | import logging 34 | import numpy as np 35 | import pprint 36 | import time 37 | from collections import defaultdict 38 | from fvcore.common.file_io import PathManager 39 | import timesformer.utils.distributed as du 40 | 41 | from timesformer.utils.ava_evaluation import ( 42 | object_detection_evaluation, 43 | standard_fields, 44 | ) 45 | 46 | logger = logging.getLogger(__name__) 47 | 48 | 49 | def make_image_key(video_id, timestamp): 50 | """Returns a unique identifier for a video id & timestamp.""" 51 | return "%s,%04d" % (video_id, int(timestamp)) 52 | 53 | 54 | def read_csv(csv_file, class_whitelist=None, load_score=False): 55 | """Loads boxes and class labels from a CSV file in the AVA format. 56 | CSV file format described at https://research.google.com/ava/download.html. 57 | Args: 58 | csv_file: A file object. 
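            Each row must contain 7 or 8 comma-separated values:
            video_id, timestamp, x1, y1, x2, y2, action_id and, optionally,
            a score (only read when load_score is True).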
59 | class_whitelist: If provided, boxes corresponding to (integer) class labels 60 | not in this set are skipped. 61 | Returns: 62 | boxes: A dictionary mapping each unique image key (string) to a list of 63 | boxes, given as coordinates [y1, x1, y2, x2]. 64 | labels: A dictionary mapping each unique image key (string) to a list of 65 | integer class lables, matching the corresponding box in `boxes`. 66 | scores: A dictionary mapping each unique image key (string) to a list of 67 | score values lables, matching the corresponding label in `labels`. If 68 | scores are not provided in the csv, then they will default to 1.0. 69 | """ 70 | boxes = defaultdict(list) 71 | labels = defaultdict(list) 72 | scores = defaultdict(list) 73 | with PathManager.open(csv_file, "r") as f: 74 | reader = csv.reader(f) 75 | for row in reader: 76 | assert len(row) in [7, 8], "Wrong number of columns: " + row 77 | image_key = make_image_key(row[0], row[1]) 78 | x1, y1, x2, y2 = [float(n) for n in row[2:6]] 79 | action_id = int(row[6]) 80 | if class_whitelist and action_id not in class_whitelist: 81 | continue 82 | score = 1.0 83 | if load_score: 84 | score = float(row[7]) 85 | boxes[image_key].append([y1, x1, y2, x2]) 86 | labels[image_key].append(action_id) 87 | scores[image_key].append(score) 88 | return boxes, labels, scores 89 | 90 | 91 | def read_exclusions(exclusions_file): 92 | """Reads a CSV file of excluded timestamps. 93 | Args: 94 | exclusions_file: A file object containing a csv of video-id,timestamp. 95 | Returns: 96 | A set of strings containing excluded image keys, e.g. "aaaaaaaaaaa,0904", 97 | or an empty set if exclusions file is None. 98 | """ 99 | excluded = set() 100 | if exclusions_file: 101 | with PathManager.open(exclusions_file, "r") as f: 102 | reader = csv.reader(f) 103 | for row in reader: 104 | assert len(row) == 2, "Expected only 2 columns, got: " + row 105 | excluded.add(make_image_key(row[0], row[1])) 106 | return excluded 107 | 108 | 109 | def read_labelmap(labelmap_file): 110 | """Read label map and class ids.""" 111 | 112 | labelmap = [] 113 | class_ids = set() 114 | name = "" 115 | class_id = "" 116 | with PathManager.open(labelmap_file, "r") as f: 117 | for line in f: 118 | if line.startswith(" name:"): 119 | name = line.split('"')[1] 120 | elif line.startswith(" id:") or line.startswith(" label_id:"): 121 | class_id = int(line.strip().split(" ")[-1]) 122 | labelmap.append({"id": class_id, "name": name}) 123 | class_ids.add(class_id) 124 | return labelmap, class_ids 125 | 126 | 127 | def evaluate_ava_from_files(labelmap, groundtruth, detections, exclusions): 128 | """Run AVA evaluation given annotation/prediction files.""" 129 | 130 | categories, class_whitelist = read_labelmap(labelmap) 131 | excluded_keys = read_exclusions(exclusions) 132 | groundtruth = read_csv(groundtruth, class_whitelist, load_score=False) 133 | detections = read_csv(detections, class_whitelist, load_score=True) 134 | run_evaluation(categories, groundtruth, detections, excluded_keys) 135 | 136 | 137 | def evaluate_ava( 138 | preds, 139 | original_boxes, 140 | metadata, 141 | excluded_keys, 142 | class_whitelist, 143 | categories, 144 | groundtruth=None, 145 | video_idx_to_name=None, 146 | name="latest", 147 | ): 148 | """Run AVA evaluation given numpy arrays.""" 149 | 150 | eval_start = time.time() 151 | 152 | detections = get_ava_eval_data( 153 | preds, 154 | original_boxes, 155 | metadata, 156 | class_whitelist, 157 | video_idx_to_name=video_idx_to_name, 158 | ) 159 | 160 | logger.info("Evaluating with 
%d unique GT frames." % len(groundtruth[0])) 161 | logger.info( 162 | "Evaluating with %d unique detection frames" % len(detections[0]) 163 | ) 164 | 165 | write_results(detections, "detections_%s.csv" % name) 166 | write_results(groundtruth, "groundtruth_%s.csv" % name) 167 | 168 | results = run_evaluation(categories, groundtruth, detections, excluded_keys) 169 | 170 | logger.info("AVA eval done in %f seconds." % (time.time() - eval_start)) 171 | return results["PascalBoxes_Precision/mAP@0.5IOU"] 172 | 173 | 174 | def run_evaluation( 175 | categories, groundtruth, detections, excluded_keys, verbose=True 176 | ): 177 | """AVA evaluation main logic.""" 178 | 179 | pascal_evaluator = object_detection_evaluation.PascalDetectionEvaluator( 180 | categories 181 | ) 182 | 183 | boxes, labels, _ = groundtruth 184 | 185 | gt_keys = [] 186 | pred_keys = [] 187 | 188 | for image_key in boxes: 189 | if image_key in excluded_keys: 190 | logging.info( 191 | ( 192 | "Found excluded timestamp in ground truth: %s. " 193 | "It will be ignored." 194 | ), 195 | image_key, 196 | ) 197 | continue 198 | pascal_evaluator.add_single_ground_truth_image_info( 199 | image_key, 200 | { 201 | standard_fields.InputDataFields.groundtruth_boxes: np.array( 202 | boxes[image_key], dtype=float 203 | ), 204 | standard_fields.InputDataFields.groundtruth_classes: np.array( 205 | labels[image_key], dtype=int 206 | ), 207 | standard_fields.InputDataFields.groundtruth_difficult: np.zeros( 208 | len(boxes[image_key]), dtype=bool 209 | ), 210 | }, 211 | ) 212 | 213 | gt_keys.append(image_key) 214 | 215 | boxes, labels, scores = detections 216 | 217 | for image_key in boxes: 218 | if image_key in excluded_keys: 219 | logging.info( 220 | ( 221 | "Found excluded timestamp in detections: %s. " 222 | "It will be ignored." 223 | ), 224 | image_key, 225 | ) 226 | continue 227 | pascal_evaluator.add_single_detected_image_info( 228 | image_key, 229 | { 230 | standard_fields.DetectionResultFields.detection_boxes: np.array( 231 | boxes[image_key], dtype=float 232 | ), 233 | standard_fields.DetectionResultFields.detection_classes: np.array( 234 | labels[image_key], dtype=int 235 | ), 236 | standard_fields.DetectionResultFields.detection_scores: np.array( 237 | scores[image_key], dtype=float 238 | ), 239 | }, 240 | ) 241 | 242 | pred_keys.append(image_key) 243 | 244 | metrics = pascal_evaluator.evaluate() 245 | 246 | if du.is_master_proc(): 247 | pprint.pprint(metrics, indent=2) 248 | return metrics 249 | 250 | 251 | def get_ava_eval_data( 252 | scores, 253 | boxes, 254 | metadata, 255 | class_whitelist, 256 | verbose=False, 257 | video_idx_to_name=None, 258 | ): 259 | """ 260 | Convert our data format into the data format used in official AVA 261 | evaluation. 262 | """ 263 | 264 | out_scores = defaultdict(list) 265 | out_labels = defaultdict(list) 266 | out_boxes = defaultdict(list) 267 | count = 0 268 | for i in range(scores.shape[0]): 269 | video_idx = int(np.round(metadata[i][0])) 270 | sec = int(np.round(metadata[i][1])) 271 | 272 | video = video_idx_to_name[video_idx] 273 | 274 | key = video + "," + "%04d" % (sec) 275 | batch_box = boxes[i].tolist() 276 | # The first is batch idx. 
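        # The reindexing below keeps the batch index first and swaps each
        # (x, y) coordinate pair, converting the remaining four values to the
        # [y1, x1, y2, x2] order used by the AVA evaluation code; the batch
        # index itself is dropped later via batch_box[1:].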
277 | batch_box = [batch_box[j] for j in [0, 2, 1, 4, 3]] 278 | 279 | one_scores = scores[i].tolist() 280 | for cls_idx, score in enumerate(one_scores): 281 | if cls_idx + 1 in class_whitelist: 282 | out_scores[key].append(score) 283 | out_labels[key].append(cls_idx + 1) 284 | out_boxes[key].append(batch_box[1:]) 285 | count += 1 286 | 287 | return out_boxes, out_labels, out_scores 288 | 289 | 290 | def write_results(detections, filename): 291 | """Write prediction results into official formats.""" 292 | start = time.time() 293 | 294 | boxes, labels, scores = detections 295 | with PathManager.open(filename, "w") as f: 296 | for key in boxes.keys(): 297 | for box, label, score in zip(boxes[key], labels[key], scores[key]): 298 | f.write( 299 | "%s,%.03f,%.03f,%.03f,%.03f,%d,%.04f\n" 300 | % (key, box[1], box[0], box[3], box[2], label, score) 301 | ) 302 | 303 | logger.info("AVA results wrote to %s" % filename) 304 | logger.info("\ttook %d seconds." % (time.time() - start)) 305 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/README.md: -------------------------------------------------------------------------------- 1 | The code under this folder is from the official [ActivityNet repo](https://github.com/activitynet/ActivityNet). 2 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/TimeSformer/a5ef29a7b7264baff199a30b3306ac27de901133/timesformer/utils/ava_evaluation/__init__.py -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/ava_action_list_v2.1_for_activitynet_2018.pbtxt.txt: -------------------------------------------------------------------------------- 1 | item { 2 | name: "bend/bow (at the waist)" 3 | id: 1 4 | } 5 | item { 6 | name: "crouch/kneel" 7 | id: 3 8 | } 9 | item { 10 | name: "dance" 11 | id: 4 12 | } 13 | item { 14 | name: "fall down" 15 | id: 5 16 | } 17 | item { 18 | name: "get up" 19 | id: 6 20 | } 21 | item { 22 | name: "jump/leap" 23 | id: 7 24 | } 25 | item { 26 | name: "lie/sleep" 27 | id: 8 28 | } 29 | item { 30 | name: "martial art" 31 | id: 9 32 | } 33 | item { 34 | name: "run/jog" 35 | id: 10 36 | } 37 | item { 38 | name: "sit" 39 | id: 11 40 | } 41 | item { 42 | name: "stand" 43 | id: 12 44 | } 45 | item { 46 | name: "swim" 47 | id: 13 48 | } 49 | item { 50 | name: "walk" 51 | id: 14 52 | } 53 | item { 54 | name: "answer phone" 55 | id: 15 56 | } 57 | item { 58 | name: "carry/hold (an object)" 59 | id: 17 60 | } 61 | item { 62 | name: "climb (e.g., a mountain)" 63 | id: 20 64 | } 65 | item { 66 | name: "close (e.g., a door, a box)" 67 | id: 22 68 | } 69 | item { 70 | name: "cut" 71 | id: 24 72 | } 73 | item { 74 | name: "dress/put on clothing" 75 | id: 26 76 | } 77 | item { 78 | name: "drink" 79 | id: 27 80 | } 81 | item { 82 | name: "drive (e.g., a car, a truck)" 83 | id: 28 84 | } 85 | item { 86 | name: "eat" 87 | id: 29 88 | } 89 | item { 90 | name: "enter" 91 | id: 30 92 | } 93 | item { 94 | name: "hit (an object)" 95 | id: 34 96 | } 97 | item { 98 | name: "lift/pick up" 99 | id: 36 100 | } 101 | item { 102 | name: "listen (e.g., to music)" 103 | id: 37 104 | } 105 | item { 106 | name: "open (e.g., a window, a car door)" 107 | id: 38 108 | } 109 | item { 110 | name: "play musical instrument" 111 | id: 41 112 | } 113 | 
item { 114 | name: "point to (an object)" 115 | id: 43 116 | } 117 | item { 118 | name: "pull (an object)" 119 | id: 45 120 | } 121 | item { 122 | name: "push (an object)" 123 | id: 46 124 | } 125 | item { 126 | name: "put down" 127 | id: 47 128 | } 129 | item { 130 | name: "read" 131 | id: 48 132 | } 133 | item { 134 | name: "ride (e.g., a bike, a car, a horse)" 135 | id: 49 136 | } 137 | item { 138 | name: "sail boat" 139 | id: 51 140 | } 141 | item { 142 | name: "shoot" 143 | id: 52 144 | } 145 | item { 146 | name: "smoke" 147 | id: 54 148 | } 149 | item { 150 | name: "take a photo" 151 | id: 56 152 | } 153 | item { 154 | name: "text on/look at a cellphone" 155 | id: 57 156 | } 157 | item { 158 | name: "throw" 159 | id: 58 160 | } 161 | item { 162 | name: "touch (an object)" 163 | id: 59 164 | } 165 | item { 166 | name: "turn (e.g., a screwdriver)" 167 | id: 60 168 | } 169 | item { 170 | name: "watch (e.g., TV)" 171 | id: 61 172 | } 173 | item { 174 | name: "work on a computer" 175 | id: 62 176 | } 177 | item { 178 | name: "write" 179 | id: 63 180 | } 181 | item { 182 | name: "fight/hit (a person)" 183 | id: 64 184 | } 185 | item { 186 | name: "give/serve (an object) to (a person)" 187 | id: 65 188 | } 189 | item { 190 | name: "grab (a person)" 191 | id: 66 192 | } 193 | item { 194 | name: "hand clap" 195 | id: 67 196 | } 197 | item { 198 | name: "hand shake" 199 | id: 68 200 | } 201 | item { 202 | name: "hand wave" 203 | id: 69 204 | } 205 | item { 206 | name: "hug (a person)" 207 | id: 70 208 | } 209 | item { 210 | name: "kiss (a person)" 211 | id: 72 212 | } 213 | item { 214 | name: "lift (a person)" 215 | id: 73 216 | } 217 | item { 218 | name: "listen to (a person)" 219 | id: 74 220 | } 221 | item { 222 | name: "push (another person)" 223 | id: 76 224 | } 225 | item { 226 | name: "sing to (e.g., self, a person, a group)" 227 | id: 77 228 | } 229 | item { 230 | name: "take (an object) from (a person)" 231 | id: 78 232 | } 233 | item { 234 | name: "talk to (e.g., self, a person, a group)" 235 | id: 79 236 | } 237 | item { 238 | name: "watch (a person)" 239 | id: 80 240 | } 241 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/label_map_util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Label map utility functions.""" 16 | 17 | from __future__ import ( 18 | absolute_import, 19 | division, 20 | print_function, 21 | unicode_literals, 22 | ) 23 | import logging 24 | 25 | # from google.protobuf import text_format 26 | # from google3.third_party.tensorflow_models.object_detection.protos import string_int_label_map_pb2 27 | 28 | 29 | def _validate_label_map(label_map): 30 | """Checks if a label map is valid. 
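    Currently the only check performed is that every item id is >= 1.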
31 | 32 | Args: 33 | label_map: StringIntLabelMap to validate. 34 | 35 | Raises: 36 | ValueError: if label map is invalid. 37 | """ 38 | for item in label_map.item: 39 | if item.id < 1: 40 | raise ValueError("Label map ids should be >= 1.") 41 | 42 | 43 | def create_category_index(categories): 44 | """Creates dictionary of COCO compatible categories keyed by category id. 45 | 46 | Args: 47 | categories: a list of dicts, each of which has the following keys: 48 | 'id': (required) an integer id uniquely identifying this category. 49 | 'name': (required) string representing category name 50 | e.g., 'cat', 'dog', 'pizza'. 51 | 52 | Returns: 53 | category_index: a dict containing the same entries as categories, but keyed 54 | by the 'id' field of each category. 55 | """ 56 | category_index = {} 57 | for cat in categories: 58 | category_index[cat["id"]] = cat 59 | return category_index 60 | 61 | 62 | def get_max_label_map_index(label_map): 63 | """Get maximum index in label map. 64 | 65 | Args: 66 | label_map: a StringIntLabelMapProto 67 | 68 | Returns: 69 | an integer 70 | """ 71 | return max([item.id for item in label_map.item]) 72 | 73 | 74 | def convert_label_map_to_categories( 75 | label_map, max_num_classes, use_display_name=True 76 | ): 77 | """Loads label map proto and returns categories list compatible with eval. 78 | 79 | This function loads a label map and returns a list of dicts, each of which 80 | has the following keys: 81 | 'id': (required) an integer id uniquely identifying this category. 82 | 'name': (required) string representing category name 83 | e.g., 'cat', 'dog', 'pizza'. 84 | We only allow class into the list if its id-label_id_offset is 85 | between 0 (inclusive) and max_num_classes (exclusive). 86 | If there are several items mapping to the same id in the label map, 87 | we will only keep the first one in the categories list. 88 | 89 | Args: 90 | label_map: a StringIntLabelMapProto or None. If None, a default categories 91 | list is created with max_num_classes categories. 92 | max_num_classes: maximum number of (consecutive) label indices to include. 93 | use_display_name: (boolean) choose whether to load 'display_name' field 94 | as category name. If False or if the display_name field does not exist, 95 | uses 'name' field as category names instead. 96 | Returns: 97 | categories: a list of dictionaries representing all possible categories. 98 | """ 99 | categories = [] 100 | list_of_ids_already_added = [] 101 | if not label_map: 102 | label_id_offset = 1 103 | for class_id in range(max_num_classes): 104 | categories.append( 105 | { 106 | "id": class_id + label_id_offset, 107 | "name": "category_{}".format(class_id + label_id_offset), 108 | } 109 | ) 110 | return categories 111 | for item in label_map.item: 112 | if not 0 < item.id <= max_num_classes: 113 | logging.info( 114 | "Ignore item %d since it falls outside of requested " 115 | "label range.", 116 | item.id, 117 | ) 118 | continue 119 | if use_display_name and item.HasField("display_name"): 120 | name = item.display_name 121 | else: 122 | name = item.name 123 | if item.id not in list_of_ids_already_added: 124 | list_of_ids_already_added.append(item.id) 125 | categories.append({"id": item.id, "name": name}) 126 | return categories 127 | 128 | 129 | def load_labelmap(path): 130 | """Loads label map proto. 131 | 132 | Args: 133 | path: path to StringIntLabelMap proto text file. 
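    Note:
        This relies on `text_format` and `string_int_label_map_pb2`, whose
        imports are commented out at the top of this file, so those protobuf
        dependencies must be restored before this function can run.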
134 | Returns: 135 | a StringIntLabelMapProto 136 | """ 137 | with open(path, "r") as fid: 138 | label_map_string = fid.read() 139 | label_map = string_int_label_map_pb2.StringIntLabelMap() 140 | try: 141 | text_format.Merge(label_map_string, label_map) 142 | except text_format.ParseError: 143 | label_map.ParseFromString(label_map_string) 144 | _validate_label_map(label_map) 145 | return label_map 146 | 147 | 148 | def get_label_map_dict(label_map_path, use_display_name=False): 149 | """Reads a label map and returns a dictionary of label names to id. 150 | 151 | Args: 152 | label_map_path: path to label_map. 153 | use_display_name: whether to use the label map items' display names as keys. 154 | 155 | Returns: 156 | A dictionary mapping label names to id. 157 | """ 158 | label_map = load_labelmap(label_map_path) 159 | label_map_dict = {} 160 | for item in label_map.item: 161 | if use_display_name: 162 | label_map_dict[item.display_name] = item.id 163 | else: 164 | label_map_dict[item.name] = item.id 165 | return label_map_dict 166 | 167 | 168 | def create_category_index_from_labelmap(label_map_path): 169 | """Reads a label map and returns a category index. 170 | 171 | Args: 172 | label_map_path: Path to `StringIntLabelMap` proto text file. 173 | 174 | Returns: 175 | A category index, which is a dictionary that maps integer ids to dicts 176 | containing categories, e.g. 177 | {1: {'id': 1, 'name': 'dog'}, 2: {'id': 2, 'name': 'cat'}, ...} 178 | """ 179 | label_map = load_labelmap(label_map_path) 180 | max_num_classes = max(item.id for item in label_map.item) 181 | categories = convert_label_map_to_categories(label_map, max_num_classes) 182 | return create_category_index(categories) 183 | 184 | 185 | def create_class_agnostic_category_index(): 186 | """Creates a category index with a single `object` class.""" 187 | return {1: {"id": 1, "name": "object"}} 188 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Functions for computing metrics like precision, recall, CorLoc and etc.""" 17 | from __future__ import division 18 | import numpy as np 19 | 20 | 21 | def compute_precision_recall(scores, labels, num_gt): 22 | """Compute precision and recall. 23 | 24 | Args: 25 | scores: A float numpy array representing detection score 26 | labels: A boolean numpy array representing true/false positive labels 27 | num_gt: Number of ground truth instances 28 | 29 | Raises: 30 | ValueError: if the input is not of the correct format 31 | 32 | Returns: 33 | precision: Fraction of positive instances over detected ones. This value is 34 | None if no ground truth labels are present. 
35 | recall: Fraction of detected positive instance over all positive instances. 36 | This value is None if no ground truth labels are present. 37 | 38 | """ 39 | if ( 40 | not isinstance(labels, np.ndarray) 41 | or labels.dtype != np.bool 42 | or len(labels.shape) != 1 43 | ): 44 | raise ValueError("labels must be single dimension bool numpy array") 45 | 46 | if not isinstance(scores, np.ndarray) or len(scores.shape) != 1: 47 | raise ValueError("scores must be single dimension numpy array") 48 | 49 | if num_gt < np.sum(labels): 50 | raise ValueError( 51 | "Number of true positives must be smaller than num_gt." 52 | ) 53 | 54 | if len(scores) != len(labels): 55 | raise ValueError("scores and labels must be of the same size.") 56 | 57 | if num_gt == 0: 58 | return None, None 59 | 60 | sorted_indices = np.argsort(scores) 61 | sorted_indices = sorted_indices[::-1] 62 | labels = labels.astype(int) 63 | true_positive_labels = labels[sorted_indices] 64 | false_positive_labels = 1 - true_positive_labels 65 | cum_true_positives = np.cumsum(true_positive_labels) 66 | cum_false_positives = np.cumsum(false_positive_labels) 67 | precision = cum_true_positives.astype(float) / ( 68 | cum_true_positives + cum_false_positives 69 | ) 70 | recall = cum_true_positives.astype(float) / num_gt 71 | return precision, recall 72 | 73 | 74 | def compute_average_precision(precision, recall): 75 | """Compute Average Precision according to the definition in VOCdevkit. 76 | 77 | Precision is modified to ensure that it does not decrease as recall 78 | decrease. 79 | 80 | Args: 81 | precision: A float [N, 1] numpy array of precisions 82 | recall: A float [N, 1] numpy array of recalls 83 | 84 | Raises: 85 | ValueError: if the input is not of the correct format 86 | 87 | Returns: 88 | average_precison: The area under the precision recall curve. NaN if 89 | precision and recall are None. 
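  Example:
    A minimal sketch with made-up precision/recall values; it assumes a NumPy
    version old enough that the `np.float` alias used in this file still
    exists.

      >>> import numpy as np
      >>> precision = np.array([1.0, 0.5])   # hypothetical detector output
      >>> recall = np.array([0.5, 1.0])
      >>> float(compute_average_precision(precision, recall))
      0.75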
90 | 91 | """ 92 | if precision is None: 93 | if recall is not None: 94 | raise ValueError("If precision is None, recall must also be None") 95 | return np.NAN 96 | 97 | if not isinstance(precision, np.ndarray) or not isinstance( 98 | recall, np.ndarray 99 | ): 100 | raise ValueError("precision and recall must be numpy array") 101 | if precision.dtype != np.float or recall.dtype != np.float: 102 | raise ValueError("input must be float numpy array.") 103 | if len(precision) != len(recall): 104 | raise ValueError("precision and recall must be of the same size.") 105 | if not precision.size: 106 | return 0.0 107 | if np.amin(precision) < 0 or np.amax(precision) > 1: 108 | raise ValueError("Precision must be in the range of [0, 1].") 109 | if np.amin(recall) < 0 or np.amax(recall) > 1: 110 | raise ValueError("recall must be in the range of [0, 1].") 111 | if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)): 112 | raise ValueError("recall must be a non-decreasing array") 113 | 114 | recall = np.concatenate([[0], recall, [1]]) 115 | precision = np.concatenate([[0], precision, [0]]) 116 | 117 | # Preprocess precision to be a non-decreasing array 118 | for i in range(len(precision) - 2, -1, -1): 119 | precision[i] = np.maximum(precision[i], precision[i + 1]) 120 | 121 | indices = np.where(recall[1:] != recall[:-1])[0] + 1 122 | average_precision = np.sum( 123 | (recall[indices] - recall[indices - 1]) * precision[indices] 124 | ) 125 | return average_precision 126 | 127 | 128 | def compute_cor_loc( 129 | num_gt_imgs_per_class, num_images_correctly_detected_per_class 130 | ): 131 | """Compute CorLoc according to the definition in the following paper. 132 | 133 | https://www.robots.ox.ac.uk/~vgg/rg/papers/deselaers-eccv10.pdf 134 | 135 | Returns nans if there are no ground truth images for a class. 136 | 137 | Args: 138 | num_gt_imgs_per_class: 1D array, representing number of images containing 139 | at least one object instance of a particular class 140 | num_images_correctly_detected_per_class: 1D array, representing number of 141 | images that are correctly detected at least one object instance of a 142 | particular class 143 | 144 | Returns: 145 | corloc_per_class: A float numpy array represents the corloc score of each 146 | class 147 | """ 148 | # Divide by zero expected for classes with no gt examples. 149 | with np.errstate(divide="ignore", invalid="ignore"): 150 | return np.where( 151 | num_gt_imgs_per_class == 0, 152 | np.nan, 153 | num_images_correctly_detected_per_class / num_gt_imgs_per_class, 154 | ) 155 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/np_box_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """Numpy BoxList classes and functions.""" 17 | 18 | from __future__ import ( 19 | absolute_import, 20 | division, 21 | print_function, 22 | unicode_literals, 23 | ) 24 | import numpy as np 25 | 26 | 27 | class BoxList(object): 28 | """Box collection. 29 | 30 | BoxList represents a list of bounding boxes as numpy array, where each 31 | bounding box is represented as a row of 4 numbers, 32 | [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes within a 33 | given list correspond to a single image. 34 | 35 | Optionally, users can add additional related fields (such as 36 | objectness/classification scores). 37 | """ 38 | 39 | def __init__(self, data): 40 | """Constructs box collection. 41 | 42 | Args: 43 | data: a numpy array of shape [N, 4] representing box coordinates 44 | 45 | Raises: 46 | ValueError: if bbox data is not a numpy array 47 | ValueError: if invalid dimensions for bbox data 48 | """ 49 | if not isinstance(data, np.ndarray): 50 | raise ValueError("data must be a numpy array.") 51 | if len(data.shape) != 2 or data.shape[1] != 4: 52 | raise ValueError("Invalid dimensions for box data.") 53 | if data.dtype != np.float32 and data.dtype != np.float64: 54 | raise ValueError( 55 | "Invalid data type for box data: float is required." 56 | ) 57 | if not self._is_valid_boxes(data): 58 | raise ValueError( 59 | "Invalid box data. data must be a numpy array of " 60 | "N*[y_min, x_min, y_max, x_max]" 61 | ) 62 | self.data = {"boxes": data} 63 | 64 | def num_boxes(self): 65 | """Return number of boxes held in collections.""" 66 | return self.data["boxes"].shape[0] 67 | 68 | def get_extra_fields(self): 69 | """Return all non-box fields.""" 70 | return [k for k in self.data.keys() if k != "boxes"] 71 | 72 | def has_field(self, field): 73 | return field in self.data 74 | 75 | def add_field(self, field, field_data): 76 | """Add data to a specified field. 77 | 78 | Args: 79 | field: a string parameter used to speficy a related field to be accessed. 80 | field_data: a numpy array of [N, ...] representing the data associated 81 | with the field. 82 | Raises: 83 | ValueError: if the field is already exist or the dimension of the field 84 | data does not matches the number of boxes. 85 | """ 86 | if self.has_field(field): 87 | raise ValueError("Field " + field + "already exists") 88 | if len(field_data.shape) < 1 or field_data.shape[0] != self.num_boxes(): 89 | raise ValueError("Invalid dimensions for field data") 90 | self.data[field] = field_data 91 | 92 | def get(self): 93 | """Convenience function for accesssing box coordinates. 94 | 95 | Returns: 96 | a numpy array of shape [N, 4] representing box corners 97 | """ 98 | return self.get_field("boxes") 99 | 100 | def get_field(self, field): 101 | """Accesses data associated with the specified field in the box collection. 102 | 103 | Args: 104 | field: a string parameter used to speficy a related field to be accessed. 105 | 106 | Returns: 107 | a numpy 1-d array representing data of an associated field 108 | 109 | Raises: 110 | ValueError: if invalid field 111 | """ 112 | if not self.has_field(field): 113 | raise ValueError("field {} does not exist".format(field)) 114 | return self.data[field] 115 | 116 | def get_coordinates(self): 117 | """Get corner coordinates of boxes. 
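    Example (a minimal sketch with one made-up box in
    [y_min, x_min, y_max, x_max] order):

      >>> import numpy as np
      >>> boxes = BoxList(np.array([[0.25, 0.5, 0.75, 1.0]], dtype=np.float32))
      >>> [coord.tolist() for coord in boxes.get_coordinates()]
      [[0.25], [0.5], [0.75], [1.0]]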
118 | 119 | Returns: 120 | a list of 4 1-d numpy arrays [y_min, x_min, y_max, x_max] 121 | """ 122 | box_coordinates = self.get() 123 | y_min = box_coordinates[:, 0] 124 | x_min = box_coordinates[:, 1] 125 | y_max = box_coordinates[:, 2] 126 | x_max = box_coordinates[:, 3] 127 | return [y_min, x_min, y_max, x_max] 128 | 129 | def _is_valid_boxes(self, data): 130 | """Check whether data fullfills the format of N*[ymin, xmin, ymax, xmin]. 131 | 132 | Args: 133 | data: a numpy array of shape [N, 4] representing box coordinates 134 | 135 | Returns: 136 | a boolean indicating whether all ymax of boxes are equal or greater than 137 | ymin, and all xmax of boxes are equal or greater than xmin. 138 | """ 139 | if data.shape[0] > 0: 140 | for i in range(data.shape[0]): 141 | if data[i, 0] > data[i, 2] or data[i, 1] > data[i, 3]: 142 | return False 143 | return True 144 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/np_box_mask_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Numpy BoxMaskList classes and functions.""" 17 | 18 | from __future__ import ( 19 | absolute_import, 20 | division, 21 | print_function, 22 | unicode_literals, 23 | ) 24 | import numpy as np 25 | 26 | from . import np_box_list 27 | 28 | 29 | class BoxMaskList(np_box_list.BoxList): 30 | """Convenience wrapper for BoxList with masks. 31 | 32 | BoxMaskList extends the np_box_list.BoxList to contain masks as well. 33 | In particular, its constructor receives both boxes and masks. Note that the 34 | masks correspond to the full image. 35 | """ 36 | 37 | def __init__(self, box_data, mask_data): 38 | """Constructs box collection. 39 | 40 | Args: 41 | box_data: a numpy array of shape [N, 4] representing box coordinates 42 | mask_data: a numpy array of shape [N, height, width] representing masks 43 | with values are in {0,1}. The masks correspond to the full 44 | image. The height and the width will be equal to image height and width. 45 | 46 | Raises: 47 | ValueError: if bbox data is not a numpy array 48 | ValueError: if invalid dimensions for bbox data 49 | ValueError: if mask data is not a numpy array 50 | ValueError: if invalid dimension for mask data 51 | """ 52 | super(BoxMaskList, self).__init__(box_data) 53 | if not isinstance(mask_data, np.ndarray): 54 | raise ValueError("Mask data must be a numpy array.") 55 | if len(mask_data.shape) != 3: 56 | raise ValueError("Invalid dimensions for mask data.") 57 | if mask_data.dtype != np.uint8: 58 | raise ValueError( 59 | "Invalid data type for mask data: uint8 is required." 60 | ) 61 | if mask_data.shape[0] != box_data.shape[0]: 62 | raise ValueError( 63 | "There should be the same number of boxes and masks." 
64 | ) 65 | self.data["masks"] = mask_data 66 | 67 | def get_masks(self): 68 | """Convenience function for accessing masks. 69 | 70 | Returns: 71 | a numpy array of shape [N, height, width] representing masks 72 | """ 73 | return self.get_field("masks") 74 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/np_box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, 4] numpy arrays representing bounding boxes. 17 | 18 | Example box operations that are supported: 19 | * Areas: compute bounding box areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | from __future__ import ( 23 | absolute_import, 24 | division, 25 | print_function, 26 | unicode_literals, 27 | ) 28 | import numpy as np 29 | 30 | 31 | def area(boxes): 32 | """Computes area of boxes. 33 | 34 | Args: 35 | boxes: Numpy array with shape [N, 4] holding N boxes 36 | 37 | Returns: 38 | a numpy array with shape [N*1] representing box areas 39 | """ 40 | return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) 41 | 42 | 43 | def intersection(boxes1, boxes2): 44 | """Compute pairwise intersection areas between boxes. 45 | 46 | Args: 47 | boxes1: a numpy array with shape [N, 4] holding N boxes 48 | boxes2: a numpy array with shape [M, 4] holding M boxes 49 | 50 | Returns: 51 | a numpy array with shape [N*M] representing pairwise intersection area 52 | """ 53 | [y_min1, x_min1, y_max1, x_max1] = np.split(boxes1, 4, axis=1) 54 | [y_min2, x_min2, y_max2, x_max2] = np.split(boxes2, 4, axis=1) 55 | 56 | all_pairs_min_ymax = np.minimum(y_max1, np.transpose(y_max2)) 57 | all_pairs_max_ymin = np.maximum(y_min1, np.transpose(y_min2)) 58 | intersect_heights = np.maximum( 59 | np.zeros(all_pairs_max_ymin.shape), 60 | all_pairs_min_ymax - all_pairs_max_ymin, 61 | ) 62 | all_pairs_min_xmax = np.minimum(x_max1, np.transpose(x_max2)) 63 | all_pairs_max_xmin = np.maximum(x_min1, np.transpose(x_min2)) 64 | intersect_widths = np.maximum( 65 | np.zeros(all_pairs_max_xmin.shape), 66 | all_pairs_min_xmax - all_pairs_max_xmin, 67 | ) 68 | return intersect_heights * intersect_widths 69 | 70 | 71 | def iou(boxes1, boxes2): 72 | """Computes pairwise intersection-over-union between box collections. 73 | 74 | Args: 75 | boxes1: a numpy array with shape [N, 4] holding N boxes. 76 | boxes2: a numpy array with shape [M, 4] holding N boxes. 77 | 78 | Returns: 79 | a numpy array with shape [N, M] representing pairwise iou scores. 
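    Example (a minimal sketch with made-up boxes; the second box in `boxes2`
    is deliberately non-overlapping):

      >>> import numpy as np
      >>> boxes1 = np.array([[0.0, 0.0, 1.0, 1.0]], dtype=np.float32)
      >>> boxes2 = np.array(
      ...     [[0.0, 0.0, 1.0, 0.5], [2.0, 2.0, 3.0, 3.0]], dtype=np.float32
      ... )
      >>> iou(boxes1, boxes2).tolist()
      [[0.5, 0.0]]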
80 | """ 81 | intersect = intersection(boxes1, boxes2) 82 | area1 = area(boxes1) 83 | area2 = area(boxes2) 84 | union = ( 85 | np.expand_dims(area1, axis=1) 86 | + np.expand_dims(area2, axis=0) 87 | - intersect 88 | ) 89 | return intersect / union 90 | 91 | 92 | def ioa(boxes1, boxes2): 93 | """Computes pairwise intersection-over-area between box collections. 94 | 95 | Intersection-over-area (ioa) between two boxes box1 and box2 is defined as 96 | their intersection area over box2's area. Note that ioa is not symmetric, 97 | that is, IOA(box1, box2) != IOA(box2, box1). 98 | 99 | Args: 100 | boxes1: a numpy array with shape [N, 4] holding N boxes. 101 | boxes2: a numpy array with shape [M, 4] holding N boxes. 102 | 103 | Returns: 104 | a numpy array with shape [N, M] representing pairwise ioa scores. 105 | """ 106 | intersect = intersection(boxes1, boxes2) 107 | areas = np.expand_dims(area(boxes2), axis=0) 108 | return intersect / areas 109 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/np_mask_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Operations for [N, height, width] numpy arrays representing masks. 17 | 18 | Example mask operations that are supported: 19 | * Areas: compute mask areas 20 | * IOU: pairwise intersection-over-union scores 21 | """ 22 | from __future__ import ( 23 | absolute_import, 24 | division, 25 | print_function, 26 | unicode_literals, 27 | ) 28 | import numpy as np 29 | 30 | EPSILON = 1e-7 31 | 32 | 33 | def area(masks): 34 | """Computes area of masks. 35 | 36 | Args: 37 | masks: Numpy array with shape [N, height, width] holding N masks. Masks 38 | values are of type np.uint8 and values are in {0,1}. 39 | 40 | Returns: 41 | a numpy array with shape [N*1] representing mask areas. 42 | 43 | Raises: 44 | ValueError: If masks.dtype is not np.uint8 45 | """ 46 | if masks.dtype != np.uint8: 47 | raise ValueError("Masks type should be np.uint8") 48 | return np.sum(masks, axis=(1, 2), dtype=np.float32) 49 | 50 | 51 | def intersection(masks1, masks2): 52 | """Compute pairwise intersection areas between masks. 53 | 54 | Args: 55 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 56 | values are of type np.uint8 and values are in {0,1}. 57 | masks2: a numpy array with shape [M, height, width] holding M masks. Masks 58 | values are of type np.uint8 and values are in {0,1}. 59 | 60 | Returns: 61 | a numpy array with shape [N*M] representing pairwise intersection area. 62 | 63 | Raises: 64 | ValueError: If masks1 and masks2 are not of type np.uint8. 
65 | """ 66 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 67 | raise ValueError("masks1 and masks2 should be of type np.uint8") 68 | n = masks1.shape[0] 69 | m = masks2.shape[0] 70 | answer = np.zeros([n, m], dtype=np.float32) 71 | for i in np.arange(n): 72 | for j in np.arange(m): 73 | answer[i, j] = np.sum( 74 | np.minimum(masks1[i], masks2[j]), dtype=np.float32 75 | ) 76 | return answer 77 | 78 | 79 | def iou(masks1, masks2): 80 | """Computes pairwise intersection-over-union between mask collections. 81 | 82 | Args: 83 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 84 | values are of type np.uint8 and values are in {0,1}. 85 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 86 | values are of type np.uint8 and values are in {0,1}. 87 | 88 | Returns: 89 | a numpy array with shape [N, M] representing pairwise iou scores. 90 | 91 | Raises: 92 | ValueError: If masks1 and masks2 are not of type np.uint8. 93 | """ 94 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 95 | raise ValueError("masks1 and masks2 should be of type np.uint8") 96 | intersect = intersection(masks1, masks2) 97 | area1 = area(masks1) 98 | area2 = area(masks2) 99 | union = ( 100 | np.expand_dims(area1, axis=1) 101 | + np.expand_dims(area2, axis=0) 102 | - intersect 103 | ) 104 | return intersect / np.maximum(union, EPSILON) 105 | 106 | 107 | def ioa(masks1, masks2): 108 | """Computes pairwise intersection-over-area between box collections. 109 | 110 | Intersection-over-area (ioa) between two masks, mask1 and mask2 is defined as 111 | their intersection area over mask2's area. Note that ioa is not symmetric, 112 | that is, IOA(mask1, mask2) != IOA(mask2, mask1). 113 | 114 | Args: 115 | masks1: a numpy array with shape [N, height, width] holding N masks. Masks 116 | values are of type np.uint8 and values are in {0,1}. 117 | masks2: a numpy array with shape [M, height, width] holding N masks. Masks 118 | values are of type np.uint8 and values are in {0,1}. 119 | 120 | Returns: 121 | a numpy array with shape [N, M] representing pairwise ioa scores. 122 | 123 | Raises: 124 | ValueError: If masks1 and masks2 are not of type np.uint8. 125 | """ 126 | if masks1.dtype != np.uint8 or masks2.dtype != np.uint8: 127 | raise ValueError("masks1 and masks2 should be of type np.uint8") 128 | intersect = intersection(masks1, masks2) 129 | areas = np.expand_dims(area(masks2), axis=0) 130 | return intersect / (areas + EPSILON) 131 | -------------------------------------------------------------------------------- /timesformer/utils/ava_evaluation/standard_fields.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Contains classes specifying naming conventions used for object detection. 
17 | 18 | 19 | Specifies: 20 | InputDataFields: standard fields used by reader/preprocessor/batcher. 21 | DetectionResultFields: standard fields returned by object detector. 22 | BoxListFields: standard field used by BoxList 23 | TfExampleFields: standard fields for tf-example data format (go/tf-example). 24 | """ 25 | 26 | 27 | from __future__ import ( 28 | absolute_import, 29 | division, 30 | print_function, 31 | unicode_literals, 32 | ) 33 | 34 | 35 | class InputDataFields(object): 36 | """Names for the input tensors. 37 | 38 | Holds the standard data field names to use for identifying input tensors. This 39 | should be used by the decoder to identify keys for the returned tensor_dict 40 | containing input tensors. And it should be used by the model to identify the 41 | tensors it needs. 42 | 43 | Attributes: 44 | image: image. 45 | original_image: image in the original input size. 46 | key: unique key corresponding to image. 47 | source_id: source of the original image. 48 | filename: original filename of the dataset (without common path). 49 | groundtruth_image_classes: image-level class labels. 50 | groundtruth_boxes: coordinates of the ground truth boxes in the image. 51 | groundtruth_classes: box-level class labels. 52 | groundtruth_label_types: box-level label types (e.g. explicit negative). 53 | groundtruth_is_crowd: [DEPRECATED, use groundtruth_group_of instead] 54 | is the groundtruth a single object or a crowd. 55 | groundtruth_area: area of a groundtruth segment. 56 | groundtruth_difficult: is a `difficult` object 57 | groundtruth_group_of: is a `group_of` objects, e.g. multiple objects of the 58 | same class, forming a connected group, where instances are heavily 59 | occluding each other. 60 | proposal_boxes: coordinates of object proposal boxes. 61 | proposal_objectness: objectness score of each proposal. 62 | groundtruth_instance_masks: ground truth instance masks. 63 | groundtruth_instance_boundaries: ground truth instance boundaries. 64 | groundtruth_instance_classes: instance mask-level class labels. 65 | groundtruth_keypoints: ground truth keypoints. 66 | groundtruth_keypoint_visibilities: ground truth keypoint visibilities. 67 | groundtruth_label_scores: groundtruth label scores. 68 | groundtruth_weights: groundtruth weight factor for bounding boxes. 69 | num_groundtruth_boxes: number of groundtruth boxes. 70 | true_image_shapes: true shapes of images in the resized images, as resized 71 | images can be padded with zeros. 
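    Example (illustrative; each attribute is simply the string key used to
    index a tensor_dict):

      >>> InputDataFields.groundtruth_boxes
      'groundtruth_boxes'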
72 | """ 73 | 74 | image = "image" 75 | original_image = "original_image" 76 | key = "key" 77 | source_id = "source_id" 78 | filename = "filename" 79 | groundtruth_image_classes = "groundtruth_image_classes" 80 | groundtruth_boxes = "groundtruth_boxes" 81 | groundtruth_classes = "groundtruth_classes" 82 | groundtruth_label_types = "groundtruth_label_types" 83 | groundtruth_is_crowd = "groundtruth_is_crowd" 84 | groundtruth_area = "groundtruth_area" 85 | groundtruth_difficult = "groundtruth_difficult" 86 | groundtruth_group_of = "groundtruth_group_of" 87 | proposal_boxes = "proposal_boxes" 88 | proposal_objectness = "proposal_objectness" 89 | groundtruth_instance_masks = "groundtruth_instance_masks" 90 | groundtruth_instance_boundaries = "groundtruth_instance_boundaries" 91 | groundtruth_instance_classes = "groundtruth_instance_classes" 92 | groundtruth_keypoints = "groundtruth_keypoints" 93 | groundtruth_keypoint_visibilities = "groundtruth_keypoint_visibilities" 94 | groundtruth_label_scores = "groundtruth_label_scores" 95 | groundtruth_weights = "groundtruth_weights" 96 | num_groundtruth_boxes = "num_groundtruth_boxes" 97 | true_image_shape = "true_image_shape" 98 | 99 | 100 | class DetectionResultFields(object): 101 | """Naming conventions for storing the output of the detector. 102 | 103 | Attributes: 104 | source_id: source of the original image. 105 | key: unique key corresponding to image. 106 | detection_boxes: coordinates of the detection boxes in the image. 107 | detection_scores: detection scores for the detection boxes in the image. 108 | detection_classes: detection-level class labels. 109 | detection_masks: contains a segmentation mask for each detection box. 110 | detection_boundaries: contains an object boundary for each detection box. 111 | detection_keypoints: contains detection keypoints for each detection box. 112 | num_detections: number of detections in the batch. 113 | """ 114 | 115 | source_id = "source_id" 116 | key = "key" 117 | detection_boxes = "detection_boxes" 118 | detection_scores = "detection_scores" 119 | detection_classes = "detection_classes" 120 | detection_masks = "detection_masks" 121 | detection_boundaries = "detection_boundaries" 122 | detection_keypoints = "detection_keypoints" 123 | num_detections = "num_detections" 124 | 125 | 126 | class BoxListFields(object): 127 | """Naming conventions for BoxLists. 128 | 129 | Attributes: 130 | boxes: bounding box coordinates. 131 | classes: classes per bounding box. 132 | scores: scores per bounding box. 133 | weights: sample weights per bounding box. 134 | objectness: objectness score per bounding box. 135 | masks: masks per bounding box. 136 | boundaries: boundaries per bounding box. 137 | keypoints: keypoints per bounding box. 138 | keypoint_heatmaps: keypoint heatmaps per bounding box. 139 | """ 140 | 141 | boxes = "boxes" 142 | classes = "classes" 143 | scores = "scores" 144 | weights = "weights" 145 | objectness = "objectness" 146 | masks = "masks" 147 | boundaries = "boundaries" 148 | keypoints = "keypoints" 149 | keypoint_heatmaps = "keypoint_heatmaps" 150 | 151 | 152 | class TfExampleFields(object): 153 | """TF-example proto feature names for object detection. 154 | 155 | Holds the standard feature names to load from an Example proto for object 156 | detection. 157 | 158 | Attributes: 159 | image_encoded: JPEG encoded string 160 | image_format: image format, e.g. "JPEG" 161 | filename: filename 162 | channels: number of channels of image 163 | colorspace: colorspace, e.g. 
"RGB" 164 | height: height of image in pixels, e.g. 462 165 | width: width of image in pixels, e.g. 581 166 | source_id: original source of the image 167 | object_class_text: labels in text format, e.g. ["person", "cat"] 168 | object_class_label: labels in numbers, e.g. [16, 8] 169 | object_bbox_xmin: xmin coordinates of groundtruth box, e.g. 10, 30 170 | object_bbox_xmax: xmax coordinates of groundtruth box, e.g. 50, 40 171 | object_bbox_ymin: ymin coordinates of groundtruth box, e.g. 40, 50 172 | object_bbox_ymax: ymax coordinates of groundtruth box, e.g. 80, 70 173 | object_view: viewpoint of object, e.g. ["frontal", "left"] 174 | object_truncated: is object truncated, e.g. [true, false] 175 | object_occluded: is object occluded, e.g. [true, false] 176 | object_difficult: is object difficult, e.g. [true, false] 177 | object_group_of: is object a single object or a group of objects 178 | object_depiction: is object a depiction 179 | object_is_crowd: [DEPRECATED, use object_group_of instead] 180 | is the object a single object or a crowd 181 | object_segment_area: the area of the segment. 182 | object_weight: a weight factor for the object's bounding box. 183 | instance_masks: instance segmentation masks. 184 | instance_boundaries: instance boundaries. 185 | instance_classes: Classes for each instance segmentation mask. 186 | detection_class_label: class label in numbers. 187 | detection_bbox_ymin: ymin coordinates of a detection box. 188 | detection_bbox_xmin: xmin coordinates of a detection box. 189 | detection_bbox_ymax: ymax coordinates of a detection box. 190 | detection_bbox_xmax: xmax coordinates of a detection box. 191 | detection_score: detection score for the class label and box. 192 | """ 193 | 194 | image_encoded = "image/encoded" 195 | image_format = "image/format" # format is reserved keyword 196 | filename = "image/filename" 197 | channels = "image/channels" 198 | colorspace = "image/colorspace" 199 | height = "image/height" 200 | width = "image/width" 201 | source_id = "image/source_id" 202 | object_class_text = "image/object/class/text" 203 | object_class_label = "image/object/class/label" 204 | object_bbox_ymin = "image/object/bbox/ymin" 205 | object_bbox_xmin = "image/object/bbox/xmin" 206 | object_bbox_ymax = "image/object/bbox/ymax" 207 | object_bbox_xmax = "image/object/bbox/xmax" 208 | object_view = "image/object/view" 209 | object_truncated = "image/object/truncated" 210 | object_occluded = "image/object/occluded" 211 | object_difficult = "image/object/difficult" 212 | object_group_of = "image/object/group_of" 213 | object_depiction = "image/object/depiction" 214 | object_is_crowd = "image/object/is_crowd" 215 | object_segment_area = "image/object/segment/area" 216 | object_weight = "image/object/weight" 217 | instance_masks = "image/segmentation/object" 218 | instance_boundaries = "image/boundaries/object" 219 | instance_classes = "image/segmentation/object/class" 220 | detection_class_label = "image/detection/label" 221 | detection_bbox_ymin = "image/detection/bbox/ymin" 222 | detection_bbox_xmin = "image/detection/bbox/xmin" 223 | detection_bbox_ymax = "image/detection/bbox/ymax" 224 | detection_bbox_xmax = "image/detection/bbox/xmax" 225 | detection_score = "image/detection/score" 226 | -------------------------------------------------------------------------------- /timesformer/utils/benchmark.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | """ 3 | Functions for benchmarks. 4 | """ 5 | 6 | import numpy as np 7 | import pprint 8 | import torch 9 | import tqdm 10 | from fvcore.common.timer import Timer 11 | 12 | import timesformer.utils.logging as logging 13 | import timesformer.utils.misc as misc 14 | from timesformer.datasets import loader 15 | from timesformer.utils.env import setup_environment 16 | 17 | logger = logging.get_logger(__name__) 18 | 19 | 20 | def benchmark_data_loading(cfg): 21 | """ 22 | Benchmark the speed of data loading in PySlowFast. 23 | Args: 24 | 25 | cfg (CfgNode): configs. Details can be found in 26 | lib/config/defaults.py 27 | """ 28 | # Set up environment. 29 | setup_environment() 30 | # Set random seed from configs. 31 | np.random.seed(cfg.RNG_SEED) 32 | torch.manual_seed(cfg.RNG_SEED) 33 | 34 | # Setup logging format. 35 | logging.setup_logging(cfg.OUTPUT_DIR) 36 | 37 | # Print config. 38 | logger.info("Benchmark data loading with config:") 39 | logger.info(pprint.pformat(cfg)) 40 | 41 | timer = Timer() 42 | dataloader = loader.construct_loader(cfg, "train") 43 | logger.info( 44 | "Initialize loader using {:.2f} seconds.".format(timer.seconds()) 45 | ) 46 | # Total batch size across different machines. 47 | batch_size = cfg.TRAIN.BATCH_SIZE * cfg.NUM_SHARDS 48 | log_period = cfg.BENCHMARK.LOG_PERIOD 49 | epoch_times = [] 50 | # Test for a few epochs. 51 | for cur_epoch in range(cfg.BENCHMARK.NUM_EPOCHS): 52 | timer = Timer() 53 | timer_epoch = Timer() 54 | iter_times = [] 55 | if cfg.BENCHMARK.SHUFFLE: 56 | loader.shuffle_dataset(dataloader, cur_epoch) 57 | for cur_iter, _ in enumerate(tqdm.tqdm(dataloader)): 58 | if cur_iter > 0 and cur_iter % log_period == 0: 59 | iter_times.append(timer.seconds()) 60 | ram_usage, ram_total = misc.cpu_mem_usage() 61 | logger.info( 62 | "Epoch {}: {} iters ({} videos) in {:.2f} seconds. " 63 | "RAM Usage: {:.2f}/{:.2f} GB.".format( 64 | cur_epoch, 65 | log_period, 66 | log_period * batch_size, 67 | iter_times[-1], 68 | ram_usage, 69 | ram_total, 70 | ) 71 | ) 72 | timer.reset() 73 | epoch_times.append(timer_epoch.seconds()) 74 | ram_usage, ram_total = misc.cpu_mem_usage() 75 | logger.info( 76 | "Epoch {}: in total {} iters ({} videos) in {:.2f} seconds. " 77 | "RAM Usage: {:.2f}/{:.2f} GB.".format( 78 | cur_epoch, 79 | len(dataloader), 80 | len(dataloader) * batch_size, 81 | epoch_times[-1], 82 | ram_usage, 83 | ram_total, 84 | ) 85 | ) 86 | logger.info( 87 | "Epoch {}: on average every {} iters ({} videos) take {:.2f}/{:.2f} " 88 | "(avg/std) seconds.".format( 89 | cur_epoch, 90 | log_period, 91 | log_period * batch_size, 92 | np.mean(iter_times), 93 | np.std(iter_times), 94 | ) 95 | ) 96 | logger.info( 97 | "On average every epoch ({} videos) takes {:.2f}/{:.2f} " 98 | "(avg/std) seconds.".format( 99 | len(dataloader) * batch_size, 100 | np.mean(epoch_times), 101 | np.std(epoch_times), 102 | ) 103 | ) 104 | -------------------------------------------------------------------------------- /timesformer/utils/bn_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """bn helper.""" 4 | 5 | import itertools 6 | import torch 7 | 8 | 9 | @torch.no_grad() 10 | def compute_and_update_bn_stats(model, data_loader, num_batches=200): 11 | """ 12 | Compute and update the batch norm stats to make it more precise. 
During 13 | training both bn stats and the weight are changing after every iteration, 14 | so the bn can not precisely reflect the latest stats of the current model. 15 | Here the bn stats is recomputed without change of weights, to make the 16 | running mean and running var more precise. 17 | Args: 18 | model (model): the model using to compute and update the bn stats. 19 | data_loader (dataloader): dataloader using to provide inputs. 20 | num_batches (int): running iterations using to compute the stats. 21 | """ 22 | 23 | # Prepares all the bn layers. 24 | bn_layers = [ 25 | m 26 | for m in model.modules() 27 | if any( 28 | ( 29 | isinstance(m, bn_type) 30 | for bn_type in ( 31 | torch.nn.BatchNorm1d, 32 | torch.nn.BatchNorm2d, 33 | torch.nn.BatchNorm3d, 34 | ) 35 | ) 36 | ) 37 | ] 38 | 39 | # In order to make the running stats only reflect the current batch, the 40 | # momentum is disabled. 41 | # bn.running_mean = (1 - momentum) * bn.running_mean + momentum * batch_mean 42 | # Setting the momentum to 1.0 to compute the stats without momentum. 43 | momentum_actual = [bn.momentum for bn in bn_layers] 44 | for bn in bn_layers: 45 | bn.momentum = 1.0 46 | 47 | # Calculates the running iterations for precise stats computation. 48 | running_mean = [torch.zeros_like(bn.running_mean) for bn in bn_layers] 49 | running_square_mean = [torch.zeros_like(bn.running_var) for bn in bn_layers] 50 | 51 | for ind, (inputs, _, _) in enumerate( 52 | itertools.islice(data_loader, num_batches) 53 | ): 54 | # Forwards the model to update the bn stats. 55 | if isinstance(inputs, (list,)): 56 | for i in range(len(inputs)): 57 | inputs[i] = inputs[i].float().cuda(non_blocking=True) 58 | else: 59 | inputs = inputs.cuda(non_blocking=True) 60 | model(inputs) 61 | 62 | for i, bn in enumerate(bn_layers): 63 | # Accumulates the bn stats. 64 | running_mean[i] += (bn.running_mean - running_mean[i]) / (ind + 1) 65 | # $E(x^2) = Var(x) + E(x)^2$. 66 | cur_square_mean = bn.running_var + bn.running_mean ** 2 67 | running_square_mean[i] += ( 68 | cur_square_mean - running_square_mean[i] 69 | ) / (ind + 1) 70 | 71 | for i, bn in enumerate(bn_layers): 72 | bn.running_mean = running_mean[i] 73 | # Var(x) = $E(x^2) - E(x)^2$. 74 | bn.running_var = running_square_mean[i] - bn.running_mean ** 2 75 | # Sets the precise bn stats. 76 | bn.momentum = momentum_actual[i] 77 | -------------------------------------------------------------------------------- /timesformer/utils/c2_model_loading.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Caffe2 to PyTorch checkpoint name converting utility.""" 4 | 5 | import re 6 | 7 | 8 | def get_name_convert_func(): 9 | """ 10 | Get the function to convert Caffe2 layer names to PyTorch layer names. 11 | Returns: 12 | (func): function to convert parameter name from Caffe2 format to PyTorch 13 | format. 
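    Example (a minimal sketch; the Caffe2 name below is chosen to match the
    patterns documented in the comments inside `pairs`):

      >>> convert = get_name_convert_func()
      >>> convert("res_conv1_bn_s")
      's1.pathway0_stem.bn.weight'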
14 | """ 15 | pairs = [ 16 | # ------------------------------------------------------------ 17 | # 'nonlocal_conv3_1_theta_w' -> 's3.pathway0_nonlocal3.conv_g.weight' 18 | [ 19 | r"^nonlocal_conv([0-9]+)_([0-9]+)_(.*)", 20 | r"s\1.pathway0_nonlocal\2_\3", 21 | ], 22 | # 'theta' -> 'conv_theta' 23 | [r"^(.*)_nonlocal([0-9]+)_(theta)(.*)", r"\1_nonlocal\2.conv_\3\4"], 24 | # 'g' -> 'conv_g' 25 | [r"^(.*)_nonlocal([0-9]+)_(g)(.*)", r"\1_nonlocal\2.conv_\3\4"], 26 | # 'phi' -> 'conv_phi' 27 | [r"^(.*)_nonlocal([0-9]+)_(phi)(.*)", r"\1_nonlocal\2.conv_\3\4"], 28 | # 'out' -> 'conv_out' 29 | [r"^(.*)_nonlocal([0-9]+)_(out)(.*)", r"\1_nonlocal\2.conv_\3\4"], 30 | # 'nonlocal_conv4_5_bn_s' -> 's4.pathway0_nonlocal3.bn.weight' 31 | [r"^(.*)_nonlocal([0-9]+)_(bn)_(.*)", r"\1_nonlocal\2.\3.\4"], 32 | # ------------------------------------------------------------ 33 | # 't_pool1_subsample_bn' -> 's1_fuse.conv_f2s.bn.running_mean' 34 | [r"^t_pool1_subsample_bn_(.*)", r"s1_fuse.bn.\1"], 35 | # 't_pool1_subsample' -> 's1_fuse.conv_f2s' 36 | [r"^t_pool1_subsample_(.*)", r"s1_fuse.conv_f2s.\1"], 37 | # 't_res4_5_branch2c_bn_subsample_bn_rm' -> 's4_fuse.conv_f2s.bias' 38 | [ 39 | r"^t_res([0-9]+)_([0-9]+)_branch2c_bn_subsample_bn_(.*)", 40 | r"s\1_fuse.bn.\3", 41 | ], 42 | # 't_pool1_subsample' -> 's1_fuse.conv_f2s' 43 | [ 44 | r"^t_res([0-9]+)_([0-9]+)_branch2c_bn_subsample_(.*)", 45 | r"s\1_fuse.conv_f2s.\3", 46 | ], 47 | # ------------------------------------------------------------ 48 | # 'res4_4_branch_2c_bn_b' -> 's4.pathway0_res4.branch2.c_bn_b' 49 | [ 50 | r"^res([0-9]+)_([0-9]+)_branch([0-9]+)([a-z])_(.*)", 51 | r"s\1.pathway0_res\2.branch\3.\4_\5", 52 | ], 53 | # 'res_conv1_bn_' -> 's1.pathway0_stem.bn.' 54 | [r"^res_conv1_bn_(.*)", r"s1.pathway0_stem.bn.\1"], 55 | # 'conv1_xy_w_momentum' -> 's1.pathway0_stem.conv_xy.' 56 | [r"^conv1_xy(.*)", r"s1.pathway0_stem.conv_xy\1"], 57 | # 'conv1_w_momentum' -> 's1.pathway0_stem.conv.' 58 | [r"^conv1_(.*)", r"s1.pathway0_stem.conv.\1"], 59 | # 'res4_0_branch1_w' -> 'S4.pathway0_res0.branch1.weight' 60 | [ 61 | r"^res([0-9]+)_([0-9]+)_branch([0-9]+)_(.*)", 62 | r"s\1.pathway0_res\2.branch\3_\4", 63 | ], 64 | # 'res_conv1_' -> 's1.pathway0_stem.conv.' 65 | [r"^res_conv1_(.*)", r"s1.pathway0_stem.conv.\1"], 66 | # ------------------------------------------------------------ 67 | # 'res4_4_branch_2c_bn_b' -> 's4.pathway0_res4.branch2.c_bn_b' 68 | [ 69 | r"^t_res([0-9]+)_([0-9]+)_branch([0-9]+)([a-z])_(.*)", 70 | r"s\1.pathway1_res\2.branch\3.\4_\5", 71 | ], 72 | # 'res_conv1_bn_' -> 's1.pathway0_stem.bn.' 73 | [r"^t_res_conv1_bn_(.*)", r"s1.pathway1_stem.bn.\1"], 74 | # 'conv1_w_momentum' -> 's1.pathway0_stem.conv.' 75 | [r"^t_conv1_(.*)", r"s1.pathway1_stem.conv.\1"], 76 | # 'res4_0_branch1_w' -> 'S4.pathway0_res0.branch1.weight' 77 | [ 78 | r"^t_res([0-9]+)_([0-9]+)_branch([0-9]+)_(.*)", 79 | r"s\1.pathway1_res\2.branch\3_\4", 80 | ], 81 | # 'res_conv1_' -> 's1.pathway0_stem.conv.' 82 | [r"^t_res_conv1_(.*)", r"s1.pathway1_stem.conv.\1"], 83 | # ------------------------------------------------------------ 84 | # pred_ -> head.projection. 85 | [r"pred_(.*)", r"head.projection.\1"], 86 | # '.b_bn_fc' -> '.se.fc' 87 | [r"(.*)b_bn_fc(.*)", r"\1se.fc\2"], 88 | # conv_5 -> head.conv_5. 89 | [r"conv_5(.*)", r"head.conv_5\1"], 90 | # conv_5 -> head.conv_5. 
91 | [r"lin_5(.*)", r"head.lin_5\1"], 92 | # '.bn_b' -> '.weight' 93 | [r"(.*)bn.b\Z", r"\1bn.bias"], 94 | # '.bn_s' -> '.weight' 95 | [r"(.*)bn.s\Z", r"\1bn.weight"], 96 | # '_bn_rm' -> '.running_mean' 97 | [r"(.*)bn.rm\Z", r"\1bn.running_mean"], 98 | # '_bn_riv' -> '.running_var' 99 | [r"(.*)bn.riv\Z", r"\1bn.running_var"], 100 | # '_b' -> '.bias' 101 | [r"(.*)[\._]b\Z", r"\1.bias"], 102 | # '_w' -> '.weight' 103 | [r"(.*)[\._]w\Z", r"\1.weight"], 104 | ] 105 | 106 | def convert_caffe2_name_to_pytorch(caffe2_layer_name): 107 | """ 108 | Convert the caffe2_layer_name to pytorch format by apply the list of 109 | regular expressions. 110 | Args: 111 | caffe2_layer_name (str): caffe2 layer name. 112 | Returns: 113 | (str): pytorch layer name. 114 | """ 115 | for source, dest in pairs: 116 | caffe2_layer_name = re.sub(source, dest, caffe2_layer_name) 117 | return caffe2_layer_name 118 | 119 | return convert_caffe2_name_to_pytorch 120 | -------------------------------------------------------------------------------- /timesformer/utils/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Distributed helpers.""" 4 | 5 | import functools 6 | import logging 7 | import pickle 8 | import torch 9 | import torch.distributed as dist 10 | 11 | _LOCAL_PROCESS_GROUP = None 12 | 13 | 14 | def all_gather(tensors): 15 | """ 16 | All gathers the provided tensors from all processes across machines. 17 | Args: 18 | tensors (list): tensors to perform all gather across all processes in 19 | all machines. 20 | """ 21 | 22 | gather_list = [] 23 | output_tensor = [] 24 | world_size = dist.get_world_size() 25 | for tensor in tensors: 26 | tensor_placeholder = [ 27 | torch.ones_like(tensor) for _ in range(world_size) 28 | ] 29 | dist.all_gather(tensor_placeholder, tensor, async_op=False) 30 | gather_list.append(tensor_placeholder) 31 | for gathered_tensor in gather_list: 32 | output_tensor.append(torch.cat(gathered_tensor, dim=0)) 33 | return output_tensor 34 | 35 | 36 | def all_reduce(tensors, average=True): 37 | """ 38 | All reduce the provided tensors from all processes across machines. 39 | Args: 40 | tensors (list): tensors to perform all reduce across all processes in 41 | all machines. 42 | average (bool): scales the reduced tensor by the number of overall 43 | processes across all machines. 44 | """ 45 | 46 | for tensor in tensors: 47 | dist.all_reduce(tensor, async_op=False) 48 | if average: 49 | world_size = dist.get_world_size() 50 | for tensor in tensors: 51 | tensor.mul_(1.0 / world_size) 52 | return tensors 53 | 54 | 55 | def init_process_group( 56 | local_rank, 57 | local_world_size, 58 | shard_id, 59 | num_shards, 60 | init_method, 61 | dist_backend="nccl", 62 | ): 63 | """ 64 | Initializes the default process group. 65 | Args: 66 | local_rank (int): the rank on the current local machine. 67 | local_world_size (int): the world size (number of processes running) on 68 | the current local machine. 69 | shard_id (int): the shard index (machine rank) of the current machine. 70 | num_shards (int): number of shards for distributed training. 71 | init_method (string): supporting three different methods for 72 | initializing process groups: 73 | "file": use shared file system to initialize the groups across 74 | different processes. 75 | "tcp": use tcp address to initialize the groups across different 76 | dist_backend (string): backend to use for distributed training. 
Options 77 | includes gloo, mpi and nccl, the details can be found here: 78 | https://pytorch.org/docs/stable/distributed.html 79 | """ 80 | # Sets the GPU to use. 81 | torch.cuda.set_device(local_rank) 82 | # Initialize the process group. 83 | proc_rank = local_rank + shard_id * local_world_size 84 | world_size = local_world_size * num_shards 85 | dist.init_process_group( 86 | backend=dist_backend, 87 | init_method=init_method, 88 | world_size=world_size, 89 | rank=proc_rank, 90 | ) 91 | 92 | 93 | def is_master_proc(num_gpus=8): 94 | """ 95 | Determines if the current process is the master process. 96 | """ 97 | if torch.distributed.is_initialized(): 98 | return dist.get_rank() % num_gpus == 0 99 | else: 100 | return True 101 | 102 | 103 | def is_root_proc(): 104 | """ 105 | Determines if the current process is the root process. 106 | """ 107 | if torch.distributed.is_initialized(): 108 | return dist.get_rank() == 0 109 | else: 110 | return True 111 | 112 | 113 | def get_world_size(): 114 | """ 115 | Get the size of the world. 116 | """ 117 | if not dist.is_available(): 118 | return 1 119 | if not dist.is_initialized(): 120 | return 1 121 | return dist.get_world_size() 122 | 123 | 124 | def get_rank(): 125 | """ 126 | Get the rank of the current process. 127 | """ 128 | if not dist.is_available(): 129 | return 0 130 | if not dist.is_initialized(): 131 | return 0 132 | return dist.get_rank() 133 | 134 | 135 | def synchronize(): 136 | """ 137 | Helper function to synchronize (barrier) among all processes when 138 | using distributed training 139 | """ 140 | if not dist.is_available(): 141 | return 142 | if not dist.is_initialized(): 143 | return 144 | world_size = dist.get_world_size() 145 | if world_size == 1: 146 | return 147 | dist.barrier() 148 | 149 | 150 | @functools.lru_cache() 151 | def _get_global_gloo_group(): 152 | """ 153 | Return a process group based on gloo backend, containing all the ranks 154 | The result is cached. 155 | Returns: 156 | (group): pytorch dist group. 157 | """ 158 | if dist.get_backend() == "nccl": 159 | return dist.new_group(backend="gloo") 160 | else: 161 | return dist.group.WORLD 162 | 163 | 164 | def _serialize_to_tensor(data, group): 165 | """ 166 | Seriialize the tensor to ByteTensor. Note that only `gloo` and `nccl` 167 | backend is supported. 168 | Args: 169 | data (data): data to be serialized. 170 | group (group): pytorch dist group. 171 | Returns: 172 | tensor (ByteTensor): tensor that serialized. 173 | """ 174 | 175 | backend = dist.get_backend(group) 176 | assert backend in ["gloo", "nccl"] 177 | device = torch.device("cpu" if backend == "gloo" else "cuda") 178 | 179 | buffer = pickle.dumps(data) 180 | if len(buffer) > 1024 ** 3: 181 | logger = logging.getLogger(__name__) 182 | logger.warning( 183 | "Rank {} trying to all-gather {:.2f} GB of data on device {}".format( 184 | get_rank(), len(buffer) / (1024 ** 3), device 185 | ) 186 | ) 187 | storage = torch.ByteStorage.from_buffer(buffer) 188 | tensor = torch.ByteTensor(storage).to(device=device) 189 | return tensor 190 | 191 | 192 | def _pad_to_largest_tensor(tensor, group): 193 | """ 194 | Padding all the tensors from different GPUs to the largest ones. 195 | Args: 196 | tensor (tensor): tensor to pad. 197 | group (group): pytorch dist group. 
198 | Returns: 199 | list[int]: size of the tensor, on each rank 200 | Tensor: padded tensor that has the max size 201 | """ 202 | world_size = dist.get_world_size(group=group) 203 | assert ( 204 | world_size >= 1 205 | ), "comm.gather/all_gather must be called from ranks within the given group!" 206 | local_size = torch.tensor( 207 | [tensor.numel()], dtype=torch.int64, device=tensor.device 208 | ) 209 | size_list = [ 210 | torch.zeros([1], dtype=torch.int64, device=tensor.device) 211 | for _ in range(world_size) 212 | ] 213 | dist.all_gather(size_list, local_size, group=group) 214 | size_list = [int(size.item()) for size in size_list] 215 | 216 | max_size = max(size_list) 217 | 218 | # we pad the tensor because torch all_gather does not support 219 | # gathering tensors of different shapes 220 | if local_size != max_size: 221 | padding = torch.zeros( 222 | (max_size - local_size,), dtype=torch.uint8, device=tensor.device 223 | ) 224 | tensor = torch.cat((tensor, padding), dim=0) 225 | return size_list, tensor 226 | 227 | 228 | def all_gather_unaligned(data, group=None): 229 | """ 230 | Run all_gather on arbitrary picklable data (not necessarily tensors). 231 | 232 | Args: 233 | data: any picklable object 234 | group: a torch process group. By default, will use a group which 235 | contains all ranks on gloo backend. 236 | 237 | Returns: 238 | list[data]: list of data gathered from each rank 239 | """ 240 | if get_world_size() == 1: 241 | return [data] 242 | if group is None: 243 | group = _get_global_gloo_group() 244 | if dist.get_world_size(group) == 1: 245 | return [data] 246 | 247 | tensor = _serialize_to_tensor(data, group) 248 | 249 | size_list, tensor = _pad_to_largest_tensor(tensor, group) 250 | max_size = max(size_list) 251 | 252 | # receiving Tensor from all ranks 253 | tensor_list = [ 254 | torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) 255 | for _ in size_list 256 | ] 257 | dist.all_gather(tensor_list, tensor, group=group) 258 | 259 | data_list = [] 260 | for size, tensor in zip(size_list, tensor_list): 261 | buffer = tensor.cpu().numpy().tobytes()[:size] 262 | data_list.append(pickle.loads(buffer)) 263 | 264 | return data_list 265 | 266 | 267 | def init_distributed_training(cfg): 268 | """ 269 | Initialize variables needed for distributed training. 270 | """ 271 | if cfg.NUM_GPUS <= 1: 272 | return 273 | num_gpus_per_machine = cfg.NUM_GPUS 274 | num_machines = dist.get_world_size() // num_gpus_per_machine 275 | for i in range(num_machines): 276 | ranks_on_i = list( 277 | range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine) 278 | ) 279 | pg = dist.new_group(ranks_on_i) 280 | if i == cfg.SHARD_ID: 281 | global _LOCAL_PROCESS_GROUP 282 | _LOCAL_PROCESS_GROUP = pg 283 | 284 | 285 | def get_local_size() -> int: 286 | """ 287 | Returns: 288 | The size of the per-machine process group, 289 | i.e. the number of processes per machine. 290 | """ 291 | if not dist.is_available(): 292 | return 1 293 | if not dist.is_initialized(): 294 | return 1 295 | return dist.get_world_size(group=_LOCAL_PROCESS_GROUP) 296 | 297 | 298 | def get_local_rank() -> int: 299 | """ 300 | Returns: 301 | The rank of the current process within the local (per-machine) process group. 
302 | """ 303 | if not dist.is_available(): 304 | return 0 305 | if not dist.is_initialized(): 306 | return 0 307 | assert _LOCAL_PROCESS_GROUP is not None 308 | return dist.get_rank(group=_LOCAL_PROCESS_GROUP) 309 | -------------------------------------------------------------------------------- /timesformer/utils/env.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Set up Environment.""" 4 | 5 | import timesformer.utils.logging as logging 6 | 7 | _ENV_SETUP_DONE = False 8 | 9 | 10 | def setup_environment(): 11 | global _ENV_SETUP_DONE 12 | if _ENV_SETUP_DONE: 13 | return 14 | _ENV_SETUP_DONE = True 15 | -------------------------------------------------------------------------------- /timesformer/utils/logging.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Logging.""" 4 | 5 | import atexit 6 | import builtins 7 | import decimal 8 | import functools 9 | import logging 10 | import os 11 | import sys 12 | import simplejson 13 | from fvcore.common.file_io import PathManager 14 | 15 | import timesformer.utils.distributed as du 16 | 17 | 18 | def _suppress_print(): 19 | """ 20 | Suppresses printing from the current process. 21 | """ 22 | 23 | def print_pass(*objects, sep=" ", end="\n", file=sys.stdout, flush=False): 24 | pass 25 | 26 | builtins.print = print_pass 27 | 28 | 29 | @functools.lru_cache(maxsize=None) 30 | def _cached_log_stream(filename): 31 | io = PathManager.open(filename, "a", buffering=1024) 32 | atexit.register(io.close) 33 | return io 34 | 35 | 36 | def setup_logging(output_dir=None): 37 | """ 38 | Sets up the logging for multiple processes. Only enable the logging for the 39 | master process, and suppress logging for the non-master processes. 40 | """ 41 | # Set up logging format. 42 | _FORMAT = "[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s" 43 | 44 | if du.is_master_proc(): 45 | # Enable logging for the master process. 46 | logging.root.handlers = [] 47 | else: 48 | # Suppress logging for non-master processes. 49 | _suppress_print() 50 | 51 | logger = logging.getLogger() 52 | logger.setLevel(logging.DEBUG) 53 | logger.propagate = False 54 | plain_formatter = logging.Formatter( 55 | "[%(asctime)s][%(levelname)s] %(filename)s: %(lineno)3d: %(message)s", 56 | datefmt="%m/%d %H:%M:%S", 57 | ) 58 | 59 | if du.is_master_proc(): 60 | ch = logging.StreamHandler(stream=sys.stdout) 61 | ch.setLevel(logging.DEBUG) 62 | ch.setFormatter(plain_formatter) 63 | logger.addHandler(ch) 64 | 65 | if output_dir is not None and du.is_master_proc(du.get_world_size()): 66 | filename = os.path.join(output_dir, "stdout.log") 67 | fh = logging.StreamHandler(_cached_log_stream(filename)) 68 | fh.setLevel(logging.DEBUG) 69 | fh.setFormatter(plain_formatter) 70 | logger.addHandler(fh) 71 | 72 | 73 | def get_logger(name): 74 | """ 75 | Retrieve the logger with the specified name or, if name is None, return a 76 | logger which is the root logger of the hierarchy. 77 | Args: 78 | name (string): name of the logger. 79 | """ 80 | return logging.getLogger(name) 81 | 82 | 83 | def log_json_stats(stats): 84 | """ 85 | Logs json stats. 86 | Args: 87 | stats (dict): a dictionary of statistical information to log. 
88 | """ 89 | stats = { 90 | k: decimal.Decimal("{:.5f}".format(v)) if isinstance(v, float) else v 91 | for k, v in stats.items() 92 | } 93 | json_stats = simplejson.dumps(stats, sort_keys=True, use_decimal=True) 94 | logger = get_logger(__name__) 95 | logger.info("json_stats: {:s}".format(json_stats)) 96 | -------------------------------------------------------------------------------- /timesformer/utils/lr_policy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Learning rate policy.""" 4 | 5 | import math 6 | 7 | 8 | def get_lr_at_epoch(cfg, cur_epoch): 9 | """ 10 | Retrieve the learning rate of the current epoch with the option to perform 11 | warm up in the beginning of the training stage. 12 | Args: 13 | cfg (CfgNode): configs. Details can be found in 14 | slowfast/config/defaults.py 15 | cur_epoch (float): the number of epoch of the current training stage. 16 | """ 17 | lr = get_lr_func(cfg.SOLVER.LR_POLICY)(cfg, cur_epoch) 18 | # Perform warm up. 19 | if cur_epoch < cfg.SOLVER.WARMUP_EPOCHS: 20 | lr_start = cfg.SOLVER.WARMUP_START_LR 21 | lr_end = get_lr_func(cfg.SOLVER.LR_POLICY)( 22 | cfg, cfg.SOLVER.WARMUP_EPOCHS 23 | ) 24 | alpha = (lr_end - lr_start) / cfg.SOLVER.WARMUP_EPOCHS 25 | lr = cur_epoch * alpha + lr_start 26 | return lr 27 | 28 | 29 | def lr_func_cosine(cfg, cur_epoch): 30 | """ 31 | Retrieve the learning rate to specified values at specified epoch with the 32 | cosine learning rate schedule. Details can be found in: 33 | Ilya Loshchilov, and Frank Hutter 34 | SGDR: Stochastic Gradient Descent With Warm Restarts. 35 | Args: 36 | cfg (CfgNode): configs. Details can be found in 37 | slowfast/config/defaults.py 38 | cur_epoch (float): the number of epoch of the current training stage. 39 | """ 40 | assert cfg.SOLVER.COSINE_END_LR < cfg.SOLVER.BASE_LR 41 | return ( 42 | cfg.SOLVER.COSINE_END_LR 43 | + (cfg.SOLVER.BASE_LR - cfg.SOLVER.COSINE_END_LR) 44 | * (math.cos(math.pi * cur_epoch / cfg.SOLVER.MAX_EPOCH) + 1.0) 45 | * 0.5 46 | ) 47 | 48 | 49 | def lr_func_steps_with_relative_lrs(cfg, cur_epoch): 50 | """ 51 | Retrieve the learning rate to specified values at specified epoch with the 52 | steps with relative learning rate schedule. 53 | Args: 54 | cfg (CfgNode): configs. Details can be found in 55 | slowfast/config/defaults.py 56 | cur_epoch (float): the number of epoch of the current training stage. 57 | """ 58 | ind = get_step_index(cfg, cur_epoch) 59 | return cfg.SOLVER.LRS[ind] * cfg.SOLVER.BASE_LR 60 | 61 | 62 | def get_step_index(cfg, cur_epoch): 63 | """ 64 | Retrieves the lr step index for the given epoch. 65 | Args: 66 | cfg (CfgNode): configs. Details can be found in 67 | slowfast/config/defaults.py 68 | cur_epoch (float): the number of epoch of the current training stage. 69 | """ 70 | steps = cfg.SOLVER.STEPS + [cfg.SOLVER.MAX_EPOCH] 71 | for ind, step in enumerate(steps): # NoQA 72 | if cur_epoch < step: 73 | break 74 | return ind - 1 75 | 76 | 77 | def get_lr_func(lr_policy): 78 | """ 79 | Given the configs, retrieve the specified lr policy function. 80 | Args: 81 | lr_policy (string): the learning rate policy to use for the job. 
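
    For example, lr_policy="cosine" resolves to the lr_func_cosine function
    defined above: the lookup below simply prepends "lr_func_" to the policy
    name and searches this module's globals for a matching function.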
82 |     """
83 |     policy = "lr_func_" + lr_policy
84 |     if policy not in globals():
85 |         raise NotImplementedError("Unknown LR policy: {}".format(lr_policy))
86 |     else:
87 |         return globals()[policy]
88 | 
--------------------------------------------------------------------------------
/timesformer/utils/metrics.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 | 
3 | """Functions for computing metrics."""
4 | 
5 | import torch
6 | import numpy as np
7 | 
8 | def topks_correct(preds, labels, ks):
9 |     """
10 |     Given the predictions, labels, and a list of top-k values, compute the
11 |     number of correct predictions for each top-k value.
12 | 
13 |     Args:
14 |         preds (array): array of predictions. Dimension is batchsize
15 |             N x ClassNum.
16 |         labels (array): array of labels. Dimension is batchsize N.
17 |         ks (list): list of top-k values. For example, ks = [1, 5] corresponds
18 |             to top-1 and top-5.
19 | 
20 |     Returns:
21 |         topks_correct (list): list of numbers, where the `i`-th entry
22 |             corresponds to the number of top-`ks[i]` correct predictions.
23 |     """
24 |     assert preds.size(0) == labels.size(
25 |         0
26 |     ), "Batch dim of predictions and labels must match"
27 |     # Find the top max_k predictions for each sample
28 |     _top_max_k_vals, top_max_k_inds = torch.topk(
29 |         preds, max(ks), dim=1, largest=True, sorted=True
30 |     )
31 |     # (batch_size, max_k) -> (max_k, batch_size).
32 |     top_max_k_inds = top_max_k_inds.t()
33 |     # (batch_size, ) -> (max_k, batch_size).
34 |     rep_max_k_labels = labels.view(1, -1).expand_as(top_max_k_inds)
35 |     # (i, j) = 1 if top i-th prediction for the j-th sample is correct.
36 |     top_max_k_correct = top_max_k_inds.eq(rep_max_k_labels)
37 |     # Compute the number of topk correct predictions for each k.
38 |     topks_correct = [top_max_k_correct[:k, :].float().sum() for k in ks]
39 |     return topks_correct
40 | 
41 | 
42 | def topk_errors(preds, labels, ks):
43 |     """
44 |     Computes the top-k error for each k.
45 |     Args:
46 |         preds (array): array of predictions. Dimension is N.
47 |         labels (array): array of labels. Dimension is N.
48 |         ks (list): list of ks to calculate the top accuracies.
49 |     """
50 |     num_topks_correct = topks_correct(preds, labels, ks)
51 |     return [(1.0 - x / preds.size(0)) * 100.0 for x in num_topks_correct]
52 | 
53 | 
54 | def topk_accuracies(preds, labels, ks):
55 |     """
56 |     Computes the top-k accuracy for each k.
57 |     Args:
58 |         preds (array): array of predictions. Dimension is N.
59 |         labels (array): array of labels. Dimension is N.
60 |         ks (list): list of ks to calculate the top accuracies.
61 |     """
62 |     num_topks_correct = topks_correct(preds, labels, ks)
63 |     return [(x / preds.size(0)) * 100.0 for x in num_topks_correct]
64 | 
65 | def multitask_topks_correct(preds, labels, ks=(1,)):
66 |     """
67 |     Args:
68 |         preds: tuple(torch.FloatTensor), each tensor should be of shape
69 |             [batch_size, class_count], class_count can vary on a per task basis, i.e.
70 |             outputs[i].shape[1] can be different from outputs[j].shape[1].
71 |         labels: tuple(torch.LongTensor), each tensor should be of shape [batch_size]
72 |         ks: tuple(int), compute accuracy at top-k for the values of k specified
73 |             in this parameter.
74 |     Returns:
75 |         list(float), same length as ks, where the i-th entry is the number of
            samples whose label is within the top-ks[i] predictions for every task.
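
    Illustrative example (hypothetical values, not from the repository): with
    two tasks and ks=(1,), a sample counts toward the top-1 total only if the
    label of every task is that task's highest-scoring prediction; if task 0
    is correct but task 1's label is only ranked second, the sample contributes
    nothing.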
76 | """ 77 | max_k = int(np.max(ks)) 78 | task_count = len(preds) 79 | batch_size = labels[0].size(0) 80 | all_correct = torch.zeros(max_k, batch_size).type(torch.ByteTensor) 81 | if torch.cuda.is_available(): 82 | all_correct = all_correct.cuda() 83 | for output, label in zip(preds, labels): 84 | _, max_k_idx = output.topk(max_k, dim=1, largest=True, sorted=True) 85 | # Flip batch_size, class_count as .view doesn't work on non-contiguous 86 | max_k_idx = max_k_idx.t() 87 | correct_for_task = max_k_idx.eq(label.view(1, -1).expand_as(max_k_idx)) 88 | all_correct.add_(correct_for_task) 89 | 90 | multitask_topks_correct = [ 91 | torch.ge(all_correct[:k].float().sum(0), task_count).float().sum(0) for k in ks 92 | ] 93 | 94 | return multitask_topks_correct 95 | -------------------------------------------------------------------------------- /timesformer/utils/multigrid.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Helper functions for multigrid training.""" 4 | 5 | import numpy as np 6 | 7 | import timesformer.utils.logging as logging 8 | 9 | logger = logging.get_logger(__name__) 10 | 11 | 12 | class MultigridSchedule(object): 13 | """ 14 | This class defines multigrid training schedule and update cfg accordingly. 15 | """ 16 | 17 | def init_multigrid(self, cfg): 18 | """ 19 | Update cfg based on multigrid settings. 20 | Args: 21 | cfg (configs): configs that contains training and multigrid specific 22 | hyperparameters. Details can be seen in 23 | slowfast/config/defaults.py. 24 | Returns: 25 | cfg (configs): the updated cfg. 26 | """ 27 | self.schedule = None 28 | # We may modify cfg.TRAIN.BATCH_SIZE, cfg.DATA.NUM_FRAMES, and 29 | # cfg.DATA.TRAIN_CROP_SIZE during training, so we store their original 30 | # value in cfg and use them as global variables. 31 | cfg.MULTIGRID.DEFAULT_B = cfg.TRAIN.BATCH_SIZE 32 | cfg.MULTIGRID.DEFAULT_T = cfg.DATA.NUM_FRAMES 33 | cfg.MULTIGRID.DEFAULT_S = cfg.DATA.TRAIN_CROP_SIZE 34 | 35 | if cfg.MULTIGRID.LONG_CYCLE: 36 | self.schedule = self.get_long_cycle_schedule(cfg) 37 | cfg.SOLVER.STEPS = [0] + [s[-1] for s in self.schedule] 38 | # Fine-tuning phase. 39 | cfg.SOLVER.STEPS[-1] = ( 40 | cfg.SOLVER.STEPS[-2] + cfg.SOLVER.STEPS[-1] 41 | ) // 2 42 | cfg.SOLVER.LRS = [ 43 | cfg.SOLVER.GAMMA ** s[0] * s[1][0] for s in self.schedule 44 | ] 45 | # Fine-tuning phase. 46 | cfg.SOLVER.LRS = cfg.SOLVER.LRS[:-1] + [ 47 | cfg.SOLVER.LRS[-2], 48 | cfg.SOLVER.LRS[-1], 49 | ] 50 | 51 | cfg.SOLVER.MAX_EPOCH = self.schedule[-1][-1] 52 | 53 | elif cfg.MULTIGRID.SHORT_CYCLE: 54 | cfg.SOLVER.STEPS = [ 55 | int(s * cfg.MULTIGRID.EPOCH_FACTOR) for s in cfg.SOLVER.STEPS 56 | ] 57 | cfg.SOLVER.MAX_EPOCH = int( 58 | cfg.SOLVER.MAX_EPOCH * cfg.MULTIGRID.EPOCH_FACTOR 59 | ) 60 | return cfg 61 | 62 | def update_long_cycle(self, cfg, cur_epoch): 63 | """ 64 | Before every epoch, check if long cycle shape should change. If it 65 | should, update cfg accordingly. 66 | Args: 67 | cfg (configs): configs that contains training and multigrid specific 68 | hyperparameters. Details can be seen in 69 | slowfast/config/defaults.py. 70 | cur_epoch (int): current epoch index. 71 | Returns: 72 | cfg (configs): the updated cfg. 73 | changed (bool): do we change long cycle shape at this epoch? 
74 | """ 75 | base_b, base_t, base_s = get_current_long_cycle_shape( 76 | self.schedule, cur_epoch 77 | ) 78 | if base_s != cfg.DATA.TRAIN_CROP_SIZE or base_t != cfg.DATA.NUM_FRAMES: 79 | 80 | cfg.DATA.NUM_FRAMES = base_t 81 | cfg.DATA.TRAIN_CROP_SIZE = base_s 82 | cfg.TRAIN.BATCH_SIZE = base_b * cfg.MULTIGRID.DEFAULT_B 83 | 84 | bs_factor = ( 85 | float(cfg.TRAIN.BATCH_SIZE / cfg.NUM_GPUS) 86 | / cfg.MULTIGRID.BN_BASE_SIZE 87 | ) 88 | 89 | if bs_factor < 1: 90 | cfg.BN.NORM_TYPE = "sync_batchnorm" 91 | cfg.BN.NUM_SYNC_DEVICES = int(1.0 / bs_factor) 92 | elif bs_factor > 1: 93 | cfg.BN.NORM_TYPE = "sub_batchnorm" 94 | cfg.BN.NUM_SPLITS = int(bs_factor) 95 | else: 96 | cfg.BN.NORM_TYPE = "batchnorm" 97 | 98 | cfg.MULTIGRID.LONG_CYCLE_SAMPLING_RATE = cfg.DATA.SAMPLING_RATE * ( 99 | cfg.MULTIGRID.DEFAULT_T // cfg.DATA.NUM_FRAMES 100 | ) 101 | logger.info("Long cycle updates:") 102 | logger.info("\tBN.NORM_TYPE: {}".format(cfg.BN.NORM_TYPE)) 103 | if cfg.BN.NORM_TYPE == "sync_batchnorm": 104 | logger.info( 105 | "\tBN.NUM_SYNC_DEVICES: {}".format(cfg.BN.NUM_SYNC_DEVICES) 106 | ) 107 | elif cfg.BN.NORM_TYPE == "sub_batchnorm": 108 | logger.info("\tBN.NUM_SPLITS: {}".format(cfg.BN.NUM_SPLITS)) 109 | logger.info("\tTRAIN.BATCH_SIZE: {}".format(cfg.TRAIN.BATCH_SIZE)) 110 | logger.info( 111 | "\tDATA.NUM_FRAMES x LONG_CYCLE_SAMPLING_RATE: {}x{}".format( 112 | cfg.DATA.NUM_FRAMES, cfg.MULTIGRID.LONG_CYCLE_SAMPLING_RATE 113 | ) 114 | ) 115 | logger.info( 116 | "\tDATA.TRAIN_CROP_SIZE: {}".format(cfg.DATA.TRAIN_CROP_SIZE) 117 | ) 118 | return cfg, True 119 | else: 120 | return cfg, False 121 | 122 | def get_long_cycle_schedule(self, cfg): 123 | """ 124 | Based on multigrid hyperparameters, define the schedule of a long cycle. 125 | Args: 126 | cfg (configs): configs that contains training and multigrid specific 127 | hyperparameters. Details can be seen in 128 | slowfast/config/defaults.py. 129 | Returns: 130 | schedule (list): Specifies a list long cycle base shapes and their 131 | corresponding training epochs. 132 | """ 133 | 134 | steps = cfg.SOLVER.STEPS 135 | 136 | default_size = float( 137 | cfg.DATA.NUM_FRAMES * cfg.DATA.TRAIN_CROP_SIZE ** 2 138 | ) 139 | default_iters = steps[-1] 140 | 141 | # Get shapes and average batch size for each long cycle shape. 142 | avg_bs = [] 143 | all_shapes = [] 144 | for t_factor, s_factor in cfg.MULTIGRID.LONG_CYCLE_FACTORS: 145 | base_t = int(round(cfg.DATA.NUM_FRAMES * t_factor)) 146 | base_s = int(round(cfg.DATA.TRAIN_CROP_SIZE * s_factor)) 147 | if cfg.MULTIGRID.SHORT_CYCLE: 148 | shapes = [ 149 | [ 150 | base_t, 151 | cfg.MULTIGRID.DEFAULT_S 152 | * cfg.MULTIGRID.SHORT_CYCLE_FACTORS[0], 153 | ], 154 | [ 155 | base_t, 156 | cfg.MULTIGRID.DEFAULT_S 157 | * cfg.MULTIGRID.SHORT_CYCLE_FACTORS[1], 158 | ], 159 | [base_t, base_s], 160 | ] 161 | else: 162 | shapes = [[base_t, base_s]] 163 | 164 | # (T, S) -> (B, T, S) 165 | shapes = [ 166 | [int(round(default_size / (s[0] * s[1] * s[1]))), s[0], s[1]] 167 | for s in shapes 168 | ] 169 | avg_bs.append(np.mean([s[0] for s in shapes])) 170 | all_shapes.append(shapes) 171 | 172 | # Get schedule regardless of cfg.MULTIGRID.EPOCH_FACTOR. 
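        # Within each LR step, every long-cycle shape is allotted epochs in
        # proportion to its average batch size, so cur_epochs / avg_bs (the
        # iteration count) comes out the same for every shape: the shapes split
        # each step's iterations roughly evenly.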
173 | total_iters = 0 174 | schedule = [] 175 | for step_index in range(len(steps) - 1): 176 | step_epochs = steps[step_index + 1] - steps[step_index] 177 | 178 | for long_cycle_index, shapes in enumerate(all_shapes): 179 | cur_epochs = ( 180 | step_epochs * avg_bs[long_cycle_index] / sum(avg_bs) 181 | ) 182 | 183 | cur_iters = cur_epochs / avg_bs[long_cycle_index] 184 | total_iters += cur_iters 185 | schedule.append((step_index, shapes[-1], cur_epochs)) 186 | 187 | iter_saving = default_iters / total_iters 188 | 189 | final_step_epochs = cfg.SOLVER.MAX_EPOCH - steps[-1] 190 | 191 | # We define the fine-tuning phase to have the same amount of iteration 192 | # saving as the rest of the training. 193 | ft_epochs = final_step_epochs / iter_saving * avg_bs[-1] 194 | 195 | schedule.append((step_index + 1, all_shapes[-1][2], ft_epochs)) 196 | 197 | # Obtrain final schedule given desired cfg.MULTIGRID.EPOCH_FACTOR. 198 | x = ( 199 | cfg.SOLVER.MAX_EPOCH 200 | * cfg.MULTIGRID.EPOCH_FACTOR 201 | / sum(s[-1] for s in schedule) 202 | ) 203 | 204 | final_schedule = [] 205 | total_epochs = 0 206 | for s in schedule: 207 | epochs = s[2] * x 208 | total_epochs += epochs 209 | final_schedule.append((s[0], s[1], int(round(total_epochs)))) 210 | print_schedule(final_schedule) 211 | return final_schedule 212 | 213 | 214 | def print_schedule(schedule): 215 | """ 216 | Log schedule. 217 | """ 218 | logger.info("Long cycle index\tBase shape\tEpochs") 219 | for s in schedule: 220 | logger.info("{}\t{}\t{}".format(s[0], s[1], s[2])) 221 | 222 | 223 | def get_current_long_cycle_shape(schedule, epoch): 224 | """ 225 | Given a schedule and epoch index, return the long cycle base shape. 226 | Args: 227 | schedule (configs): configs that contains training and multigrid specific 228 | hyperparameters. Details can be seen in 229 | slowfast/config/defaults.py. 230 | cur_epoch (int): current epoch index. 231 | Returns: 232 | shapes (list): A list describing the base shape in a long cycle: 233 | [batch size relative to default, 234 | number of frames, spatial dimension]. 235 | """ 236 | for s in schedule: 237 | if epoch < s[-1]: 238 | return s[1] 239 | return schedule[-1][1] 240 | -------------------------------------------------------------------------------- /timesformer/utils/multiprocessing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Multiprocessing helpers.""" 4 | 5 | import torch 6 | 7 | 8 | def run( 9 | local_rank, 10 | num_proc, 11 | func, 12 | init_method, 13 | shard_id, 14 | num_shards, 15 | backend, 16 | cfg, 17 | output_queue=None, 18 | ): 19 | """ 20 | Runs a function from a child process. 21 | Args: 22 | local_rank (int): rank of the current process on the current machine. 23 | num_proc (int): number of processes per machine. 24 | func (function): function to execute on each of the process. 25 | init_method (string): method to initialize the distributed training. 26 | TCP initialization: equiring a network address reachable from all 27 | processes followed by the port. 28 | Shared file-system initialization: makes use of a file system that 29 | is shared and visible from all machines. The URL should start with 30 | file:// and contain a path to a non-existent file on a shared file 31 | system. 32 | shard_id (int): the rank of the current machine. 33 | num_shards (int): number of overall machines for the distributed 34 | training job. 
35 | backend (string): three distributed backends ('nccl', 'gloo', 'mpi') are 36 | supports, each with different capabilities. Details can be found 37 | here: 38 | https://pytorch.org/docs/stable/distributed.html 39 | cfg (CfgNode): configs. Details can be found in 40 | slowfast/config/defaults.py 41 | output_queue (queue): can optionally be used to return values from the 42 | master process. 43 | """ 44 | # Initialize the process group. 45 | world_size = num_proc * num_shards 46 | rank = shard_id * num_proc + local_rank 47 | 48 | try: 49 | torch.distributed.init_process_group( 50 | backend=backend, 51 | init_method=init_method, 52 | world_size=world_size, 53 | rank=rank, 54 | ) 55 | except Exception as e: 56 | raise e 57 | 58 | torch.cuda.set_device(local_rank) 59 | ret = func(cfg) 60 | if output_queue is not None and local_rank == 0: 61 | output_queue.put(ret) 62 | -------------------------------------------------------------------------------- /timesformer/utils/parser.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Argument parser functions.""" 4 | 5 | import argparse 6 | import sys 7 | 8 | import timesformer.utils.checkpoint as cu 9 | from timesformer.config.defaults import get_cfg 10 | 11 | 12 | def parse_args(): 13 | """ 14 | Parse the following arguments for a default parser for PySlowFast users. 15 | Args: 16 | shard_id (int): shard id for the current machine. Starts from 0 to 17 | num_shards - 1. If single machine is used, then set shard id to 0. 18 | num_shards (int): number of shards using by the job. 19 | init_method (str): initialization method to launch the job with multiple 20 | devices. Options includes TCP or shared file-system for 21 | initialization. details can be find in 22 | https://pytorch.org/docs/stable/distributed.html#tcp-initialization 23 | cfg (str): path to the config file. 24 | opts (argument): provide addtional options from the command line, it 25 | overwrites the config loaded from file. 26 | """ 27 | parser = argparse.ArgumentParser( 28 | description="Provide SlowFast video training and testing pipeline." 29 | ) 30 | parser.add_argument( 31 | "--shard_id", 32 | help="The shard id of current node, Starts from 0 to num_shards - 1", 33 | default=0, 34 | type=int, 35 | ) 36 | parser.add_argument( 37 | "--num_shards", 38 | help="Number of shards using by the job", 39 | default=1, 40 | type=int, 41 | ) 42 | parser.add_argument( 43 | "--init_method", 44 | help="Initialization method, includes TCP or shared file-system", 45 | default="tcp://localhost:9999", 46 | type=str, 47 | ) 48 | parser.add_argument( 49 | "--cfg", 50 | dest="cfg_file", 51 | help="Path to the config file", 52 | default="configs/Kinetics/SLOWFAST_4x16_R50.yaml", 53 | type=str, 54 | ) 55 | parser.add_argument( 56 | "opts", 57 | help="See slowfast/config/defaults.py for all options", 58 | default=None, 59 | nargs=argparse.REMAINDER, 60 | ) 61 | if len(sys.argv) == 1: 62 | parser.print_help() 63 | return parser.parse_args() 64 | 65 | 66 | def load_config(args): 67 | """ 68 | Given the arguemnts, load and initialize the configs. 69 | Args: 70 | args (argument): arguments includes `shard_id`, `num_shards`, 71 | `init_method`, `cfg_file`, and `opts`. 72 | """ 73 | # Setup cfg. 74 | cfg = get_cfg() 75 | # Load config from cfg. 76 | if args.cfg_file is not None: 77 | cfg.merge_from_file(args.cfg_file) 78 | # Load config from command line, overwrite config from opts. 
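    # `opts` is a flat list of KEY VALUE pairs (yacs `merge_from_list` format);
    # e.g. a command line ending in `TRAIN.ENABLE False TEST.ENABLE True`
    # flips those two settings without editing the YAML config file.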
79 | if args.opts is not None: 80 | cfg.merge_from_list(args.opts) 81 | 82 | # Inherit parameters from args. 83 | if hasattr(args, "num_shards") and hasattr(args, "shard_id"): 84 | cfg.NUM_SHARDS = args.num_shards 85 | cfg.SHARD_ID = args.shard_id 86 | if hasattr(args, "rng_seed"): 87 | cfg.RNG_SEED = args.rng_seed 88 | if hasattr(args, "output_dir"): 89 | cfg.OUTPUT_DIR = args.output_dir 90 | 91 | # Create the checkpoint dir. 92 | cu.make_checkpoint_dir(cfg.OUTPUT_DIR) 93 | return cfg 94 | -------------------------------------------------------------------------------- /timesformer/utils/weight_init_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Utility function for weight initialization""" 4 | 5 | import torch.nn as nn 6 | from fvcore.nn.weight_init import c2_msra_fill 7 | 8 | 9 | def init_weights(model, fc_init_std=0.01, zero_init_final_bn=True): 10 | """ 11 | Performs ResNet style weight initialization. 12 | Args: 13 | fc_init_std (float): the expected standard deviation for fc layer. 14 | zero_init_final_bn (bool): if True, zero initialize the final bn for 15 | every bottleneck. 16 | """ 17 | for m in model.modules(): 18 | if isinstance(m, nn.Conv3d): 19 | """ 20 | Follow the initialization method proposed in: 21 | {He, Kaiming, et al. 22 | "Delving deep into rectifiers: Surpassing human-level 23 | performance on imagenet classification." 24 | arXiv preprint arXiv:1502.01852 (2015)} 25 | """ 26 | c2_msra_fill(m) 27 | elif isinstance(m, nn.BatchNorm3d): 28 | if ( 29 | hasattr(m, "transform_final_bn") 30 | and m.transform_final_bn 31 | and zero_init_final_bn 32 | ): 33 | batchnorm_weight = 0.0 34 | else: 35 | batchnorm_weight = 1.0 36 | if m.weight is not None: 37 | m.weight.data.fill_(batchnorm_weight) 38 | if m.bias is not None: 39 | m.bias.data.zero_() 40 | if isinstance(m, nn.Linear): 41 | m.weight.data.normal_(mean=0.0, std=fc_init_std) 42 | if m.bias is not None: 43 | m.bias.data.zero_() 44 | -------------------------------------------------------------------------------- /timesformer/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /tools/benchmark.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | A script to benchmark data loading. 4 | """ 5 | 6 | import timesformer.utils.logging as logging 7 | from timesformer.utils.benchmark import benchmark_data_loading 8 | from timesformer.utils.misc import launch_job 9 | from timesformer.utils.parser import load_config, parse_args 10 | 11 | logger = logging.get_logger(__name__) 12 | 13 | 14 | def main(): 15 | args = parse_args() 16 | cfg = load_config(args) 17 | 18 | launch_job( 19 | cfg=cfg, init_method=args.init_method, func=benchmark_data_loading 20 | ) 21 | 22 | 23 | if __name__ == "__main__": 24 | main() 25 | -------------------------------------------------------------------------------- /tools/run_net.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
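# Example invocation (illustrative; substitute any config from configs/):
#   python tools/run_net.py \
#     --cfg configs/Kinetics/TimeSformer_divST_8x32_224.yaml \
#     NUM_GPUS 8 TRAIN.ENABLE True TEST.ENABLE False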
2 | 3 | """Wrapper to train and test a video classification model.""" 4 | from timesformer.utils.misc import launch_job 5 | from timesformer.utils.parser import load_config, parse_args 6 | 7 | from tools.test_net import test 8 | from tools.train_net import train 9 | 10 | 11 | def get_func(cfg): 12 | train_func = train 13 | test_func = test 14 | return train_func, test_func 15 | 16 | def main(): 17 | """ 18 | Main function to spawn the train and test process. 19 | """ 20 | args = parse_args() 21 | if args.num_shards > 1: 22 | args.output_dir = str(args.job_dir) 23 | cfg = load_config(args) 24 | 25 | train, test = get_func(cfg) 26 | 27 | # Perform training. 28 | if cfg.TRAIN.ENABLE: 29 | launch_job(cfg=cfg, init_method=args.init_method, func=train) 30 | 31 | # Perform multi-clip testing. 32 | if cfg.TEST.ENABLE: 33 | launch_job(cfg=cfg, init_method=args.init_method, func=test) 34 | 35 | # Perform model visualization. 36 | if cfg.TENSORBOARD.ENABLE and ( 37 | cfg.TENSORBOARD.MODEL_VIS.ENABLE 38 | or cfg.TENSORBOARD.WRONG_PRED_VIS.ENABLE 39 | ): 40 | launch_job(cfg=cfg, init_method=args.init_method, func=visualize) 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /tools/submit.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from pathlib import Path 4 | import shutil 5 | import submitit 6 | import multiprocessing 7 | import sys 8 | 9 | import torch 10 | import timesformer.utils.checkpoint as cu 11 | import timesformer.utils.multiprocessing as mpu 12 | from timesformer.utils.misc import launch_job 13 | from timesformer.utils.parser import load_config 14 | 15 | from tools.run_net import get_func 16 | 17 | def parse_args(): 18 | parser = argparse.ArgumentParser( 19 | "Submitit for onestage training", add_help=False 20 | ) 21 | parser.add_argument( 22 | "--num_gpus", 23 | help="Number of GPUs", 24 | default=8, 25 | type=int, 26 | ) 27 | parser.add_argument( 28 | "--num_shards", 29 | help="Number of Nodes", 30 | default=1, 31 | type=int, 32 | ) 33 | parser.add_argument( 34 | "--partition", default="learnfair", type=str, help="Partition where to submit" 35 | ) 36 | parser.add_argument("--timeout", default=60 * 72, type=int, help="Duration of the job") 37 | parser.add_argument("--cfg", dest="cfg_file", help="Path to the config file", 38 | default="configs/test_R50_8GPU.yaml", type=str) 39 | parser.add_argument( 40 | "--job_dir", default="", type=str, help="Job dir. Leave empty for automatic." 41 | ) 42 | parser.add_argument( 43 | "--name", default="", type=str, help="Job dir. Leave empty for automatic." 44 | ) 45 | parser.add_argument( 46 | "--resume-from", 47 | default="", 48 | type=str, 49 | help=( 50 | "Weights to resume from (.*pth file) or a file (last_checkpoint) that contains " 51 | + "weight file name from the same directory" 52 | ), 53 | ) 54 | parser.add_argument("--resume-job", default="", type=str, help="resume training from the job") 55 | parser.add_argument("--use_volta32", action='store_true', help="Big models? Use this") 56 | parser.add_argument("--postfix", default="experiment", type=str, help="Postfix of the jobs") 57 | parser.add_argument("--mail", default="", type=str, 58 | help="Email this user when the job finishes if specified") 59 | parser.add_argument('--comment', default="", type=str, 60 | help='Comment to pass to scheduler, e.g. 
priority message') 61 | parser.add_argument( 62 | "opts", 63 | help="See lib/config/defaults.py for all options", 64 | default=None, 65 | nargs=argparse.REMAINDER, 66 | ) 67 | return parser.parse_args() 68 | 69 | 70 | def get_shared_folder() -> Path: 71 | user = os.getenv("USER") 72 | if Path("/checkpoint/").is_dir(): 73 | p = Path(f"/checkpoint/{user}/experiments") 74 | p.mkdir(exist_ok=True) 75 | return p 76 | raise RuntimeError("No shared folder available") 77 | 78 | 79 | def launch(shard_id, num_shards, cfg, init_method): 80 | os.environ["NCCL_MIN_NRINGS"] = "8" 81 | 82 | print ("Pytorch version: ", torch.__version__) 83 | 84 | cfg.SHARD_ID = shard_id 85 | cfg.NUM_SHARDS = num_shards 86 | 87 | print([ 88 | shard_id, num_shards, cfg 89 | ]) 90 | 91 | train, test = get_func(cfg) 92 | # Launch job. 93 | if cfg.TRAIN.ENABLE: 94 | launch_job(cfg=cfg, init_method=init_method, func=train) 95 | 96 | if cfg.TEST.ENABLE: 97 | launch_job(cfg=cfg, init_method=init_method, func=test) 98 | 99 | 100 | class Trainer(object): 101 | def __init__(self, args): 102 | self.args = args 103 | 104 | def __call__(self): 105 | 106 | socket_name = os.popen("ip r | grep default | awk '{print $5}'").read().strip('\n') 107 | print("Setting GLOO and NCCL sockets IFNAME to: {}".format(socket_name)) 108 | os.environ["GLOO_SOCKET_IFNAME"] = socket_name 109 | # not sure if the next line is really affect anything 110 | os.environ["NCCL_SOCKET_IFNAME"] = socket_name 111 | 112 | 113 | hostname_first_node = os.popen( 114 | "scontrol show hostnames $SLURM_JOB_NODELIST" 115 | ).read().split("\n")[0] 116 | dist_url = "tcp://{}:12399".format(hostname_first_node) 117 | print("We will use the following dist url: {}".format(dist_url)) 118 | 119 | self._setup_gpu_args() 120 | results = launch( 121 | shard_id=self.args.machine_rank, 122 | num_shards=self.args.num_shards, 123 | cfg=load_config(self.args), 124 | init_method=dist_url, 125 | ) 126 | return results 127 | 128 | def checkpoint(self): 129 | import submitit 130 | 131 | job_env = submitit.JobEnvironment() 132 | slurm_job_id = job_env.job_id 133 | if self.args.resume_job == "": 134 | self.args.resume_job = slurm_job_id 135 | print("Requeuing ", self.args) 136 | empty_trainer = type(self)(self.args) 137 | return submitit.helpers.DelayedSubmission(empty_trainer) 138 | 139 | def _setup_gpu_args(self): 140 | import submitit 141 | 142 | job_env = submitit.JobEnvironment() 143 | print(self.args) 144 | 145 | self.args.machine_rank = job_env.global_rank 146 | print(f"Process rank: {job_env.global_rank}") 147 | 148 | 149 | def main(): 150 | args = parse_args() 151 | 152 | if args.name == "": 153 | cfg_name = os.path.splitext(os.path.basename(args.cfg_file))[0] 154 | args.name = '_'.join([cfg_name, args.postfix]) 155 | 156 | assert args.job_dir != "" 157 | 158 | args.output_dir = str(args.job_dir) 159 | args.job_dir = Path(args.job_dir) / "%j" 160 | 161 | # Note that the folder will depend on the job_id, to easily track experiments 162 | #executor = submitit.AutoExecutor(folder=Path(args.job_dir) / "%j", slurm_max_num_timeout=30) 163 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) 164 | 165 | # cluster setup is defined by environment variables 166 | num_gpus_per_node = args.num_gpus 167 | nodes = args.num_shards 168 | partition = args.partition 169 | timeout_min = args.timeout 170 | kwargs = {} 171 | if args.use_volta32: 172 | kwargs['slurm_constraint'] = 'volta32gb,ib4' 173 | if args.comment: 174 | kwargs['slurm_comment'] = args.comment 175 | 176 | 
executor.update_parameters( 177 | mem_gb=60 * num_gpus_per_node, 178 | gpus_per_node=num_gpus_per_node, 179 | tasks_per_node=1, 180 | cpus_per_task=10 * num_gpus_per_node, 181 | nodes=nodes, 182 | timeout_min=timeout_min, # max is 60 * 72 183 | slurm_partition=partition, 184 | slurm_signal_delay_s=120, 185 | **kwargs 186 | ) 187 | 188 | 189 | print(args.name) 190 | executor.update_parameters(name=args.name) 191 | 192 | trainer = Trainer(args) 193 | job = executor.submit(trainer) 194 | 195 | print("Submitted job_id:", job.job_id) 196 | 197 | 198 | if __name__ == "__main__": 199 | main() 200 | -------------------------------------------------------------------------------- /tools/test_net.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | """Multi-view test a video classification model.""" 4 | 5 | import numpy as np 6 | import os 7 | import pickle 8 | import torch 9 | from fvcore.common.file_io import PathManager 10 | import cv2 11 | from einops import rearrange, reduce, repeat 12 | import scipy.io 13 | 14 | import timesformer.utils.checkpoint as cu 15 | import timesformer.utils.distributed as du 16 | import timesformer.utils.logging as logging 17 | import timesformer.utils.misc as misc 18 | import timesformer.visualization.tensorboard_vis as tb 19 | from timesformer.datasets import loader 20 | from timesformer.models import build_model 21 | from timesformer.utils.meters import TestMeter 22 | 23 | logger = logging.get_logger(__name__) 24 | 25 | 26 | @torch.no_grad() 27 | def perform_test(test_loader, model, test_meter, cfg, writer=None): 28 | """ 29 | For classification: 30 | Perform mutli-view testing that uniformly samples N clips from a video along 31 | its temporal axis. For each clip, it takes 3 crops to cover the spatial 32 | dimension, followed by averaging the softmax scores across all Nx3 views to 33 | form a video-level prediction. All video predictions are compared to 34 | ground-truth labels and the final testing performance is logged. 35 | For detection: 36 | Perform fully-convolutional testing on the full frames without crop. 37 | Args: 38 | test_loader (loader): video testing loader. 39 | model (model): the pretrained video model to test. 40 | test_meter (TestMeter): testing meters to log and ensemble the testing 41 | results. 42 | cfg (CfgNode): configs. Details can be found in 43 | slowfast/config/defaults.py 44 | writer (TensorboardWriter object, optional): TensorboardWriter object 45 | to writer Tensorboard log. 46 | """ 47 | # Enable eval mode. 48 | model.eval() 49 | test_meter.iter_tic() 50 | 51 | for cur_iter, (inputs, labels, video_idx, meta) in enumerate(test_loader): 52 | if cfg.NUM_GPUS: 53 | # Transfer the data to the current GPU device. 54 | if isinstance(inputs, (list,)): 55 | for i in range(len(inputs)): 56 | inputs[i] = inputs[i].cuda(non_blocking=True) 57 | else: 58 | inputs = inputs.cuda(non_blocking=True) 59 | 60 | # Transfer the data to the current GPU device. 61 | labels = labels.cuda() 62 | video_idx = video_idx.cuda() 63 | for key, val in meta.items(): 64 | if isinstance(val, (list,)): 65 | for i in range(len(val)): 66 | val[i] = val[i].cuda(non_blocking=True) 67 | else: 68 | meta[key] = val.cuda(non_blocking=True) 69 | test_meter.data_toc() 70 | 71 | if cfg.DETECTION.ENABLE: 72 | # Compute the predictions. 
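            # In detection mode the model also consumes the RoI proposal boxes
            # from the loader; predictions are made per box, and the unaligned
            # gather below is used because the number of boxes can differ
            # across GPUs.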
73 |             preds = model(inputs, meta["boxes"])
74 |             ori_boxes = meta["ori_boxes"]
75 |             metadata = meta["metadata"]
76 | 
77 |             preds = preds.detach().cpu() if cfg.NUM_GPUS else preds.detach()
78 |             ori_boxes = (
79 |                 ori_boxes.detach().cpu() if cfg.NUM_GPUS else ori_boxes.detach()
80 |             )
81 |             metadata = (
82 |                 metadata.detach().cpu() if cfg.NUM_GPUS else metadata.detach()
83 |             )
84 | 
85 |             if cfg.NUM_GPUS > 1:
86 |                 preds = torch.cat(du.all_gather_unaligned(preds), dim=0)
87 |                 ori_boxes = torch.cat(du.all_gather_unaligned(ori_boxes), dim=0)
88 |                 metadata = torch.cat(du.all_gather_unaligned(metadata), dim=0)
89 | 
90 |             test_meter.iter_toc()
91 |             # Update and log stats.
92 |             test_meter.update_stats(preds, ori_boxes, metadata)
93 |             test_meter.log_iter_stats(None, cur_iter)
94 |         else:
95 |             # Perform the forward pass.
96 |             preds = model(inputs)
97 | 
98 |             # Gather all the predictions across all the devices to perform ensemble.
99 |             if cfg.NUM_GPUS > 1:
100 |                 preds, labels, video_idx = du.all_gather(
101 |                     [preds, labels, video_idx]
102 |                 )
103 |             if cfg.NUM_GPUS:
104 |                 preds = preds.cpu()
105 |                 labels = labels.cpu()
106 |                 video_idx = video_idx.cpu()
107 | 
108 |             test_meter.iter_toc()
109 |             # Update and log stats.
110 |             test_meter.update_stats(
111 |                 preds.detach(), labels.detach(), video_idx.detach()
112 |             )
113 |             test_meter.log_iter_stats(cur_iter)
114 | 
115 |         test_meter.iter_tic()
116 | 
117 |     # Log epoch stats and print the final testing results.
118 |     if not cfg.DETECTION.ENABLE:
119 |         all_preds = test_meter.video_preds.clone().detach()
120 |         all_labels = test_meter.video_labels
121 |         if cfg.NUM_GPUS:
122 |             all_preds = all_preds.cpu()
123 |             all_labels = all_labels.cpu()
124 |         if writer is not None:
125 |             writer.plot_eval(preds=all_preds, labels=all_labels)
126 | 
127 |         if cfg.TEST.SAVE_RESULTS_PATH != "":
128 |             save_path = os.path.join(cfg.OUTPUT_DIR, cfg.TEST.SAVE_RESULTS_PATH)
129 | 
130 |             with PathManager.open(save_path, "wb") as f:
131 |                 pickle.dump([all_preds, all_labels], f)
132 | 
133 |             logger.info(
134 |                 "Successfully saved prediction results to {}".format(save_path)
135 |             )
136 | 
137 |     test_meter.finalize_metrics()
138 |     return test_meter
139 | 
140 | 
141 | def test(cfg):
142 |     """
143 |     Perform multi-view testing on the pretrained video model.
144 |     Args:
145 |         cfg (CfgNode): configs. Details can be found in
146 |             slowfast/config/defaults.py
147 |     """
148 |     # Set up environment.
149 |     du.init_distributed_training(cfg)
150 |     # Set random seed from configs.
151 |     np.random.seed(cfg.RNG_SEED)
152 |     torch.manual_seed(cfg.RNG_SEED)
153 | 
154 |     # Setup logging format.
155 |     logging.setup_logging(cfg.OUTPUT_DIR)
156 | 
157 |     # Print config.
158 |     logger.info("Test with config:")
159 |     logger.info(cfg)
160 | 
161 |     # Build the video model and print model statistics.
162 |     model = build_model(cfg)
163 |     if du.is_master_proc() and cfg.LOG_MODEL_INFO:
164 |         misc.log_model_info(model, cfg, use_train_input=False)
165 | 
166 |     cu.load_test_checkpoint(cfg, model)
167 | 
168 |     # Create video testing loaders.
169 |     test_loader = loader.construct_loader(cfg, "test")
170 |     logger.info("Testing model for {} iterations".format(len(test_loader)))
171 | 
172 |     assert (
173 |         len(test_loader.dataset)
174 |         % (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS)
175 |         == 0
176 |     )
177 |     # Create meters for multi-view testing.
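    # Each video contributes NUM_ENSEMBLE_VIEWS x NUM_SPATIAL_CROPS clips
    # (e.g. 10 x 3 = 30 with the common defaults), so the meter tracks
    # len(test_loader.dataset) // (views * crops) distinct videos and
    # ensembles that many clip predictions per video.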
178 |     test_meter = TestMeter(
179 |         len(test_loader.dataset)
180 |         // (cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS),
181 |         cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS,
182 |         cfg.MODEL.NUM_CLASSES,
183 |         len(test_loader),
184 |         cfg.DATA.MULTI_LABEL,
185 |         cfg.DATA.ENSEMBLE_METHOD,
186 |     )
187 | 
188 |     # Set up writer for logging to Tensorboard format.
189 |     if cfg.TENSORBOARD.ENABLE and du.is_master_proc(
190 |         cfg.NUM_GPUS * cfg.NUM_SHARDS
191 |     ):
192 |         writer = tb.TensorboardWriter(cfg)
193 |     else:
194 |         writer = None
195 | 
196 |     # Perform multi-view test on the entire dataset.
197 |     test_meter = perform_test(test_loader, model, test_meter, cfg, writer)
198 |     if writer is not None:
199 |         writer.close()
200 | 
--------------------------------------------------------------------------------
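The clip-to-video ensembling described in perform_test above (uniformly sampled clips, three spatial crops, softmax scores averaged into a single video-level prediction) can be summarized with the short sketch below. It is illustrative only: the names ensemble_video_prediction and clip_logits are not part of the repository, and the 30-view / 400-class shapes are example values.

import torch


def ensemble_video_prediction(clip_logits: torch.Tensor) -> torch.Tensor:
    """Average softmax scores over all views of a single video.

    clip_logits: (num_views, num_classes) raw logits, one row per sampled
    clip/crop of the same video (e.g. 10 temporal clips x 3 spatial crops).
    Returns a (num_classes,) tensor of video-level scores.
    """
    # Softmax each view independently, then average across views.
    return torch.softmax(clip_logits, dim=1).mean(dim=0)


# Example: 30 views of one video for a 400-class (Kinetics-400-like) model.
scores = ensemble_video_prediction(torch.randn(30, 400))
print(scores.topk(5).indices)  # indices of the five highest-scoring classes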