├── FEATURE_ZOO.md ├── GUIDELINES.md ├── LICENSE ├── MODEL_ZOO.md ├── README.md ├── configs ├── pool │ ├── backbone │ │ ├── csn.yaml │ │ ├── localization-conv.yaml │ │ ├── r2d3ds.yaml │ │ ├── r2p1d.yaml │ │ ├── s3dg.yaml │ │ ├── slowfast_4x16.yaml │ │ ├── slowfast_8x8.yaml │ │ ├── tada2d.yaml │ │ ├── tadaconvnextv2_base.yaml │ │ ├── tadaconvnextv2_small.yaml │ │ ├── tadaconvnextv2_tiny.yaml │ │ ├── tadaformer_b16.yaml │ │ ├── tadaformer_l14.yaml │ │ ├── timesformer.yaml │ │ ├── vivit.yaml │ │ └── vivit_fac_enc.yaml │ ├── base.yaml │ └── run │ │ └── training │ │ ├── finetune.yaml │ │ ├── from_scratch.yaml │ │ ├── from_scratch_large.yaml │ │ ├── localization.yaml │ │ └── mosi.yaml └── projects │ ├── epic-kitchen-ar │ ├── csn_ek100.yaml │ ├── csn_ek100_submission.yaml │ ├── ek100 │ │ ├── csn.yaml │ │ ├── csn_submit.yaml │ │ ├── csn_test.yaml │ │ ├── vivit_fac_enc.yaml │ │ ├── vivit_fac_enc_submit.yaml │ │ └── vivit_fac_enc_test.yaml │ ├── k400 │ │ ├── vivit_fac_enc_b16x2.yaml │ │ └── vivit_fac_enc_b16x2_test.yaml │ ├── vivit_fac_enc_ek100.yaml │ ├── vivit_fac_enc_ek100_submission.yaml │ └── vivit_fac_enc_k400.yaml │ ├── epic-kitchen-tal │ ├── bmn-epic │ │ └── vivit-os-local.yaml │ └── bmn_epic.yaml │ ├── mosi │ ├── baselines │ │ ├── r2d3ds_hmdb.yaml │ │ ├── r2d3ds_ucf.yaml │ │ ├── r2p1d_hmdb.yaml │ │ └── r2p1d_ucf.yaml │ ├── ft-hmdb │ │ ├── r2d3ds.yaml │ │ ├── r2d3ds_test.yaml │ │ ├── r2p1d.yaml │ │ └── r2p1d_test.yaml │ ├── ft-ucf │ │ ├── r2d3ds.yaml │ │ ├── r2d3ds_test.yaml │ │ ├── r2p1d.yaml │ │ └── r2p1d_test.yaml │ ├── ft_r2d3ds_hmdb.yaml │ ├── ft_r2d3ds_ucf.yaml │ ├── ft_r2p1d_hmdb.yaml │ ├── ft_r2p1d_ucf.yaml │ ├── mosi_r2d3ds_hmdb.yaml │ ├── mosi_r2d3ds_imagenet.yaml │ ├── mosi_r2d3ds_ucf.yaml │ ├── mosi_r2p1d_hmdb.yaml │ ├── mosi_r2p1d_ucf.yaml │ ├── pt-hmdb │ │ ├── r2d3ds.yaml │ │ └── r2p1d.yaml │ ├── pt-imagenet │ │ └── r2d3ds.yaml │ └── pt-ucf │ │ ├── r2d3ds.yaml │ │ └── r2p1d.yaml │ ├── tada │ ├── k400 │ │ ├── tada2d_16x5.yaml │ │ └── tada2d_8x8.yaml │ ├── ssv2 │ │ ├── tada2d_16f.yaml │ │ └── tada2d_8f.yaml │ ├── tada2d_k400.yaml │ └── tada2d_ssv2.yaml │ ├── tadaconvnextv2 │ ├── tadaconvnextv2_base_k400_16f.yaml │ ├── tadaconvnextv2_base_ssv2_16f.yaml │ ├── tadaconvnextv2_small_k400_16f.yaml │ ├── tadaconvnextv2_small_ssv2_16f.yaml │ ├── tadaconvnextv2_tiny_k400_16f.yaml │ └── tadaconvnextv2_tiny_ssv2_16f.yaml │ └── tadaformer │ ├── tadaformer_b16_k400_16f.yaml │ ├── tadaformer_b16_ssv2_16f.yaml │ ├── tadaformer_l14_k400_16f.yaml │ └── tadaformer_l14_ssv2_16f.yaml ├── projects ├── epic-kitchen-ar │ └── README.md ├── epic-kitchen-tal │ └── README.md ├── mosi │ ├── MoSI.png │ └── README.md ├── tada │ ├── README.md │ └── TAda2D.png └── tadaconvv2 │ ├── README.md │ └── TAdaConvV2.png ├── runs ├── run.py ├── submission_test.py ├── test.py ├── test_epic_localization.py └── train.py └── tadaconv ├── datasets ├── __init__.py ├── base │ ├── __init__.py │ ├── base_dataset.py │ ├── builder.py │ ├── epickitchen100.py │ ├── epickitchen100_feature.py │ ├── hmdb51.py │ ├── imagenet.py │ ├── kinetics400.py │ ├── ssv2.py │ └── ucf101.py └── utils │ ├── __init__.py │ ├── auto_augment.py │ ├── collate_functions.py │ ├── mixup.py │ ├── preprocess_ssv2.py │ ├── random_erasing.py │ └── transformations.py ├── models ├── __init__.py ├── base │ ├── __init__.py │ ├── backbone.py │ ├── base_blocks.py │ ├── builder.py │ ├── models.py │ ├── slowfast.py │ └── transformer.py ├── module_zoo │ ├── __init__.py │ ├── branches │ │ ├── __init__.py │ │ ├── csn_branch.py │ │ ├── non_local.py │ │ ├── r2d3d_branch.py │ 
│ ├── r2plus1d_branch.py │ │ ├── s3dg_branch.py │ │ ├── slowfast_branch.py │ │ ├── tada_branch.py │ │ ├── tadaconvnextv2.py │ │ └── tadaformer.py │ ├── heads │ │ ├── __init__.py │ │ ├── bmn_head.py │ │ ├── mosi_head.py │ │ ├── slowfast_head.py │ │ └── transformer_head.py │ ├── ops │ │ ├── __init__.py │ │ ├── misc.py │ │ ├── tadaconv.py │ │ └── tadaconv_v2.py │ └── stems │ │ ├── __init__.py │ │ ├── downsample_stem.py │ │ ├── embedding_stem.py │ │ └── r2plus1d_stem.py └── utils │ ├── init_helper.py │ ├── lars.py │ ├── localization_losses.py │ ├── losses.py │ ├── lr_policy.py │ ├── model_ema.py │ ├── optimizer.py │ └── params.py ├── sslgenerators ├── __init__.py ├── builder.py └── mosi │ └── mosi_generator.py └── utils ├── __init__.py ├── bboxes_1d.py ├── bucket.py ├── checkpoint.py ├── config.py ├── distributed.py ├── eval_tal ├── eval_epic_detection.py └── eval_tal.py ├── launcher.py ├── logging.py ├── meters.py ├── metrics.py ├── misc.py ├── registry.py ├── sampler.py ├── tal_tools.py ├── tensor.py ├── timer.py └── val_dist_sampler.py /FEATURE_ZOO.md: -------------------------------------------------------------------------------- 1 | # FEATURE ZOO 2 | 3 | Here, we provide strong features for temporal action localization on HACS and Epic-Kitchens-100. 4 | 5 | | dataset | model | resolution | features | classification | average mAP | 6 | | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | 7 | | EK100 | TAda2D | 8 x 8 | [features:code dc05](https://pan.baidu.com/s/1YS9yj_O21HedIxyh2PMrqw) | [classification:code 2j51](https://pan.baidu.com/s/1z7h7OAFR2UO_Q7t8dA6YbQ) | 13.18 (A) | 8 | | HACS | TAda2D | 8 x 8 | [features:code 23kv](https://pan.baidu.com/s/1FHkRFvJldtEmD8kzYw_yMQ) | - | 32.3 | 9 | | EK100 | ViViT Fact. Enc.-B16x2 | 32 x 2 | coming soon | coming soon | 18.30 (A) | 10 | 11 | Annotations used for temporal action localization with our codebase can be found [here:code r30w](https://pan.baidu.com/s/16CtY0zTIzgDpm7sjhCAA6w). 12 | 13 | Pre-trained localization models using these features can be found in [MODEL_ZOO.md](MODEL_ZOO.md). 14 | 15 | ## Guideline 16 | 17 | ### Feature preparation 18 | After downloading the compressed feature files, first extract the `.pkl` files as follows. For example, for the TAda2D HACS features: 19 | 20 | ```bash 21 | cat features_s16_fps30_val_2G.tar.gz?? | tar zx 22 | cat features_s16_fps30_train_2G.tar.gz?? | tar zx 23 | ``` 24 | 25 | After running the above commands, you should have two folders named `features_s16_fps30_train` and `features_s16_fps30_val`, each containing a number of `.pkl` files. Each `.pkl` file corresponds to one video. 26 | 27 | ### Feature loading 28 | To load the features, please use the `load_feature` function in `datasets/base/epickitchen100_feature.py`: 29 | 30 | ```python 31 | def load_feature(path): 32 | if type(path) is str: # local file path 33 | with open(path, 'rb') as f: 34 | data = torch.load(f) 35 | else: # file-like object (e.g. an in-memory buffer) 36 | data = torch.load(path) 37 | return data 38 | ``` 39 | 40 | ### Feature concatenation 41 | For **Epic-Kitchens-100**, we divide each video into multiple clips, each 5 seconds long. To perform action localization, features are first concatenated using the `_transform_feature_scale` function in `datasets/base/epickitchen100_feature.py`. For example, during training, an action segment `[8.5, 16.1]` requires the features of three clips: `[[5.0, 10.0], [10.0, 15.0], [15.0, 20.0]]`. From these clip features, we obtain the feature for the ground-truth action segment.
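The snippet below is a minimal sketch of this mapping, not the actual `_transform_feature_scale` implementation: the helper names `clips_covering` and `concat_clip_features`, as well as the per-clip `[T, C]` feature layout, are illustrative assumptions.

```python
import math
import torch

CLIP_LEN = 5.0  # each Epic-Kitchens-100 clip covers 5 seconds

def clips_covering(segment_start, segment_end, clip_len=CLIP_LEN):
    """Indices of the fixed-length clips that cover a time segment.

    For the segment [8.5, 16.1] this returns [1, 2, 3], i.e. the clips
    spanning [5.0, 10.0], [10.0, 15.0] and [15.0, 20.0].
    """
    first = int(math.floor(segment_start / clip_len))
    last = int(math.ceil(segment_end / clip_len))
    return list(range(first, last))

def concat_clip_features(clip_features, segment_start, segment_end):
    """Concatenate the covering per-clip features (assumed [T, C] tensors) along time."""
    indices = clips_covering(segment_start, segment_end)
    return torch.cat([clip_features[i] for i in indices], dim=0)
```

The concatenated feature then spans `[5.0, 20.0]`, from which the portion corresponding to the ground-truth segment is taken.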
For more details, please refer to [epickitchen100_feature.py](datasets/base/epickitchen100_feature.py). -------------------------------------------------------------------------------- /GUIDELINES.md: -------------------------------------------------------------------------------- 1 | # Guidelines for pytorch-video-understanding 2 | 3 | ## Installation 4 | 5 | Requirements: 6 | - Python>=3.6 7 | - torch>=1.5 8 | - torchvision (version corresponding with torch) 9 | - simplejson==3.11.1 10 | - decord>=0.6.0 11 | - pyyaml 12 | - einops 13 | - oss2 14 | - psutil 15 | - tqdm 16 | - pandas 17 | 18 | Optional requirements: 19 | - fvcore (for flops calculation) 20 | 21 | ## Data preparation 22 | 23 | For all datasets available in `datasets/base`, the name of each dataset's list file is specified in the `_get_dataset_list_name` function. 24 | Here we provide a table summarizing the names and formats of all the dataset lists. 25 | 26 | | dataset | split | list file name | format | 27 | | ------- | ----- | -------------- | ------ | 28 | | epic-kitchens-100 | train | EPIC_100_train.csv | as downloaded | 29 | | epic-kitchens-100 | val | EPIC_100_validation.csv | as downloaded | 30 | | epic-kitchens-100 | test | EPIC_100_test_timestamps.csv | as downloaded | 31 | | hmdb51 | train/val | hmdb51_train_list.txt/hmdb51_val_list.txt | "video_path, supervised_label" | 32 | | imagenet | train/val | imagenet_train.txt/imagenet_val.txt | "image_path, supervised_label" | 33 | | kinetics 400 | train/val | kinetics400_train_list.txt/kinetics400_val_list.txt | "video_path, supervised_label" | 34 | | ssv2 | train | something-something-v2-train-with-label.json | json file with "label_idx" specifying the class and "id" specifying the name | 35 | | ssv2 | val | something-something-v2-val-with-label.json | json file with "label_idx" specifying the class and "id" specifying the name | 36 | | ucf101 | train/val | ucf101_train_list.txt/ucf101_val_list.txt | "video_path, supervised_label" | 37 | 38 | For the epic-kitchens features, the list file name is specified in the respective configs in `configs/projects/epic-kitchen-tal`. 39 | 40 | ### Preprocessing Something-Something-V2 dataset 41 | 42 | We found that the video decoder we use, [decord](https://github.com/dmlc/decord), has difficulty decoding the original `.webm` files. So we provide a script for converting the `.webm` files in the original something-something-v2 dataset to `.mp4` files. To do this, simply run: 43 | 44 | ```bash 45 | python datasets/utils/preprocess_ssv2_annos.py --anno --anno_path path_to_your_annotation 46 | python datasets/utils/preprocess_ssv2_annos.py --data --data_path path_to_your_ssv2_videos --data_out_path path_to_put_output_videos 47 | ``` 48 | 49 | Remember to make sure the annotation files are organized as follows: 50 | ``` 51 | -- path_to_your_annotation 52 | -- something-something-v2-train.json 53 | -- something-something-v2-validation.json 54 | -- something-something-v2-labels.json 55 | ``` 56 | 57 | ## Running 58 | 59 | The entry file for all runs is `runs/run.py`. 60 | 61 | Before running, some settings need to be configured in the config file. 62 | The codebase is designed to be experiment-friendly for rapid development of new models and representation learning approaches, in that the config files are organized hierarchically. 63 | 64 | Taking TAda2D as an example, each experiment (such as TAda2D_8x8 on Kinetics 400: `configs/projects/tada/k400/tada2d_8x8.yaml`) inherits its config from the following hierarchy.
65 | ``` 66 | --- base config file [configs/pool/base.yaml] 67 | --- base run config [configs/pool/run/training/from_scratch_large.yaml] 68 | --- base backbone config [configs/pool/backbone/tada2d.yaml] 69 | --- base experiment config [configs/projects/tada/tada2d_k400.yaml] 70 | --- current experiment config [configs/projects/tada/k400/tada2d_8x8.yaml] 71 | ``` 72 | Generally, the base config file `configs/pool/base.yaml` contains all the possible keys used in this codebase, and a config lower in the hierarchy overwrites its base configs whenever the same key appears in both files. 73 | A good practice is to put the parameters shared by all experiments in the base experiment config, and the parameters that differ between experiments in the current experiment config. 74 | 75 | For an example run, open `configs/projects/tada/tada2d_k400.yaml`: 76 | A. Set `DATA.DATA_ROOT_DIR` and `DATA.ANNO_DIR` to point to the Kinetics 400 data and annotations. 77 | B. Set `NUM_GPUS` to the number of available GPUs. 78 | Then the codebase can be run by: 79 | ``` 80 | python runs/run.py --cfg configs/projects/tada/k400/tada2d_8x8.yaml 81 | ``` -------------------------------------------------------------------------------- /configs/pool/backbone/csn.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: irCSN 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 152 6 | META_ARCH: ResNet3D 7 | NUM_FILTERS: [64, 256, 512, 1024, 2048] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 2048 10 | KERNEL_SIZE: [ 11 | [3, 7, 7], 12 | [3, 3, 3], 13 | [3, 3, 3], 14 | [3, 3, 3], 15 | [3, 3, 3] 16 | ] 17 | DOWNSAMPLING: [true, false, true, true, true] 18 | DOWNSAMPLING_TEMPORAL: [false, false, true, true, true] 19 | NUM_STREAMS: 1 20 | EXPANSION_RATIO: 4 21 | BRANCH: 22 | NAME: CSNBranch 23 | STEM: 24 | NAME: DownSampleStem 25 | NONLOCAL: 26 | ENABLE: false 27 | STAGES: [5] 28 | MASK_ENABLE: false 29 | HEAD: 30 | NAME: BaseHead 31 | ACTIVATION: softmax 32 | DROPOUT_RATE: 0 33 | NUM_CLASSES: # !!! 34 | -------------------------------------------------------------------------------- /configs/pool/backbone/localization-conv.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: BaseVideoModel 3 | VIDEO: 4 | DIM1D: 256 5 | DIM2D: 128 6 | DIM3D: 512 7 | BACKBONE_LAYER: 2 8 | BACKBONE_GROUPS_NUM: 4 9 | BACKBONE: 10 | META_ARCH: SimpleLocalizationConv -------------------------------------------------------------------------------- /configs/pool/backbone/r2d3ds.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: R2D3D 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 18 6 | META_ARCH: ResNet3D 7 | NUM_FILTERS: [64, 64, 128, 256, 256] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 256 10 | KERNEL_SIZE: [ 11 | [1, 7, 7], 12 | [1, 3, 3], 13 | [1, 3, 3], 14 | [3, 3, 3], 15 | [3, 3, 3] 16 | ] 17 | DOWNSAMPLING: [true, false, true, true, true] 18 | DOWNSAMPLING_TEMPORAL: [false, false, false, true, true] 19 | NUM_STREAMS: 1 20 | EXPANSION_RATIO: 2 21 | BRANCH: 22 | NAME: R2D3DBranch 23 | STEM: 24 | NAME: DownSampleStem 25 | NONLOCAL: 26 | ENABLE: false 27 | STAGES: [5] 28 | MASK_ENABLE: false 29 | HEAD: 30 | NAME: BaseHead 31 | ACTIVATION: softmax 32 | DROPOUT_RATE: 0 33 | NUM_CLASSES: # !!!
34 | -------------------------------------------------------------------------------- /configs/pool/backbone/r2p1d.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: R2Plus1D 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 10 6 | META_ARCH: ResNet3D 7 | NUM_INPUT_CHANNELS: 3 8 | NUM_FILTERS: [64, 64, 128, 256, 512] 9 | NUM_OUT_FEATURES: 512 10 | KERNEL_SIZE: [ 11 | [3, 7, 7], 12 | [3, 3, 3], 13 | [3, 3, 3], 14 | [3, 3, 3], 15 | [3, 3, 3] 16 | ] 17 | DOWNSAMPLING: [true, false, true, true, true] 18 | DOWNSAMPLING_TEMPORAL: [false, false, true, true, true] 19 | NUM_STREAMS: 1 20 | EXPANSION_RATIO: 2 21 | BRANCH: 22 | NAME: R2Plus1DBranch 23 | STEM: 24 | NAME: R2Plus1DStem 25 | NONLOCAL: 26 | ENABLE: false 27 | STAGES: [5] 28 | MASK_ENABLE: false 29 | HEAD: 30 | NAME: BaseHead 31 | ACTIVATION: softmax 32 | DROPOUT_RATE: 0 33 | NUM_CLASSES: # !!! 34 | -------------------------------------------------------------------------------- /configs/pool/backbone/s3dg.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: S3DG 3 | VIDEO: 4 | BACKBONE: 5 | META_ARCH: Inception3D 6 | NUM_OUT_FEATURES: 1024 7 | NUM_STREAMS: 1 8 | BRANCH: 9 | NAME: STConv3d 10 | GATING: true 11 | STEM: 12 | NAME: STConv3d 13 | NONLOCAL: 14 | ENABLE: false 15 | STAGES: [5] 16 | MASK_ENABLE: false 17 | HEAD: 18 | NAME: BaseHead 19 | ACTIVATION: softmax 20 | DROPOUT_RATE: 0 21 | NUM_CLASSES: # !!! -------------------------------------------------------------------------------- /configs/pool/backbone/slowfast_4x16.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: SlowFast_4x16 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 50 6 | META_ARCH: Slowfast 7 | NUM_FILTERS: [64, 256, 512, 1024, 2048] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 2048 10 | KERNEL_SIZE: [ 11 | [ 12 | [1, 7, 7], 13 | [1, 3, 3], 14 | [1, 3, 3], 15 | [1, 3, 3], 16 | [1, 3, 3], 17 | ], 18 | [ 19 | [5, 7, 7], 20 | [1, 3, 3], 21 | [1, 3, 3], 22 | [1, 3, 3], 23 | [1, 3, 3], 24 | ], 25 | ] 26 | DOWNSAMPLING: [true, false, true, true, true] 27 | DOWNSAMPLING_TEMPORAL: [false, false, false, false, false] 28 | TEMPORAL_CONV_BOTTLENECK: 29 | [ 30 | [false, false, false, true, true], # slow branch, 31 | [false, true, true, true, true] # fast branch 32 | ] 33 | NUM_STREAMS: 1 34 | EXPANSION_RATIO: 4 35 | BRANCH: 36 | NAME: SlowfastBranch 37 | STEM: 38 | NAME: DownSampleStem 39 | SLOWFAST: 40 | MODE: slowfast 41 | ALPHA: 8 42 | BETA: 8 # slow fast channel ratio 43 | CONV_CHANNEL_RATIO: 2 44 | KERNEL_SIZE: 5 45 | FUSION_CONV_BIAS: false 46 | FUSION_BN: true 47 | FUSION_RELU: true 48 | NONLOCAL: 49 | ENABLE: false 50 | STAGES: [5] 51 | MASK_ENABLE: false 52 | HEAD: 53 | NAME: SlowFastHead 54 | ACTIVATION: softmax 55 | DROPOUT_RATE: 0 56 | NUM_CLASSES: # !!! 
57 | DATA: 58 | NUM_INPUT_FRAMES: 32 59 | SAMPLING_RATE: 2 60 | -------------------------------------------------------------------------------- /configs/pool/backbone/slowfast_8x8.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: SlowFast_8x8 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 50 6 | META_ARCH: Slowfast 7 | NUM_FILTERS: [64, 256, 512, 1024, 2048] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 2048 10 | KERNEL_SIZE: [ 11 | [ 12 | [1, 7, 7], 13 | [1, 3, 3], 14 | [1, 3, 3], 15 | [1, 3, 3], 16 | [1, 3, 3], 17 | ], 18 | [ 19 | [5, 7, 7], 20 | [1, 3, 3], 21 | [1, 3, 3], 22 | [1, 3, 3], 23 | [1, 3, 3], 24 | ], 25 | ] 26 | DOWNSAMPLING: [true, false, true, true, true] 27 | DOWNSAMPLING_TEMPORAL: [false, false, false, false, false] 28 | TEMPORAL_CONV_BOTTLENECK: 29 | [ 30 | [false, false, false, true, true], # slow branch, 31 | [false, true, true, true, true] # fast branch 32 | ] 33 | NUM_STREAMS: 1 34 | EXPANSION_RATIO: 4 35 | BRANCH: 36 | NAME: SlowfastBranch 37 | STEM: 38 | NAME: DownSampleStem 39 | SLOWFAST: 40 | MODE: slowfast 41 | ALPHA: 4 42 | BETA: 8 # slow fast channel ratio 43 | CONV_CHANNEL_RATIO: 2 44 | KERNEL_SIZE: 7 45 | FUSION_CONV_BIAS: false 46 | FUSION_BN: true 47 | FUSION_RELU: true 48 | NONLOCAL: 49 | ENABLE: false 50 | STAGES: [5] 51 | MASK_ENABLE: false 52 | HEAD: 53 | NAME: SlowFastHead 54 | ACTIVATION: softmax 55 | DROPOUT_RATE: 0 56 | NUM_CLASSES: # !!! 57 | DATA: 58 | NUM_INPUT_FRAMES: 32 59 | SAMPLING_RATE: 2 -------------------------------------------------------------------------------- /configs/pool/backbone/tada2d.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: TAda2D 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 50 6 | META_ARCH: ResNet3D 7 | NUM_FILTERS: [64, 256, 512, 1024, 2048] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 2048 10 | KERNEL_SIZE: [ 11 | [1, 7, 7], 12 | [1, 3, 3], 13 | [1, 3, 3], 14 | [1, 3, 3], 15 | [1, 3, 3] 16 | ] 17 | DOWNSAMPLING: [true, true, true, true, true] 18 | DOWNSAMPLING_TEMPORAL: [false, false, false, false, false] 19 | NUM_STREAMS: 1 20 | EXPANSION_RATIO: 4 21 | INITIALIZATION: kaiming 22 | STEM: 23 | NAME: Base2DStem 24 | BRANCH: 25 | NAME: TAda2DBlock 26 | ROUTE_FUNC_K: [3, 3] 27 | ROUTE_FUNC_R: 4 28 | POOL_K: [3, 1, 1] 29 | NONLOCAL: 30 | ENABLE: false 31 | STAGES: [5] 32 | MASK_ENABLE: false 33 | HEAD: 34 | NAME: BaseHead 35 | ACTIVATION: softmax 36 | DROPOUT_RATE: 0 37 | NUM_CLASSES: # !!! 38 | -------------------------------------------------------------------------------- /configs/pool/backbone/tadaconvnextv2_base.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: TAdaConvNeXtV2-Base 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: [3, 3, 27, 3] 6 | META_ARCH: ConvNeXt 7 | NUM_FILTERS: [128, 256, 512, 1024] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 1024 10 | DROP_PATH: 0.6 11 | LARGE_SCALE_INIT_VALUE: 1e-6 12 | STEM: 13 | T_KERNEL_SIZE: 3 14 | T_STRIDE: 2 15 | BRANCH: 16 | NAME: TAdaConvNeXtV2Block 17 | ROUTE_FUNC_K: [3, 3] 18 | ROUTE_FUNC_R: 2 19 | HEAD_DIM: 64 20 | HEAD: 21 | NAME: BaseHead 22 | ACTIVATION: softmax 23 | DROPOUT_RATE: 0 24 | NUM_CLASSES: # !!! 
25 | 26 | -------------------------------------------------------------------------------- /configs/pool/backbone/tadaconvnextv2_small.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: TAdaConvNeXtV2-Small 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: [3, 3, 27, 3] 6 | META_ARCH: ConvNeXt 7 | NUM_FILTERS: [96, 192, 384, 768] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 768 10 | DROP_PATH: 0.4 11 | LARGE_SCALE_INIT_VALUE: 1e-6 12 | STEM: 13 | T_KERNEL_SIZE: 3 14 | T_STRIDE: 2 15 | BRANCH: 16 | NAME: TAdaConvNeXtV2Block 17 | ROUTE_FUNC_K: [3, 3] 18 | ROUTE_FUNC_R: 2 19 | HEAD_DIM: 48 20 | HEAD: 21 | NAME: BaseHead 22 | ACTIVATION: softmax 23 | DROPOUT_RATE: 0 24 | NUM_CLASSES: # !!! 25 | 26 | -------------------------------------------------------------------------------- /configs/pool/backbone/tadaconvnextv2_tiny.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: TAdaConvNeXtV2-Tiny 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: [3, 3, 9, 3] 6 | META_ARCH: ConvNeXt 7 | NUM_FILTERS: [96, 192, 384, 768] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 768 10 | DROP_PATH: 0.2 11 | LARGE_SCALE_INIT_VALUE: 1e-6 12 | STEM: 13 | T_KERNEL_SIZE: 3 14 | T_STRIDE: 2 15 | BRANCH: 16 | NAME: TAdaConvNeXtV2Block 17 | ROUTE_FUNC_K: [3, 3] 18 | ROUTE_FUNC_R: 2 19 | HEAD_DIM: 48 20 | HEAD: 21 | NAME: BaseHead 22 | ACTIVATION: softmax 23 | DROPOUT_RATE: 0 24 | NUM_CLASSES: # !!! 25 | 26 | -------------------------------------------------------------------------------- /configs/pool/backbone/tadaformer_b16.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: TAdaFormer_B16 3 | 4 | VIDEO: 5 | BACKBONE: 6 | META_ARCH: VisionTransformer 7 | INPUT_RES: 224 8 | PATCH_SIZE: 16 9 | TUBLET_SIZE: 3 10 | TUBLET_STRIDE: 2 11 | NUM_FEATURES: 768 12 | NUM_OUT_FEATURES: 768 13 | DEPTH: 12 14 | NUM_HEADS: 12 15 | DROP_PATH: 0.0 16 | ATTN_DROPOUT: 0.0 17 | REQUIRE_PROJ: false 18 | ATTN_MASK_ENABLE: false 19 | DOUBLE_TADA: false 20 | FREEZE: false 21 | REDUCTION: 2 22 | BRANCH: 23 | NAME: TAdaFormerBlock 24 | ROUTE_FUNC_K: [3, 3] 25 | ROUTE_FUNC_R: 2 26 | TEMP_ENHANCE: false 27 | HEAD: 28 | NAME: BaseHead 29 | OUTPUT_DIM: 512 -------------------------------------------------------------------------------- /configs/pool/backbone/tadaformer_l14.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: TAdaFormer_L14 3 | 4 | VIDEO: 5 | BACKBONE: 6 | META_ARCH: VisionTransformer 7 | INPUT_RES: 224 8 | PATCH_SIZE: 14 9 | TUBLET_SIZE: 3 10 | TUBLET_STRIDE: 2 11 | NUM_FEATURES: 1024 12 | NUM_OUT_FEATURES: 1024 13 | DEPTH: 24 14 | NUM_HEADS: 16 15 | DROP_PATH: 0.0 16 | ATTN_DROPOUT: 0.0 17 | REQUIRE_PROJ: false 18 | ATTN_MASK_ENABLE: false 19 | DOUBLE_TADA: false 20 | FREEZE: false 21 | REDUCTION: 2 22 | BRANCH: 23 | NAME: TAdaFormerBlock 24 | ROUTE_FUNC_K: [3, 3] 25 | ROUTE_FUNC_R: 2 26 | TEMP_ENHANCE: false 27 | HEAD: 28 | NAME: BaseHead 29 | OUTPUT_DIM: 512 -------------------------------------------------------------------------------- /configs/pool/backbone/timesformer.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: timesformer 3 | VIDEO: 4 | BACKBONE: 5 | META_ARCH: Transformer 6 | NUM_FEATURES: 768 7 | NUM_OUT_FEATURES: 768 8 | PATCH_SIZE: 16 9 | DEPTH: 12 10 | NUM_HEADS: 12 11 | DIM_HEAD: 64 12 | ATTN_DROPOUT: 0.1 13 | FF_DROPOUT: 0.1 14 | DROP_PATH: 0.0 15 | 
PRE_LOGITS: false 16 | STEM: 17 | NAME: PatchEmbedStem 18 | BRANCH: 19 | NAME: TimesformerLayer 20 | NONLOCAL: 21 | ENABLE: false 22 | STAGES: [5] 23 | MASK_ENABLE: false 24 | HEAD: 25 | NAME: TransformerHead 26 | ACTIVATION: softmax 27 | DROPOUT_RATE: 0 28 | NUM_CLASSES: # !!! -------------------------------------------------------------------------------- /configs/pool/backbone/vivit.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: vivit 3 | VIDEO: 4 | BACKBONE: 5 | META_ARCH: Transformer 6 | NUM_FEATURES: 768 7 | NUM_OUT_FEATURES: 768 8 | PATCH_SIZE: 16 9 | TUBELET_SIZE: 2 10 | DEPTH: 12 11 | NUM_HEADS: 12 12 | DIM_HEAD: 64 13 | ATTN_DROPOUT: 0.0 14 | FF_DROPOUT: 0.0 15 | DROP_PATH: 0.1 16 | MLP_MULT: 4 17 | STEM: 18 | NAME: TubeletEmbeddingStem 19 | BRANCH: 20 | NAME: BaseTransformerLayer 21 | HEAD: 22 | NAME: TransformerHead 23 | ACTIVATION: softmax 24 | DROPOUT_RATE: 0 25 | NUM_CLASSES: # !!! 26 | PRE_LOGITS: false 27 | TRAIN: 28 | CHECKPOINT_PRE_PROCESS: 29 | ENABLE: true 30 | POP_HEAD: true 31 | POS_EMBED: repeat 32 | PATCH_EMBD: central_frame -------------------------------------------------------------------------------- /configs/pool/backbone/vivit_fac_enc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: vivit 3 | VIDEO: 4 | BACKBONE: 5 | META_ARCH: FactorizedTransformer 6 | NUM_FEATURES: 768 7 | NUM_OUT_FEATURES: 768 8 | PATCH_SIZE: 16 9 | TUBELET_SIZE: 2 10 | DEPTH: 12 11 | DEPTH_TEMP: 4 12 | NUM_HEADS: 12 13 | DIM_HEAD: 64 14 | ATTN_DROPOUT: 0.0 15 | FF_DROPOUT: 0.0 16 | DROP_PATH: 0.1 17 | MLP_MULT: 4 18 | STEM: 19 | NAME: TubeletEmbeddingStem 20 | BRANCH: 21 | NAME: BaseTransformerLayer 22 | HEAD: 23 | NAME: TransformerHead 24 | ACTIVATION: softmax 25 | DROPOUT_RATE: 0 26 | NUM_CLASSES: # !!! 
27 | PRE_LOGITS: false 28 | TRAIN: 29 | CHECKPOINT_PRE_PROCESS: 30 | ENABLE: true 31 | POP_HEAD: true 32 | POS_EMBED: 33 | PATCH_EMBD: central_frame -------------------------------------------------------------------------------- /configs/pool/base.yaml: -------------------------------------------------------------------------------- 1 | TASK_TYPE: classification 2 | PRETRAIN: 3 | ENABLE: false 4 | LOCALIZATION: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: false 8 | DATASET: 9 | BATCH_SIZE: 128 10 | LOG_FILE: training_log.log 11 | EVAL_PERIOD: 10 12 | NUM_FOLDS: 1 13 | AUTO_RESUME: true 14 | CHECKPOINT_PERIOD: 10 15 | INIT: "" 16 | CHECKPOINT_FILE_PATH: "" 17 | CHECKPOINT_TYPE: pytorch 18 | CHECKPOINT_INFLATE: false 19 | CHECKPOINT_PRE_PROCESS: 20 | ENABLE: false 21 | FINE_TUNE: false 22 | ONLY_LINEAR: false 23 | LR_REDUCE: false 24 | TRAIN_VAL_COMBINE: false 25 | TEST: 26 | ENABLE: false 27 | DATASET: 28 | BATCH_SIZE: 100 29 | NUM_SPATIAL_CROPS: 1 30 | SPATIAL_CROPS: cc 31 | NUM_ENSEMBLE_VIEWS: 1 32 | LOG_FILE: val.log 33 | CHECKPOINT_FILE_PATH: "" 34 | CHECKPOINT_TYPE: pytorch 35 | AUTOMATIC_MULTI_SCALE_TEST: true 36 | VISUALIZATION: 37 | ENABLE: false 38 | NAME: "" 39 | FEATURE_MAPS: 40 | ENABLE: false 41 | BASE_OUTPUT_DIR: "" 42 | SUBMISSION: 43 | ENABLE: false 44 | SAVE_RESULTS_PATH: "test.json" 45 | DATA: 46 | DATA_ROOT_DIR: /data_root/ 47 | ANNO_DIR: /anno_dir/ 48 | NUM_INPUT_FRAMES: 16 49 | NUM_INPUT_CHANNELS: 3 50 | SAMPLING_MODE: interval_based 51 | SAMPLING_RATE: 4 52 | TRAIN_JITTER_SCALES: [168, 224] 53 | TRAIN_CROP_SIZE: 112 54 | TEST_SCALE: 224 55 | TEST_CROP_SIZE: 112 56 | MEAN: [0.45, 0.45, 0.45] 57 | STD: [0.225, 0.225, 0.225] 58 | MULTI_LABEL: false 59 | ENSEMBLE_METHOD: sum 60 | TARGET_FPS: 30 61 | MINUS_INTERVAL: false 62 | MODEL: 63 | NAME: 64 | EMA: 65 | ENABLE: false 66 | DECAY: 0.99996 67 | VIDEO: 68 | BACKBONE: 69 | DEPTH: 70 | META_ARCH: 71 | NUM_FILTERS: 72 | NUM_INPUT_CHANNELS: 3 73 | NUM_OUT_FEATURES: 74 | KERNEL_SIZE: 75 | DOWNSAMPLING: 76 | DOWNSAMPLING_TEMPORAL: 77 | NUM_STREAMS: 1 78 | EXPANSION_RATIO: 2 79 | BRANCH: 80 | NAME: 81 | STEM: 82 | NAME: 83 | NONLOCAL: 84 | ENABLE: false 85 | STAGES: [5] 86 | MASK_ENABLE: false 87 | INITIALIZATION: 88 | HEAD: 89 | NAME: BaseHead 90 | ACTIVATION: softmax 91 | DROPOUT_RATE: 0 92 | NUM_CLASSES: 93 | OPTIMIZER: 94 | ADJUST_LR: false 95 | BASE_LR: 0.002 96 | LR_POLICY: cosine 97 | MAX_EPOCH: 300 98 | MOMENTUM: 0.9 99 | WEIGHT_DECAY: 1e-3 100 | WARMUP_EPOCHS: 10 101 | WARMUP_START_LR: 0.0002 102 | OPTIM_METHOD: adam 103 | DAMPENING: 0.0 104 | NESTEROV: true 105 | BIAS_DOUBLE: false 106 | NEW_PARAMS: [] 107 | NEW_PARAMS_MULT: 10 108 | NEW_PARAMS_WD_MULT: 1 109 | LAYER_WISE_LR_DECAY: 1.0 110 | COSINE_AFTER_WARMUP: false 111 | COSINE_END_LR: 1e-6 112 | BN: 113 | WB_LOCK: false 114 | FREEZE: false 115 | WEIGHT_DECAY: 0.0 116 | MOMENTUM: 0.1 117 | EPS: 1e-5 118 | SYNC: false 119 | DATA_LOADER: 120 | NUM_WORKERS: 4 121 | PIN_MEMORY: false 122 | ENABLE_MULTI_THREAD_DECODE: true 123 | COLLATE_FN: 124 | NUM_GPUS: 8 125 | SHARD_ID: 0 126 | NUM_SHARDS: 1 127 | RANDOM_SEED: 0 128 | OUTPUT_DIR: output/ 129 | OUTPUT_CFG_FILE: configuration.log 130 | LOG_PERIOD: 10 131 | DIST_BACKEND: nccl 132 | LOG_MODEL_INFO: true 133 | LOG_CONFIG_INFO: true 134 | OSS: 135 | ENABLE: false 136 | KEY: 137 | SECRET: 138 | ENDPOINT: 139 | CHECKPOINT_OUTPUT_PATH: # !!@7 140 | SECONDARY_DATA_OSS: 141 | ENABLE: false 142 | KEY: 143 | SECRET: 144 | ENDPOINT: 145 | BUCKETS: [""] 146 | AUGMENTATION: 147 | COLOR_AUG: false 148 | BRIGHTNESS: 0.5 149 | CONTRAST: 
0.5 150 | SATURATION: 0.5 151 | HUE: 0.25 152 | GRAYSCALE: 0.3 153 | CONSISTENT: true 154 | SHUFFLE: true 155 | GRAY_FIRST: true 156 | RATIO: [0.857142857142857, 1.1666666666666667] 157 | USE_GPU: false 158 | MIXUP: 159 | ENABLE: false 160 | ALPHA: 0.0 161 | PROB: 1.0 162 | MODE: batch 163 | SWITCH_PROB: 0.5 164 | CUTMIX: 165 | ENABLE: false 166 | ALPHA: 0.0 167 | MINMAX: 168 | RANDOM_ERASING: 169 | ENABLE: false 170 | PROB: 0.25 171 | MODE: const 172 | COUNT: [1, 1] 173 | NUM_SPLITS: 0 174 | AREA_RANGE: [0.02, 0.33] 175 | MIN_ASPECT: 0.3 176 | LABEL_SMOOTHING: 0.0 177 | SSV2_FLIP: false 178 | PAI: false 179 | USE_MULTISEG_VAL_DIST: false -------------------------------------------------------------------------------- /configs/pool/run/training/finetune.yaml: -------------------------------------------------------------------------------- 1 | PRETRAIN: 2 | ENABLE: false 3 | TRAIN: 4 | ENABLE: true 5 | DATASET: # !!@1 6 | BATCH_SIZE: 1024 7 | LOG_FILE: training_log.log 8 | LOSS_FUNC: cross_entropy 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 30 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: true 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: true # !!@3 20 | DATASET: # !!@3 21 | BATCH_SIZE: 1024 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: true 29 | DATA: 30 | DATA_ROOT_DIR: 31 | ANNO_DIR: 32 | NUM_INPUT_FRAMES: 16 33 | NUM_INPUT_CHANNELS: 3 34 | SAMPLING_MODE: interval_based 35 | SAMPLING_RATE: 4 36 | TRAIN_JITTER_SCALES: [168, 224] 37 | TRAIN_CROP_SIZE: 112 38 | TEST_SCALE: 224 39 | TEST_CROP_SIZE: 112 40 | MEAN: [0.45, 0.45, 0.45] 41 | STD: [0.225, 0.225, 0.225] 42 | MULTI_LABEL: false 43 | ENSEMBLE_METHOD: sum 44 | FPS: 30 45 | TARGET_FPS: 30 46 | OPTIMIZER: 47 | BASE_LR: 0.002 48 | LR_POLICY: cosine 49 | MAX_EPOCH: 300 50 | MOMENTUM: 0.9 51 | WEIGHT_DECAY: 1e-3 52 | WARMUP_EPOCHS: 10 53 | WARMUP_START_LR: 0.0002 54 | OPTIM_METHOD: adam 55 | DAMPENING: 0.0 56 | NESTEROV: true 57 | BN: 58 | WEIGHT_DECAY: 0.0 59 | EPS: 1e-3 60 | DATA_LOADER: 61 | NUM_WORKERS: 4 62 | PIN_MEMORY: false 63 | ENABLE_MULTI_THREAD_DECODE: true 64 | NUM_GPUS: 8 65 | SHARD_ID: 0 66 | NUM_SHARDS: 1 67 | RANDOM_SEED: 0 68 | OUTPUT_DIR: 69 | OUTPUT_CFG_FILE: configuration.log 70 | LOG_PERIOD: 10 71 | DIST_BACKEND: nccl 72 | LOG_MODEL_INFO: true 73 | LOG_CONFIG_INFO: true 74 | AUGMENTATION: 75 | COLOR_AUG: true 76 | BRIGHTNESS: 0.5 77 | CONTRAST: 0.5 78 | SATURATION: 0.5 79 | HUE: 0.25 80 | GRAYSCALE: 0.3 81 | CONSISTENT: true 82 | SHUFFLE: true 83 | GRAY_FIRST: true 84 | RATIO: [0.857142857142857, 1.1666666666666667] 85 | USE_GPU: true 86 | PAI: false 87 | 88 | -------------------------------------------------------------------------------- /configs/pool/run/training/from_scratch.yaml: -------------------------------------------------------------------------------- 1 | PRETRAIN: 2 | ENABLE: false 3 | TRAIN: 4 | ENABLE: true 5 | DATASET: # !!@1 6 | BATCH_SIZE: 1024 7 | LOG_FILE: training_log.log 8 | LOSS_FUNC: cross_entropy 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 30 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: false # !!@3 20 | DATASET: # !!@3 21 | BATCH_SIZE: 1024 22 | NUM_SPATIAL_CROPS: 1 23 | 
SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: true 29 | DATA: 30 | DATA_ROOT_DIR: 31 | ANNO_DIR: 32 | NUM_INPUT_FRAMES: 16 33 | NUM_INPUT_CHANNELS: 3 34 | SAMPLING_MODE: interval_based 35 | SAMPLING_RATE: 4 36 | TRAIN_JITTER_SCALES: [168, 224] 37 | TRAIN_CROP_SIZE: 112 38 | TEST_SCALE: 224 39 | TEST_CROP_SIZE: 112 40 | MEAN: [0.45, 0.45, 0.45] 41 | STD: [0.225, 0.225, 0.225] 42 | MULTI_LABEL: false 43 | ENSEMBLE_METHOD: sum 44 | FPS: 30 45 | TARGET_FPS: 30 46 | OPTIMIZER: 47 | BASE_LR: 0.002 48 | LR_POLICY: cosine 49 | MAX_EPOCH: 300 50 | MOMENTUM: 0.9 51 | WEIGHT_DECAY: 1e-3 52 | WARMUP_EPOCHS: 10 53 | WARMUP_START_LR: 0.0002 54 | OPTIM_METHOD: adam 55 | DAMPENING: 0.0 56 | NESTEROV: true 57 | BN: 58 | WEIGHT_DECAY: 0.0 59 | EPS: 1e-3 60 | DATA_LOADER: 61 | NUM_WORKERS: 4 62 | PIN_MEMORY: false 63 | ENABLE_MULTI_THREAD_DECODE: true 64 | NUM_GPUS: 8 65 | SHARD_ID: 0 66 | NUM_SHARDS: 1 67 | RANDOM_SEED: 0 68 | OUTPUT_DIR: 69 | OUTPUT_CFG_FILE: configuration.log 70 | LOG_PERIOD: 10 71 | DIST_BACKEND: nccl 72 | LOG_MODEL_INFO: true 73 | LOG_CONFIG_INFO: true 74 | AUGMENTATION: 75 | COLOR_AUG: true 76 | BRIGHTNESS: 0.5 77 | CONTRAST: 0.5 78 | SATURATION: 0.5 79 | HUE: 0.25 80 | GRAYSCALE: 0.3 81 | CONSISTENT: true 82 | SHUFFLE: true 83 | GRAY_FIRST: true 84 | RATIO: [0.857142857142857, 1.1666666666666667] 85 | USE_GPU: true 86 | PAI: false -------------------------------------------------------------------------------- /configs/pool/run/training/from_scratch_large.yaml: -------------------------------------------------------------------------------- 1 | PRETRAIN: 2 | ENABLE: false 3 | TRAIN: 4 | ENABLE: true 5 | DATASET: # !!@1 6 | BATCH_SIZE: 256 # 256 for 32 gpus 7 | LOG_FILE: training_log.log 8 | LOSS_FUNC: cross_entropy 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 1 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 5 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: true # !!@3 20 | DATASET: # !!@3 21 | BATCH_SIZE: 256 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: true 29 | AUTOMATIC_MULTI_SCALE_TEST_SPATIAL: true 30 | DATA: 31 | DATA_ROOT_DIR: 32 | ANNO_DIR: 33 | NUM_INPUT_FRAMES: 16 34 | NUM_INPUT_CHANNELS: 3 35 | SAMPLING_MODE: interval_based 36 | SAMPLING_RATE: 4 37 | TRAIN_JITTER_SCALES: [256, 320] 38 | TRAIN_CROP_SIZE: 224 39 | TEST_SCALE: 224 40 | TEST_CROP_SIZE: 224 41 | MEAN: [0.45, 0.45, 0.45] 42 | STD: [0.225, 0.225, 0.225] 43 | MULTI_LABEL: false 44 | ENSEMBLE_METHOD: sum 45 | FPS: 30 46 | TARGET_FPS: 30 47 | OPTIMIZER: 48 | BASE_LR: 0.001 49 | ADJUST_LR: false 50 | LR_POLICY: cosine 51 | MAX_EPOCH: 100 52 | MOMENTUM: 0.9 53 | WEIGHT_DECAY: 1e-4 54 | WARMUP_EPOCHS: 10 55 | WARMUP_START_LR: 0.0001 56 | OPTIM_METHOD: adam 57 | DAMPENING: 0.0 58 | NESTEROV: true 59 | BN: 60 | WEIGHT_DECAY: 0.0 61 | DATA_LOADER: 62 | NUM_WORKERS: 8 63 | PIN_MEMORY: false 64 | ENABLE_MULTI_THREAD_DECODE: true 65 | NUM_GPUS: 32 66 | SHARD_ID: 0 67 | NUM_SHARDS: 1 68 | RANDOM_SEED: 0 69 | OUTPUT_DIR: 70 | OUTPUT_CFG_FILE: configuration.log 71 | LOG_PERIOD: 10 72 | DIST_BACKEND: nccl 73 | LOG_MODEL_INFO: true 74 | LOG_CONFIG_INFO: true 75 | AUGMENTATION: 76 | COLOR_AUG: false 77 | BRIGHTNESS: 0.5 78 | CONTRAST: 0.5 79 | SATURATION: 0.5 80 | 
HUE: 0.25 81 | GRAYSCALE: 0.3 82 | CONSISTENT: true 83 | SHUFFLE: true 84 | GRAY_FIRST: true 85 | RATIO: [0.857142857142857, 1.1666666666666667] 86 | USE_GPU: false 87 | PAI: false -------------------------------------------------------------------------------- /configs/pool/run/training/localization.yaml: -------------------------------------------------------------------------------- 1 | TASK_TYPE: localization 2 | LOCALIZATION: 3 | ENABLE: true 4 | LOSS: Tem+PemReg+PemCls 5 | LOSS_WEIGHTS: [1,10,1] 6 | POS_CLS_THRES: 0.9 7 | POS_REG_THRES: 0.7 8 | NEG_REG_THRES: 0.3 9 | 10 | TEST_OUTPUT_DIR: ./output/ 11 | PROPS_DIR: prop_results 12 | PROPS_REGRESSION_LOSS: smoothl1 13 | RESULT_FILE: localization_detection_res 14 | CLASSIFIER_FILE: "" 15 | POST_PROCESS: 16 | THREAD: 32 17 | SOFT_NMS_ALPHA: 0.4 18 | SOFT_NMS_LOW_THRES: 0.0 19 | SOFT_NMS_HIGH_THRES: 0.0 20 | PROP_NUM: 100 21 | SELECT_SCORE: 0.0001 22 | SCORE_TYPE: 'cr' 23 | CLR_POWER: 1.2 24 | REG_POWER: 1.2 25 | IOU_POWER: 2.0 26 | TCA_POWER: 1.0 27 | ACTION_SCORE_POWER: 1.0 28 | VIDEO_SCORES_WEIGHT: 1.0 29 | 30 | TRAIN: 31 | ENABLE: true 32 | DATASET: Epickitchen100Localization # !!@1 33 | BATCH_SIZE: 64 34 | LOG_FILE: training_log.log 35 | EVAL_PERIOD: 1 36 | NUM_FOLDS: 1 37 | AUTO_RESUME: true 38 | CHECKPOINT_PERIOD: 1 39 | CHECKPOINT_FILE_PATH: "" # !!@2 40 | CHECKPOINT_TYPE: pytorch 41 | CHECKPOINT_INFLATE: false 42 | FINE_TUNE: false 43 | LR_REDUCE: false 44 | TEST: 45 | ENABLE: false # !!@3 46 | OUTPUT_TEST: false 47 | FORCE_FORWARD: false 48 | DATASET: Epickitchen100Localization # !!@3 49 | BATCH_SIZE: 128 50 | LOG_FILE: val.log 51 | TEST_SET: val 52 | CHECKPOINT_FILE_PATH: "" 53 | SAVE_RESULTS_PATH: "preds.log" 54 | CHECKPOINT_TYPE: pytorch 55 | AUTOMATIC_MULTI_SCALE_TEST: false 56 | TEST_CHECKPOINT: [7,8,9,10] 57 | 58 | DATA: 59 | DATA_ROOT_DIR: 60 | ANNO_DIR: 61 | TEMPORAL_SCALE: 200 62 | DURATION_SCALE: -1 63 | TEMPORAL_MODE: resize 64 | NUM_INPUT_CHANNELS: 2304 65 | TEMPORAL_INTERVAL: 0.53333333 66 | NORM_FEATURE: true 67 | ANNO_NAME: "" 68 | LABELS_TYPE: bmn 69 | 70 | SOLVER: 71 | BASE_LR: 0.001 72 | ADJUST_LR: true 73 | LR_POLICY: cosine 74 | MAX_EPOCH: 10 75 | MOMENTUM: 0.9 76 | WEIGHT_DECAY: 1e-4 77 | WARMUP_EPOCHS: 1 78 | WARMUP_START_LR: 0.0001 79 | OPTIM_METHOD: adam 80 | DAMPENING: 0.0 81 | NESTEROV: true 82 | BN: 83 | USE_BN: false 84 | WEIGHT_DECAY: 0.0 85 | DATA_LOADER: 86 | NUM_WORKERS: 8 87 | PIN_MEMORY: true 88 | 89 | NUM_GPUS: 8 90 | SHARD_ID: 0 91 | NUM_SHARDS: 1 92 | RANDOM_SEED: 0 93 | OUTPUT_DIR: output/test 94 | OUTPUT_CFG_FILE: configuration.log 95 | LOG_PERIOD: 10 96 | DIST_BACKEND: nccl 97 | DEBUG_MODE: false 98 | LOG_MODEL_INFO: true 99 | LOG_CONFIG_INFO: true 100 | OSS: 101 | ENABLE: false 102 | PAI: true 103 | -------------------------------------------------------------------------------- /configs/pool/run/training/mosi.yaml: -------------------------------------------------------------------------------- 1 | PRETRAIN: 2 | ENABLE: true 3 | GENERATOR: MoSIGenerator 4 | LOSS: MoSIJoint 5 | LOSS_WEIGHTS: [1] 6 | DISTANCE_JITTER: [1, 1] 7 | SCALE_JITTER: false 8 | NUM_FRAMES: 16 9 | DATA_MODE: xy 10 | DECOUPLE: true 11 | FRAME_SIZE_STANDARDIZE_ENABLE: true 12 | STANDARD_SIZE: 320 13 | LABEL_MODE: joint # seperate / joint 14 | ZERO_OUT: false 15 | STATIC_MASK: true 16 | ASPECT_RATIO: [1, 1] 17 | MASK_SIZE_RATIO: [0.3, 0.5] 18 | NUM_CLIPS_PER_VIDEO: 1 19 | TRAIN: 20 | ENABLE: true 21 | DATASET: # !!@1 22 | BATCH_SIZE: 80 # 80 for 8 gpus 23 | LOG_FILE: training_log.log 24 | EVAL_PERIOD: 5 25 | NUM_FOLDS: 
1 26 | AUTO_RESUME: true 27 | CHECKPOINT_PERIOD: 10 28 | CHECKPOINT_FILE_PATH: "" # !!@2 29 | CHECKPOINT_TYPE: pytorch 30 | CHECKPOINT_INFLATE: false 31 | FINE_TUNE: false 32 | ONLY_LINEAR: false 33 | TEST: 34 | ENABLE: false # !!@3 35 | DATASET: # !!@3 36 | BATCH_SIZE: 80 # 80 for 8 gpus 37 | NUM_SPATIAL_CROPS: 1 38 | SPATIAL_CROPS: cc 39 | NUM_ENSEMBLE_VIEWS: 1 40 | LOG_FILE: val.log 41 | CHECKPOINT_FILE_PATH: "" 42 | CHECKPOINT_TYPE: pytorch 43 | AUTOMATIC_MULTI_SCALE_TEST: false 44 | DATA: 45 | DATA_ROOT_DIR: 46 | ANNO_DIR: 47 | NUM_INPUT_FRAMES: 1 48 | NUM_INPUT_CHANNELS: 3 49 | SAMPLING_MODE: interval_based 50 | SAMPLING_RATE: 4 51 | TRAIN_JITTER_SCALES: [168, 224] 52 | TRAIN_CROP_SIZE: 112 53 | TEST_SCALE: 224 54 | TEST_CROP_SIZE: 112 55 | MEAN: [0.45, 0.45, 0.45] 56 | STD: [0.225, 0.225, 0.225] 57 | MULTI_LABEL: false 58 | ENSEMBLE_METHOD: sum 59 | FPS: 30 60 | TARGET_FPS: 30 61 | OPTIMIZER: 62 | BASE_LR: 0.001 63 | LR_POLICY: cosine 64 | MAX_EPOCH: 100 65 | MOMENTUM: 0.9 66 | WEIGHT_DECAY: 1e-4 67 | WARMUP_EPOCHS: 10 68 | WARMUP_START_LR: 0.0001 69 | OPTIM_METHOD: adam 70 | DAMPENING: 0.0 71 | NESTEROV: true 72 | BN: 73 | WEIGHT_DECAY: 0.0 74 | EPS: 1e-3 75 | DATA_LOADER: 76 | NUM_WORKERS: 4 77 | PIN_MEMORY: false 78 | ENABLE_MULTI_THREAD_DECODE: true 79 | NUM_GPUS: 8 80 | SHARD_ID: 0 81 | NUM_SHARDS: 1 82 | RANDOM_SEED: 0 83 | OUTPUT_DIR: 84 | OUTPUT_CFG_FILE: configuration.log 85 | LOG_PERIOD: 10 86 | DIST_BACKEND: nccl 87 | LOG_MODEL_INFO: true 88 | LOG_CONFIG_INFO: true 89 | AUGMENTATION: 90 | COLOR_AUG: true 91 | BRIGHTNESS: 0.5 92 | CONTRAST: 0.5 93 | SATURATION: 0.5 94 | HUE: 0.25 95 | GRAYSCALE: 0.3 96 | CONSISTENT: false 97 | SHUFFLE: true 98 | GRAY_FIRST: true 99 | RATIO: [0.857142857142857, 1.1666666666666667] 100 | USE_GPU: true 101 | PAI: false 102 | 103 | MODEL: 104 | NAME: MoSINet 105 | VIDEO: 106 | HEAD: 107 | NAME: MoSIHeadJoint 108 | NUM_CLASSES: 5 109 | DROPOUT_RATE: 0.5 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/csn_ek100.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/csn.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: epickitchen100 9 | BATCH_SIZE: 256 10 | CHECKPOINT_FILE_PATH: "" 11 | TEST: 12 | ENABLE: true 13 | DATASET: epickitchen100 14 | BATCH_SIZE: 256 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/clips_512/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/annos/epic-kitchens-100-annotations-master/ 18 | NUM_INPUT_FRAMES: 32 19 | SAMPLING_RATE: 2 20 | TEST_SCALE: 256 21 | TEST_CROP_SIZE: 256 22 | MULTI_LABEL: true 23 | TARGET_FPS: 60 24 | VIDEO: 25 | HEAD: 26 | NAME: BaseHeadx2 27 | NUM_CLASSES: [97, 300] 28 | DROPOUT_RATE: 0.5 29 | DATA_LOADER: 30 | NUM_WORKERS: 4 31 | OPTIMIZER: 32 | BASE_LR: 0.0001 33 | ADJUST_LR: false 34 | LR_POLICY: cosine 35 | MAX_EPOCH: 50 36 | MOMENTUM: 0.9 37 | WEIGHT_DECAY: 0.05 38 | WARMUP_EPOCHS: 5 39 | WARMUP_START_LR: 0.000001 40 | OPTIM_METHOD: adamw 41 | DAMPENING: 0.0 42 | NESTEROV: true 43 | NUM_GPUS: 32 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/csn_ek100_submission.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/csn.yaml 3 | 4 | PRETRAIN: 5 | 
ENABLE: false 6 | TRAIN: 7 | ENABLE: false 8 | DATASET: epickitchen100 9 | BATCH_SIZE: 256 10 | CHECKPOINT_FILE_PATH: "" 11 | TEST: 12 | ENABLE: false 13 | DATASET: epickitchen100 14 | BATCH_SIZE: 256 15 | SUBMISSION: 16 | ENABLE: true 17 | ACTION_CLASS_ENSUMBLE_METHOD: "sum" # sum or calculate 18 | TASK_TYPE: submission 19 | DATA: 20 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/clips_512/ 21 | ANNO_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/annos/epic-kitchens-100-annotations-master/ 22 | NUM_INPUT_FRAMES: 32 23 | SAMPLING_RATE: 2 24 | TEST_SCALE: 256 25 | TEST_CROP_SIZE: 256 26 | MULTI_LABEL: true 27 | TARGET_FPS: 60 28 | VIDEO: 29 | HEAD: 30 | NAME: BaseHeadx2 31 | NUM_CLASSES: [97, 300] 32 | DROPOUT_RATE: 0.5 33 | DATA_LOADER: 34 | NUM_WORKERS: 4 35 | NUM_GPUS: 32 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/csn.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../csn_ek100.yaml 2 | TRAIN: 3 | CHECKPOINT_PERIOD: 1 4 | CHECKPOINT_FILE_PATH: "" # pretrained weights from K400/K700/IG65M... 5 | FINE_TUNE: true 6 | CHECKPOINT_PRE_PROCESS: 7 | ENABLE: true 8 | POP_HEAD: true 9 | POS_EMBED: 10 | PATCH_EMBD: 11 | AUGMENTATION: 12 | COLOR_AUG: true 13 | BRIGHTNESS: 0.5 14 | CONTRAST: 0.5 15 | SATURATION: 0.5 16 | HUE: 0.25 17 | GRAYSCALE: 0.0 18 | CONSISTENT: true 19 | SHUFFLE: false 20 | GRAY_FIRST: false 21 | USE_GPU: false 22 | MIXUP: 23 | ENABLE: true 24 | ALPHA: 0.2 25 | PROB: 1.0 26 | MODE: batch 27 | SWITCH_PROB: 0.5 28 | CUTMIX: 29 | ENABLE: true 30 | ALPHA: 1.0 31 | MINMAX: 32 | RANDOM_ERASING: 33 | ENABLE: true 34 | PROB: 0.25 35 | MODE: pixel 36 | COUNT: [1, 1] 37 | NUM_SPLITS: 0 38 | AREA_RANGE: [0.02, 0.33] 39 | MIN_ASPECT: 0.3 40 | LABEL_SMOOTHING: 0.2 41 | BN: 42 | WB_LOCK: false 43 | FREEZE: true 44 | OUTPUT_DIR: output/csn_ek100 45 | -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/csn_submit.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../csn_ek100_submission.yaml 2 | TRAIN: 3 | CHECKPOINT_FILE_PATH: ./checkpoints/csn152_pt_k700_ft_ek100_32x224x224_4452_public.pyth 4 | BATCH_SIZE: 256 5 | TEST: 6 | BATCH_SIZE: 256 7 | OUTPUT_DIR: output/csn_ek100_submit -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/csn_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../csn_ek100.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/csn152_pt_k700_ft_ek100_32x224x224_4452_public.pyth 5 | BN: 6 | WB_LOCK: false 7 | FREEZE: true 8 | OUTPUT_DIR: output/csn_ek100_test 9 | -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/vivit_fac_enc.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../vivit_fac_enc_ek100.yaml 2 | TRAIN: 3 | CHECKPOINT_PERIOD: 1 4 | EVAL_PERIOD: 1 5 | CHECKPOINT_FILE_PATH: "" # directory of pretrained models 6 | FINE_TUNE: true 7 | BATCH_SIZE: 128 8 | CHECKPOINT_PRE_PROCESS: 9 | ENABLE: true 10 | POP_HEAD: true 11 | POS_EMBED: super-resolution 12 | PATCH_EMBD: 13 | 14 | DATA: 15 | TRAIN_JITTER_SCALES: [336, 448] 16 | TRAIN_CROP_SIZE: 320 17 | TEST_SCALE: 320 18 | TEST_CROP_SIZE: 320 19 | 20 | AUGMENTATION: 21 | COLOR_AUG: true 22 | BRIGHTNESS: 0.5 23 | CONTRAST: 
0.5 24 | SATURATION: 0.5 25 | HUE: 0.25 26 | GRAYSCALE: 0.0 27 | CONSISTENT: true 28 | SHUFFLE: false 29 | GRAY_FIRST: false 30 | USE_GPU: false 31 | MIXUP: 32 | ENABLE: true 33 | ALPHA: 0.2 34 | PROB: 1.0 35 | MODE: batch 36 | SWITCH_PROB: 0.5 37 | CUTMIX: 38 | ENABLE: true 39 | ALPHA: 1.0 40 | MINMAX: 41 | RANDOM_ERASING: 42 | ENABLE: true 43 | PROB: 0.25 44 | MODE: pixel 45 | COUNT: [1, 1] 46 | NUM_SPLITS: 0 47 | AREA_RANGE: [0.02, 0.33] 48 | MIN_ASPECT: 0.3 49 | LABEL_SMOOTHING: 0.2 50 | 51 | VIDEO: 52 | BACKBONE: 53 | DROP_PATH: 0.2 54 | HEAD: 55 | DROPOUT_RATE: 0.0 56 | 57 | DATA_LOADER: 58 | NUM_WORKERS: 8 59 | 60 | OUTPUT_DIR: output/vivit_fac_enc_ek100 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/vivit_fac_enc_submit.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../vivit_fac_enc_ek100_submission.yaml 2 | TRAIN: 3 | CHECKPOINT_PERIOD: 1 4 | EVAL_PERIOD: 1 5 | CHECKPOINT_FILE_PATH: ./checkpoints/vivit_fac_enc_b16x2_pt_k700_ft_ek100_32x224x224_4630_public.pyth 6 | FINE_TUNE: true 7 | BATCH_SIZE: 256 8 | 9 | DATA: 10 | TRAIN_JITTER_SCALES: [336, 448] 11 | TRAIN_CROP_SIZE: 320 12 | TEST_SCALE: 320 13 | TEST_CROP_SIZE: 320 14 | 15 | DATA_LOADER: 16 | NUM_WORKERS: 8 17 | 18 | OUTPUT_DIR: output/vivit_fac_enc_ek100_submit -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/vivit_fac_enc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../vivit_fac_enc_ek100.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/vivit_fac_enc_b16x2_pt_k700_ft_ek100_32x224x224_4630_public.pyth 5 | CHECKPOINT_PRE_PROCESS: 6 | ENABLE: true 7 | POP_HEAD: true 8 | POS_EMBED: super-resolution 9 | PATCH_EMBD: 10 | 11 | DATA: 12 | TRAIN_JITTER_SCALES: [336, 448] 13 | TRAIN_CROP_SIZE: 320 14 | TEST_SCALE: 320 15 | TEST_CROP_SIZE: 320 16 | 17 | DATA_LOADER: 18 | NUM_WORKERS: 8 19 | 20 | OUTPUT_DIR: output/vivit_fac_enc_ek100_test -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/k400/vivit_fac_enc_b16x2.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../vivit_fac_enc_k400.yaml 2 | TRAIN: 3 | CHECKPOINT_PERIOD: 1 4 | EVAL_PERIOD: 1 5 | CHECKPOINT_FILE_PATH: "" # directory to the pretrained imagenet vit b16 224 model 6 | FINE_TUNE: true 7 | OPTIMIZER: 8 | BASE_LR: 0.0001 9 | ADJUST_LR: false 10 | LR_POLICY: cosine 11 | MAX_EPOCH: 30 12 | MOMENTUM: 0.9 13 | WEIGHT_DECAY: 0.1 14 | WARMUP_EPOCHS: 2.5 15 | WARMUP_START_LR: 0.000001 16 | OPTIM_METHOD: adamw 17 | DAMPENING: 0.0 18 | NESTEROV: true 19 | MODEL: 20 | EMA: 21 | ENABLE: true 22 | DECAY: 0.999 23 | 24 | AUGMENTATION: 25 | COLOR_AUG: true 26 | BRIGHTNESS: 0.5 27 | CONTRAST: 0.5 28 | SATURATION: 0.5 29 | HUE: 0.25 30 | GRAYSCALE: 0.3 31 | CONSISTENT: true 32 | SHUFFLE: true 33 | GRAY_FIRST: true 34 | USE_GPU: false 35 | MIXUP: 36 | ENABLE: true 37 | ALPHA: 0.2 38 | PROB: 1.0 39 | MODE: batch 40 | SWITCH_PROB: 0.5 41 | LABEL_SMOOTHING: 0.1 42 | 43 | VIDEO: 44 | HEAD: 45 | DROPOUT_RATE: 0.0 46 | 47 | OUTPUT_DIR: output/vivit_fac_enc_k400 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/k400/vivit_fac_enc_b16x2_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: 
../vivit_fac_enc_k400.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: "./checkpoints/vivit_fac_enc_b16x2_k400_32x224x224_7935_public.pyth" 5 | 6 | OUTPUT_DIR: output/vivit_fac_enc_k400_test -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/vivit_fac_enc_ek100.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/vivit_fac_enc.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: epickitchen100 9 | BATCH_SIZE: 256 10 | CHECKPOINT_FILE_PATH: "" 11 | TEST: 12 | ENABLE: true 13 | DATASET: epickitchen100 14 | BATCH_SIZE: 256 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/clips_512/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/annos/epic-kitchens-100-annotations-master/ 18 | NUM_INPUT_FRAMES: 32 19 | SAMPLING_RATE: 2 20 | MULTI_LABEL: true 21 | TARGET_FPS: 60 22 | VIDEO: 23 | HEAD: 24 | NAME: TransformerHeadx2 25 | NUM_CLASSES: [97, 300] 26 | DROPOUT_RATE: 0.5 27 | 28 | DATA_LOADER: 29 | NUM_WORKERS: 4 30 | 31 | OPTIMIZER: 32 | BASE_LR: 0.0001 33 | ADJUST_LR: false 34 | LR_POLICY: cosine 35 | MAX_EPOCH: 50 36 | MOMENTUM: 0.9 37 | WEIGHT_DECAY: 0.05 38 | WARMUP_EPOCHS: 5 39 | WARMUP_START_LR: 0.000001 40 | OPTIM_METHOD: adamw 41 | DAMPENING: 0.0 42 | NESTEROV: true 43 | NUM_GPUS: 32 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/vivit_fac_enc_ek100_submission.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/vivit_fac_enc.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: false 8 | DATASET: epickitchen100 9 | BATCH_SIZE: 256 10 | CHECKPOINT_FILE_PATH: "" 11 | TEST: 12 | ENABLE: false 13 | DATASET: epickitchen100 14 | BATCH_SIZE: 256 15 | SUBMISSION: 16 | ENABLE: true 17 | ACTION_CLASS_ENSUMBLE_METHOD: "sum" # sum or calculate 18 | TASK_TYPE: submission 19 | DATA: 20 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/clips_512/ 21 | ANNO_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/annos/epic-kitchens-100-annotations-master/ 22 | NUM_INPUT_FRAMES: 32 23 | SAMPLING_RATE: 2 24 | MULTI_LABEL: true 25 | TARGET_FPS: 60 26 | VIDEO: 27 | HEAD: 28 | NAME: TransformerHeadx2 29 | NUM_CLASSES: [97, 300] 30 | DROPOUT_RATE: 0.5 31 | 32 | DATA_LOADER: 33 | NUM_WORKERS: 10 34 | NUM_GPUS: 32 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/vivit_fac_enc_k400.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/vivit_fac_enc.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: kinetics400 9 | BATCH_SIZE: 256 10 | CHECKPOINT_FILE_PATH: "" 11 | TEST: 12 | ENABLE: true 13 | DATASET: kinetics400 14 | BATCH_SIZE: 256 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 18 | SAMPLING_RATE: 2 19 | NUM_INPUT_FRAMES: 32 20 | VIDEO: 21 | HEAD: 22 | NUM_CLASSES: 400 23 | DROPOUT_RATE: 0.5 24 | 25 | DATA_LOADER: 26 | NUM_WORKERS: 4 27 | NUM_GPUS: 32 -------------------------------------------------------------------------------- 
/configs/projects/epic-kitchen-tal/bmn-epic/vivit-os-local.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../bmn_epic.yaml 2 | TRAIN: 3 | ENABLE: true 4 | BATCH_SIZE: 4 5 | CHECKPOINT_FILE_PATH: "" 6 | TEST: 7 | ENABLE: true 8 | BATCH_SIZE: 4 9 | TEST_CHECKPOINT: [9] 10 | CHECKPOINT_FILE_PATH: "" 11 | OUTPUT_DIR: /mnt/data-nas/qingzhiwu/results/checkpoints/epic_tal/vvt-os/ 12 | 13 | -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-tal/bmn_epic.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/localization.yaml 2 | _BASE_MODEL: ../../pool/backbone/localization-conv.yaml 3 | 4 | TRAIN: 5 | ENABLE: true 6 | BATCH_SIZE: 16 7 | DATASET: Epickitchen100Localization 8 | CHECKPOINT_FILE_PATH: # !!@2 9 | TEST: 10 | ENABLE: true 11 | BATCH_SIZE: 16 12 | DATASET: Epickitchen100Localization 13 | 14 | LOCALIZATION: 15 | ENABLE: true 16 | LOSS: Tem+PemReg+PemCls 17 | LOSS_WEIGHTS: [1,10,1,1] 18 | TEST_OUTPUT_DIR: ./output/ 19 | PROPS_DIR: prop_results 20 | RESULT_FILE: tal_detection_res 21 | CLASSIFIER_FILE: 22 | POST_PROCESS: 23 | PROP_NUM_RATIO: 2 24 | THREAD: 32 25 | SOFT_NMS_ALPHA: 0.4 26 | SOFT_NMS_LOW_THRES: 0.25 27 | SOFT_NMS_HIGH_THRES: 0.9 28 | PROP_NUM_RATIO: 1.0 29 | SELECT_SCORE: 0.0 30 | SCORE_TYPE: 'cr' 31 | CLR_POWER: 1.2 32 | REG_POWER: 1.0 33 | IOU_POWER: 2.0 34 | ACTION_SCORE_POWER: 1.0 35 | VIDEO_SCORES_WEIGHT: 1.0 36 | 37 | DATA: 38 | DATA_ROOT_DIR: [/mnt/data-nas/qingzhiwu/dataset/epic-tal/features/features_s8_fps60_320_-1_train/] 39 | ANNO_DIR: /mnt/data-nas/qingzhiwu/dataset/epic-tal/annotations/ 40 | VIDEO_LENGTH_FILE: epic_videos_len.txt 41 | ANNO_NAME: "EPIC_100_validation.json" 42 | TEMPORAL_SCALE: 200 43 | DURATION_SCALE: 100 44 | NUM_INPUT_CHANNELS: 6912 45 | NORM_FEATURE: false 46 | LABELS_TYPE: bmn 47 | LOAD_TYPE: torch 48 | CLIPS_LIST_FILE: 5s_clips.txt 49 | TARGET_FPS: 60 50 | NUM_INPUT_FRAMES: 32 51 | SAMPLING_RATE: 2 52 | CLIP_INTERVAL: 8 53 | MULTI_LABEL: true 54 | CLASSIFIER_ROOT_DIR: /mnt/data-nas/qingzhiwu/dataset/epic-tal/features/cls_res_s8_fps60_320_-1_train/ 55 | LOAD_CLASSIFIER_RES: true 56 | 57 | OPTIMIZER: 58 | BASE_LR: 0.002 59 | ADJUST_LR: true 60 | LR_POLICY: cosine 61 | MAX_EPOCH: 10 62 | MOMENTUM: 0.9 63 | WEIGHT_DECAY: 1e-4 64 | WARMUP_EPOCHS: 1 65 | WARMUP_START_LR: 0.00001 66 | OPTIM_METHOD: adamw 67 | DAMPENING: 0.0 68 | NESTEROV: true 69 | 70 | VIDEO: 71 | HEAD: 72 | NAME: BaseBMN 73 | ACTIVATION: sigmoid 74 | DROPOUT_RATE: 0 75 | NUM_SAMPLE: 32 76 | NUM_SAMPLE_PERBIN: 3 77 | BOUNDARY_RATIO: 0.5 78 | USE_BMN_REGRESSION: false 79 | 80 | LOG_PERIOD: 50 81 | USE_MULTISEG_VAL_DIST: true -------------------------------------------------------------------------------- /configs/projects/mosi/baselines/r2d3ds_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../../pool/run/training/from_scratch.yaml 2 | _BASE_MODEL: ../../../pool/backbone/r2d3ds.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: HMDB51 9 | CHECKPOINT_FILE_PATH: "" 10 | BATCH_SIZE: 1024 11 | TEST: 12 | ENABLE: true 13 | DATASET: HMDB51 14 | BATCH_SIZE: 1024 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 18 | VIDEO: 19 | HEAD: 20 | NUM_CLASSES: 51 21 | DROPOUT_RATE: 0.5 22 | OUTPUT_DIR: output/r2d3ds_hmdb_from_scratch 23 | NUM_GPUS: 8 
-------------------------------------------------------------------------------- /configs/projects/mosi/baselines/r2d3ds_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../../pool/run/training/from_scratch.yaml 2 | _BASE_MODEL: ../../../pool/backbone/r2d3ds.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: UCF101 9 | CHECKPOINT_FILE_PATH: "" 10 | BATCH_SIZE: 1024 11 | TEST: 12 | ENABLE: true 13 | DATASET: UCF101 14 | BATCH_SIZE: 1024 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 18 | VIDEO: 19 | HEAD: 20 | NUM_CLASSES: 101 21 | DROPOUT_RATE: 0.5 22 | OUTPUT_DIR: output/r2d3ds_ucf_from_scratch 23 | NUM_GPUS: 8 24 | -------------------------------------------------------------------------------- /configs/projects/mosi/baselines/r2p1d_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../../pool/run/training/from_scratch.yaml 2 | _BASE_MODEL: ../../../pool/backbone/r2p1d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: HMDB51 9 | CHECKPOINT_FILE_PATH: "" 10 | BATCH_SIZE: 384 11 | TEST: 12 | ENABLE: true 13 | DATASET: HMDB51 14 | BATCH_SIZE: 384 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 18 | VIDEO: 19 | HEAD: 20 | NUM_CLASSES: 51 21 | DROPOUT_RATE: 0.5 22 | OUTPUT_DIR: output/r2p1d_hmdb_from_scratch 23 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /configs/projects/mosi/baselines/r2p1d_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../../pool/run/training/from_scratch.yaml 2 | _BASE_MODEL: ../../../pool/backbone/r2p1d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: UCF101 9 | CHECKPOINT_FILE_PATH: "" 10 | BATCH_SIZE: 384 11 | TEST: 12 | ENABLE: true 13 | DATASET: UCF101 14 | BATCH_SIZE: 384 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 18 | VIDEO: 19 | HEAD: 20 | NUM_CLASSES: 101 21 | DROPOUT_RATE: 0.5 22 | OUTPUT_DIR: output/r2p1d_ucf_from_scratch 23 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /configs/projects/mosi/ft-hmdb/r2d3ds.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2d3ds_hmdb.yaml 2 | TRAIN: 3 | CHECKPOINT_FILE_PATH: ./checkpoints/r2d3ds_pt_hmdb_mosi_public.pyth 4 | OUTPUT_DIR: output/r2d3ds_mosi_ft_hmdb -------------------------------------------------------------------------------- /configs/projects/mosi/ft-hmdb/r2d3ds_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2d3ds_hmdb.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/r2d3ds_pt_hmdb_ft_hmdb_4693_public.pyth 5 | OUTPUT_DIR: output/r2d3ds_mosi_ft_hmdb_test -------------------------------------------------------------------------------- /configs/projects/mosi/ft-hmdb/r2p1d.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2p1d_hmdb.yaml 2 | TRAIN: 3 | CHECKPOINT_FILE_PATH: ./checkpoints/r2p1d_pt_hmdb_mosi_public.pyth 4 | OUTPUT_DIR: output/r2p1d_mosi_ft_hmdb -------------------------------------------------------------------------------- 
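The MoSI fine-tuning configs above point `TRAIN.CHECKPOINT_FILE_PATH` at pre-trained `.pyth` files and declare `CHECKPOINT_TYPE: pytorch`, so the released weights are ordinary PyTorch archives. A plain `torch.load` is enough to sanity-check a downloaded file before launching a run; the exact dictionary layout depends on how the repository saves its checkpoints, so treat the key listing below as exploratory rather than a documented format.

```python
import torch

# Path taken from the ft-hmdb config above; adjust it to wherever the file was downloaded.
ckpt_path = "./checkpoints/r2d3ds_pt_hmdb_mosi_public.pyth"

# .pyth files are standard torch checkpoints, so torch.load can open them directly.
checkpoint = torch.load(ckpt_path, map_location="cpu")
if isinstance(checkpoint, dict):
    print("top-level keys:", list(checkpoint.keys()))
else:
    print("checkpoint object of type:", type(checkpoint))
```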
/configs/projects/mosi/ft-hmdb/r2p1d_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2p1d_hmdb.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/r2p1d_pt_hmdb_ft_hmdb_5183_public.pyth 5 | OUTPUT_DIR: output/r2p1d_mosi_ft_hmdb_test -------------------------------------------------------------------------------- /configs/projects/mosi/ft-ucf/r2d3ds.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2d3ds_ucf.yaml 2 | TRAIN: 3 | CHECKPOINT_FILE_PATH: ./checkpoints/r2d3ds_pt_ucf_mosi_public.pyth 4 | OUTPUT_DIR: output/r2d3ds_mosi_ft_ucf -------------------------------------------------------------------------------- /configs/projects/mosi/ft-ucf/r2d3ds_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2d3ds_ucf.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/r2d3ds_pt_ucf_ft_ucf_7175_public.pyth 5 | OUTPUT_DIR: output/r2d3ds_mosi_ft_ucf_test -------------------------------------------------------------------------------- /configs/projects/mosi/ft-ucf/r2p1d.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2p1d_ucf.yaml 2 | TRAIN: 3 | CHECKPOINT_FILE_PATH: ./checkpoints/r2p1d_pt_ucf_mosi_public.pyth 4 | OUTPUT_DIR: output/r2p1d_mosi_ft_ucf 5 | -------------------------------------------------------------------------------- /configs/projects/mosi/ft-ucf/r2p1d_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2p1d_ucf.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/r2p1d_pt_ucf_ft_ucf_8279_public.pyth 5 | OUTPUT_DIR: output/r2p1d_mosi_ft_ucf_test 6 | -------------------------------------------------------------------------------- /configs/projects/mosi/ft_r2d3ds_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/finetune.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2d3ds.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: HMDB51 9 | CHECKPOINT_FILE_PATH: "" # !!@2 10 | BATCH_SIZE: 1024 11 | TEST: 12 | ENABLE: true 13 | DATASET: HMDB51 14 | BATCH_SIZE: 1024 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 18 | MINUS_INTERVAL: false 19 | VIDEO: 20 | HEAD: 21 | NUM_CLASSES: 51 22 | DROPOUT_RATE: 0.5 23 | OPTIMIZER: 24 | BASE_LR: 0.002 25 | WARMUP_START_LR: 0.0002 26 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /configs/projects/mosi/ft_r2d3ds_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/finetune.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2d3ds.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: UCF101 9 | CHECKPOINT_FILE_PATH: "" # !!@2 10 | BATCH_SIZE: 1024 11 | TEST: 12 | ENABLE: true 13 | DATASET: UCF101 14 | BATCH_SIZE: 1024 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 18 | MINUS_INTERVAL: false 19 | VIDEO: 20 | HEAD: 21 | NUM_CLASSES: 101 22 | DROPOUT_RATE: 0.5 23 | OPTIMIZER: 24 | BASE_LR: 0.004 25 | WARMUP_START_LR: 0.0004 26 | NUM_GPUS: 8 -------------------------------------------------------------------------------- 
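The fine-tuning configs above describe their learning-rate schedule declaratively: `LR_POLICY: cosine`, a `BASE_LR`, and a warmup phase whose `WARMUP_START_LR` is one tenth of the base rate. The helper below sketches the usual linear-warmup-then-cosine shape those fields imply; the repository's own scheduler (and its `ADJUST_LR` scaling) may differ in detail, and the `max_epoch`/`warmup_epochs` values in the demo call are placeholders, since they come from the `finetune.yaml` base config that is not shown in this excerpt.

```python
import math

def lr_at_epoch(epoch: float, base_lr: float, max_epoch: int,
                warmup_epochs: float, warmup_start_lr: float) -> float:
    """Linear warmup to BASE_LR, then cosine decay towards zero."""
    if epoch < warmup_epochs:
        # Linearly ramp from WARMUP_START_LR up to BASE_LR.
        alpha = epoch / warmup_epochs
        return warmup_start_lr + (base_lr - warmup_start_lr) * alpha
    # Cosine decay from BASE_LR over the remaining epochs.
    progress = (epoch - warmup_epochs) / (max_epoch - warmup_epochs)
    return 0.5 * base_lr * (1.0 + math.cos(math.pi * progress))

# BASE_LR / WARMUP_START_LR from ft_r2d3ds_ucf.yaml above; the epoch counts are placeholders.
for e in [0, 2, 10, 50, 100]:
    print(e, round(lr_at_epoch(e, base_lr=0.004, max_epoch=100,
                               warmup_epochs=10, warmup_start_lr=0.0004), 6))
```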
/configs/projects/mosi/ft_r2p1d_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/finetune.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2p1d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: HMDB51 9 | CHECKPOINT_FILE_PATH: "" # !!@2 10 | BATCH_SIZE: 384 11 | TEST: 12 | ENABLE: true 13 | DATASET: HMDB51 14 | BATCH_SIZE: 384 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 18 | MINUS_INTERVAL: false 19 | VIDEO: 20 | HEAD: 21 | NUM_CLASSES: 51 22 | DROPOUT_RATE: 0.5 23 | OPTIMIZER: 24 | BASE_LR: 0.00075 25 | WARMUP_START_LR: 0.000075 26 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /configs/projects/mosi/ft_r2p1d_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/finetune.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2p1d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: UCF101 9 | CHECKPOINT_FILE_PATH: "" # !!@2 10 | BATCH_SIZE: 384 11 | TEST: 12 | ENABLE: true 13 | DATASET: UCF101 14 | BATCH_SIZE: 384 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 18 | MINUS_INTERVAL: false 19 | VIDEO: 20 | HEAD: 21 | NUM_CLASSES: 101 22 | DROPOUT_RATE: 0.5 23 | OPTIMIZER: 24 | BASE_LR: 0.0015 25 | WARMUP_START_LR: 0.00015 26 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /configs/projects/mosi/mosi_r2d3ds_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/mosi.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2d3ds.yaml 3 | 4 | TRAIN: 5 | ENABLE: true 6 | DATASET: HMDB51 7 | BATCH_SIZE: 10 # 10 per gpu 8 | LOG_FILE: training_log.log 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 20 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: false 20 | DATASET: HMDB51 21 | BATCH_SIZE: 10 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: false 29 | DATA: 30 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 31 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 32 | NUM_GPUS: 16 -------------------------------------------------------------------------------- /configs/projects/mosi/mosi_r2d3ds_imagenet.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/mosi.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2d3ds.yaml 3 | 4 | PRETRAIN: 5 | IMAGENET_DATA_SIZE: 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: imagenet 9 | BATCH_SIZE: 10 # 10 per gpu 10 | LOG_FILE: training_log.log 11 | EVAL_PERIOD: 5 12 | NUM_FOLDS: 20 13 | AUTO_RESUME: true 14 | CHECKPOINT_PERIOD: 10 15 | CHECKPOINT_FILE_PATH: "" # !!@2p 16 | CHECKPOINT_TYPE: pytorch 17 | CHECKPOINT_INFLATE: false 18 | FINE_TUNE: false 19 | ONLY_LINEAR: false 20 | TEST: 21 | ENABLE: false 22 | DATASET: imagenet 23 | BATCH_SIZE: 10 24 | NUM_SPATIAL_CROPS: 1 25 | SPATIAL_CROPS: cc 26 | NUM_ENSEMBLE_VIEWS: 1 27 | LOG_FILE: val.log 28 | CHECKPOINT_FILE_PATH: "" 29 | CHECKPOINT_TYPE: pytorch 30 | 
AUTOMATIC_MULTI_SCALE_TEST: false 31 | DATA: 32 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/imagenet/ 33 | ANNO_DIR: /mnt/ziyuan/ziyuan/imagenet/ 34 | MEAN: [0.485, 0.456, 0.406] 35 | STD: [0.229, 0.224, 0.225] 36 | NUM_GPUS: 16 -------------------------------------------------------------------------------- /configs/projects/mosi/mosi_r2d3ds_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/mosi.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2d3ds.yaml 3 | 4 | TRAIN: 5 | ENABLE: true 6 | DATASET: UCF101 7 | BATCH_SIZE: 10 # 10 per gpu 8 | LOG_FILE: training_log.log 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 20 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: false 20 | DATASET: UCF101 21 | BATCH_SIZE: 10 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: false 29 | DATA: 30 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 31 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 32 | NUM_GPUS: 16 -------------------------------------------------------------------------------- /configs/projects/mosi/mosi_r2p1d_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/mosi.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2p1d.yaml 3 | 4 | TRAIN: 5 | ENABLE: true 6 | DATASET: HMDB51 7 | BATCH_SIZE: 5 # 5 per gpu 8 | LOG_FILE: training_log.log 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 20 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: false 20 | DATASET: HMDB51 21 | BATCH_SIZE: 5 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: false 29 | DATA: 30 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 31 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 32 | NUM_GPUS: 16 -------------------------------------------------------------------------------- /configs/projects/mosi/mosi_r2p1d_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/mosi.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2p1d.yaml 3 | 4 | TRAIN: 5 | ENABLE: true 6 | DATASET: UCF101 7 | BATCH_SIZE: 5 # 5 per gpu 8 | LOG_FILE: training_log.log 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 20 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: false 20 | DATASET: UCF101 21 | BATCH_SIZE: 5 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: false 29 | DATA: 30 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 31 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 32 | NUM_GPUS: 16 -------------------------------------------------------------------------------- /configs/projects/mosi/pt-hmdb/r2d3ds.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE: ../mosi_r2d3ds_hmdb.yaml 2 | TRAIN: 3 | EVAL_PERIOD: 10 4 | OUTPUT_DIR: output/r2d3ds_pt_hmdb -------------------------------------------------------------------------------- /configs/projects/mosi/pt-hmdb/r2p1d.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../mosi_r2p1d_hmdb.yaml 2 | TRAIN: 3 | EVAL_PERIOD: 10 4 | OUTPUT_DIR: output/r2p1d_pt_hmdb 5 | -------------------------------------------------------------------------------- /configs/projects/mosi/pt-imagenet/r2d3ds.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../mosi_r2d3ds_imagenet.yaml 2 | TRAIN: 3 | EVAL_PERIOD: 10 4 | PRETRAIN: 5 | IMAGENET_DATA_SIZE: 5 6 | OUTPUT_DIR: output/r2d3ds_pt_imagenet -------------------------------------------------------------------------------- /configs/projects/mosi/pt-ucf/r2d3ds.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../mosi_r2d3ds_ucf.yaml 2 | TRAIN: 3 | EVAL_PERIOD: 10 4 | OUTPUT_DIR: output/r2d3ds_pt_ucf -------------------------------------------------------------------------------- /configs/projects/mosi/pt-ucf/r2p1d.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../mosi_r2p1d_ucf.yaml 2 | TRAIN: 3 | EVAL_PERIOD: 10 4 | OUTPUT_DIR: output/r2p1d_pt_ucf 5 | -------------------------------------------------------------------------------- /configs/projects/tada/k400/tada2d_16x5.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../tada2d_k400.yaml 2 | TRAIN: 3 | FINE_TUNE: true 4 | BATCH_SIZE: 64 5 | INIT: in1k 6 | CHECKPOINT_FILE_PATH: "" # pretrained imagenet weights 7 | OPTIMIZER: 8 | BASE_LR: 0.12 9 | DATA: 10 | SAMPLING_RATE: 5 11 | NUM_INPUT_FRAMES: 16 12 | OUTPUT_DIR: output/tada2d_16x5_k400 -------------------------------------------------------------------------------- /configs/projects/tada/k400/tada2d_8x8.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../tada2d_k400.yaml 2 | TRAIN: 3 | FINE_TUNE: true 4 | INIT: in1k 5 | CHECKPOINT_FILE_PATH: "" # pretrained imagenet weights 6 | DATA: 7 | SAMPLING_RATE: 8 8 | NUM_INPUT_FRAMES: 8 9 | OUTPUT_DIR: output/tada2d_8x8_k400 -------------------------------------------------------------------------------- /configs/projects/tada/ssv2/tada2d_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../tada2d_ssv2.yaml 2 | TRAIN: 3 | FINE_TUNE: true 4 | BATCH_SIZE: 64 5 | INIT: in1k 6 | CHECKPOINT_FILE_PATH: "" # pretrained imagenet weights 7 | DATA: 8 | NUM_INPUT_FRAMES: 16 9 | OPTIMIZER: 10 | BASE_LR: 0.24 11 | OUTPUT_DIR: output/tada2d_ssv2_16f -------------------------------------------------------------------------------- /configs/projects/tada/ssv2/tada2d_8f.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../tada2d_ssv2.yaml 2 | TRAIN: 3 | FINE_TUNE: true 4 | INIT: in1k 5 | CHECKPOINT_FILE_PATH: "" # pretrained imagenet weights 6 | DATA: 7 | NUM_INPUT_FRAMES: 8 8 | OUTPUT_DIR: output/tada2d_ssv2_8f -------------------------------------------------------------------------------- /configs/projects/tada/tada2d_k400.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: 
../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tada2d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: kinetics400 9 | BATCH_SIZE: 128 10 | FINE_TUNE: true 11 | INIT: in1k 12 | CHECKPOINT_FILE_PATH: "" # !!@2 13 | TEST: 14 | ENABLE: true 15 | DATASET: kinetics400 16 | BATCH_SIZE: 128 17 | DATA: 18 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 19 | ANNO_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 20 | SAMPLING_RATE: 8 21 | NUM_INPUT_FRAMES: 8 22 | TRAIN_JITTER_SCALES: [224, 340] 23 | TRAIN_CROP_SIZE: 224 24 | TEST_SCALE: 256 25 | TEST_CROP_SIZE: 256 26 | VIDEO: 27 | HEAD: 28 | NUM_CLASSES: 400 29 | DROPOUT_RATE: 0.5 30 | DATA_LOADER: 31 | NUM_WORKERS: 8 32 | OPTIMIZER: 33 | BASE_LR: 0.24 34 | ADJUST_LR: false 35 | LR_POLICY: cosine 36 | MAX_EPOCH: 100 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 1e-4 39 | WARMUP_EPOCHS: 8 40 | WARMUP_START_LR: 0.01 41 | OPTIM_METHOD: sgd 42 | DAMPENING: 0.0 43 | NESTEROV: true 44 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /configs/projects/tada/tada2d_ssv2.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tada2d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: ssv2 9 | BATCH_SIZE: 128 10 | FINE_TUNE: true 11 | INIT: in1k 12 | CHECKPOINT_FILE_PATH: "" 13 | TEST: 14 | ENABLE: true 15 | DATASET: ssv2 16 | BATCH_SIZE: 128 17 | DATA: 18 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ssv2/ 19 | ANNO_DIR: /mnt/ziyuan/ziyuan/ssv2/ 20 | NUM_INPUT_FRAMES: 8 21 | SAMPLING_MODE: segment_based 22 | TRAIN_JITTER_SCALES: [224, 340] 23 | TRAIN_CROP_SIZE: 224 24 | TEST_SCALE: 256 25 | TEST_CROP_SIZE: 256 26 | VIDEO: 27 | HEAD: 28 | NUM_CLASSES: 174 29 | DROPOUT_RATE: 0.5 30 | DATA_LOADER: 31 | NUM_WORKERS: 8 32 | OPTIMIZER: 33 | BASE_LR: 0.48 34 | ADJUST_LR: false 35 | LR_POLICY: cosine 36 | MAX_EPOCH: 64 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 1e-4 39 | WARMUP_EPOCHS: 4 40 | WARMUP_START_LR: 0.0001 41 | OPTIM_METHOD: sgd 42 | DAMPENING: 0.0 43 | NESTEROV: true 44 | AUGMENTATION: 45 | SSV2_FLIP: true 46 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /configs/projects/tadaconvnextv2/tadaconvnextv2_base_k400_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaconvnextv2_base.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: kinetics400 9 | BATCH_SIZE: 64 #total batch size: 64x4=256 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: in1k 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: kinetics400 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 21 | SAMPLING_RATE: 5 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 256 26 | TEST_CROP_SIZE: 256 27 | VIDEO: 28 | BACKBONE: 29 | DROP_PATH: 0.6 30 | HEAD: 31 | NUM_CLASSES: 400 32 | DROPOUT_RATE: 0.5 33 | 34 | OUTPUT_DIR: output/tadaconvnextv2_base_k400_16f 35 | 36 | OPTIMIZER: 37 | BASE_LR: 2.5e-4 38 | ADJUST_LR: false 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 100 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 0.02 43 | WARMUP_EPOCHS: 8 44 | WARMUP_START_LR: 1e-6 45 | OPTIM_METHOD: adamw 46 | 
DAMPENING: 0.0 47 | NESTEROV: true 48 | HEAD_LRMULT: 10 49 | NEW_PARAMS: ["dwconv_rf", "norm_avgpool"] 50 | NEW_PARAMS_MULT: 10 51 | AUGMENTATION: 52 | COLOR_AUG: true 53 | GRAYSCALE: 0.2 54 | COLOR_P: 0.0 55 | CONSISTENT: true 56 | SHUFFLE: true 57 | GRAY_FIRST: false 58 | IS_SPLIT: false 59 | USE_GPU: false 60 | SSV2_FLIP: true 61 | RATIO: [0.75, 1.333] 62 | MIXUP: 63 | ENABLE: false 64 | CUTMIX: 65 | ENABLE: false 66 | RANDOM_ERASING: 67 | ENABLE: false 68 | LABEL_SMOOTHING: 0.0 69 | AUTOAUGMENT: 70 | ENABLE: true 71 | BEFORE_CROP: true 72 | TYPE: rand-m9-n4-mstd0.5-inc1 73 | NUM_GPUS: 8 74 | DATA_LOADER: 75 | NUM_WORKERS: 12 76 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaconvnextv2/tadaconvnextv2_base_ssv2_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaconvnextv2_base.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: ssv2 9 | BATCH_SIZE: 64 #total batch size: 64x4=256 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: in1k # by default, the initialization is from kinetics 400 pretrain 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: ssv2 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ssv2/videos_mp4/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/ssv2/labels/ 21 | SAMPLING_MODE: segment_based 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 256 26 | TEST_CROP_SIZE: 256 27 | VIDEO: 28 | BACKBONE: 29 | DROP_PATH: 0.6 30 | HEAD: 31 | NUM_CLASSES: 174 32 | DROPOUT_RATE: 0.5 33 | 34 | OUTPUT_DIR: output/tadaconvnextv2_base_ssv2_16f 35 | 36 | OPTIMIZER: 37 | BASE_LR: 2.5e-4 38 | ADJUST_LR: false 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 64 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 0.02 43 | WARMUP_EPOCHS: 2.5 44 | WARMUP_START_LR: 1e-6 45 | OPTIM_METHOD: adamw 46 | DAMPENING: 0.0 47 | NESTEROV: true 48 | HEAD_LRMULT: 10 49 | NEW_PARAMS: ["dwconv_rf", "norm_avgpool"] 50 | NEW_PARAMS_MULT: 10 51 | AUGMENTATION: 52 | COLOR_AUG: true 53 | GRAYSCALE: 0.2 54 | COLOR_P: 0.0 55 | CONSISTENT: true 56 | SHUFFLE: true 57 | GRAY_FIRST: false 58 | IS_SPLIT: false 59 | USE_GPU: false 60 | SSV2_FLIP: true 61 | RATIO: [0.75, 1.333] 62 | MIXUP: 63 | ENABLE: false 64 | CUTMIX: 65 | ENABLE: false 66 | RANDOM_ERASING: 67 | ENABLE: false 68 | LABEL_SMOOTHING: 0.0 69 | AUTOAUGMENT: 70 | ENABLE: true 71 | BEFORE_CROP: true 72 | TYPE: rand-m9-n4-mstd0.5-inc1 73 | NUM_GPUS: 8 74 | DATA_LOADER: 75 | NUM_WORKERS: 12 76 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaconvnextv2/tadaconvnextv2_small_k400_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaconvnextv2_small.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: kinetics400 9 | BATCH_SIZE: 64 #total batch size: 64x4=256 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: in1k 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: kinetics400 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 21 | SAMPLING_RATE: 5 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | 
TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 256 26 | TEST_CROP_SIZE: 256 27 | VIDEO: 28 | BACKBONE: 29 | DROP_PATH: 0.4 30 | HEAD: 31 | NUM_CLASSES: 400 32 | DROPOUT_RATE: 0.5 33 | 34 | OUTPUT_DIR: output/tadaconvnextv2_small_k400_16f 35 | 36 | OPTIMIZER: 37 | BASE_LR: 2.5e-4 38 | ADJUST_LR: false 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 100 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 0.02 43 | WARMUP_EPOCHS: 8 44 | WARMUP_START_LR: 1e-6 45 | OPTIM_METHOD: adamw 46 | DAMPENING: 0.0 47 | NESTEROV: true 48 | HEAD_LRMULT: 10 49 | NEW_PARAMS: ["dwconv_rf", "norm_avgpool"] 50 | NEW_PARAMS_MULT: 10 51 | AUGMENTATION: 52 | COLOR_AUG: true 53 | GRAYSCALE: 0.2 54 | COLOR_P: 0.0 55 | CONSISTENT: true 56 | SHUFFLE: true 57 | GRAY_FIRST: false 58 | IS_SPLIT: false 59 | USE_GPU: false 60 | SSV2_FLIP: true 61 | RATIO: [0.75, 1.333] 62 | MIXUP: 63 | ENABLE: false 64 | CUTMIX: 65 | ENABLE: false 66 | RANDOM_ERASING: 67 | ENABLE: false 68 | LABEL_SMOOTHING: 0.0 69 | AUTOAUGMENT: 70 | ENABLE: true 71 | BEFORE_CROP: true 72 | TYPE: rand-m9-n4-mstd0.5-inc1 73 | NUM_GPUS: 8 74 | DATA_LOADER: 75 | NUM_WORKERS: 12 76 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaconvnextv2/tadaconvnextv2_small_ssv2_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaconvnextv2_small.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: ssv2 9 | BATCH_SIZE: 64 #total batch size: 64x4=256 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: in1k # by default, the initialization is from kinetics 400 pretrain 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: ssv2 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ssv2/videos_mp4/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/ssv2/labels/ 21 | SAMPLING_MODE: segment_based 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 256 26 | TEST_CROP_SIZE: 256 27 | VIDEO: 28 | BACKBONE: 29 | DROP_PATH: 0.5 30 | HEAD: 31 | NUM_CLASSES: 174 32 | DROPOUT_RATE: 0.5 33 | 34 | OUTPUT_DIR: output/tadaconvnextv2_small_ssv2_16f 35 | 36 | OPTIMIZER: 37 | BASE_LR: 2.5e-4 38 | ADJUST_LR: false 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 64 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 0.02 43 | WARMUP_EPOCHS: 2.5 44 | WARMUP_START_LR: 1e-6 45 | OPTIM_METHOD: adamw 46 | DAMPENING: 0.0 47 | NESTEROV: true 48 | HEAD_LRMULT: 10 49 | NEW_PARAMS: ["dwconv_rf", "norm_avgpool"] 50 | NEW_PARAMS_MULT: 10 51 | AUGMENTATION: 52 | COLOR_AUG: true 53 | GRAYSCALE: 0.2 54 | COLOR_P: 0.0 55 | CONSISTENT: true 56 | SHUFFLE: true 57 | GRAY_FIRST: false 58 | IS_SPLIT: false 59 | USE_GPU: false 60 | SSV2_FLIP: true 61 | RATIO: [0.75, 1.333] 62 | MIXUP: 63 | ENABLE: false 64 | CUTMIX: 65 | ENABLE: false 66 | RANDOM_ERASING: 67 | ENABLE: false 68 | LABEL_SMOOTHING: 0.0 69 | AUTOAUGMENT: 70 | ENABLE: true 71 | BEFORE_CROP: true 72 | TYPE: rand-m9-n4-mstd0.5-inc1 73 | NUM_GPUS: 8 74 | DATA_LOADER: 75 | NUM_WORKERS: 12 76 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaconvnextv2/tadaconvnextv2_tiny_k400_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaconvnextv2_tiny.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | 
TRAIN: 7 | ENABLE: true 8 | DATASET: kinetics400 9 | BATCH_SIZE: 128 #total batch size: 128x4=512 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: in1k 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: kinetics400 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 21 | SAMPLING_RATE: 5 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 256 26 | TEST_CROP_SIZE: 256 27 | VIDEO: 28 | BACKBONE: 29 | DROP_PATH: 0.2 30 | HEAD: 31 | NUM_CLASSES: 400 32 | DROPOUT_RATE: 0.5 33 | 34 | OUTPUT_DIR: output/tadaconvnextv2_tiny_k400_16f 35 | 36 | OPTIMIZER: 37 | BASE_LR: 5e-4 38 | ADJUST_LR: false 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 100 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 0.02 43 | WARMUP_EPOCHS: 8 44 | WARMUP_START_LR: 1e-6 45 | OPTIM_METHOD: adamw 46 | DAMPENING: 0.0 47 | NESTEROV: true 48 | HEAD_LRMULT: 10 49 | NEW_PARAMS: ["dwconv_rf", "norm_avgpool"] 50 | NEW_PARAMS_MULT: 10 51 | AUGMENTATION: 52 | COLOR_AUG: true 53 | GRAYSCALE: 0.2 54 | COLOR_P: 0.0 55 | CONSISTENT: true 56 | SHUFFLE: true 57 | GRAY_FIRST: false 58 | IS_SPLIT: false 59 | USE_GPU: false 60 | SSV2_FLIP: true 61 | RATIO: [0.75, 1.333] 62 | MIXUP: 63 | ENABLE: false 64 | CUTMIX: 65 | ENABLE: false 66 | RANDOM_ERASING: 67 | ENABLE: false 68 | LABEL_SMOOTHING: 0.0 69 | AUTOAUGMENT: 70 | ENABLE: true 71 | BEFORE_CROP: true 72 | TYPE: rand-m9-n4-mstd0.5-inc1 73 | NUM_GPUS: 8 74 | DATA_LOADER: 75 | NUM_WORKERS: 12 76 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaconvnextv2/tadaconvnextv2_tiny_ssv2_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaconvnextv2_tiny.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: ssv2 9 | BATCH_SIZE: 128 #total batch size: 128x4=512 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: in1k # by default, the initialization is from kinetics 400 pretrain 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: ssv2 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ssv2/videos_mp4/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/ssv2/labels/ 21 | SAMPLING_MODE: segment_based 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 256 26 | TEST_CROP_SIZE: 256 27 | VIDEO: 28 | BACKBONE: 29 | DROP_PATH: 0.3 30 | HEAD: 31 | NUM_CLASSES: 174 32 | DROPOUT_RATE: 0.5 33 | 34 | OUTPUT_DIR: output/tadaconvnextv2_tiny_ssv2_16f 35 | 36 | OPTIMIZER: 37 | BASE_LR: 5e-4 38 | ADJUST_LR: false 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 64 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 0.02 43 | WARMUP_EPOCHS: 2.5 44 | WARMUP_START_LR: 1e-6 45 | OPTIM_METHOD: adamw 46 | DAMPENING: 0.0 47 | NESTEROV: true 48 | HEAD_LRMULT: 10 49 | NEW_PARAMS: ["dwconv_rf", "norm_avgpool"] 50 | NEW_PARAMS_MULT: 10 51 | AUGMENTATION: 52 | COLOR_AUG: true 53 | GRAYSCALE: 0.2 54 | COLOR_P: 0.0 55 | CONSISTENT: true 56 | SHUFFLE: true 57 | GRAY_FIRST: false 58 | IS_SPLIT: false 59 | USE_GPU: false 60 | SSV2_FLIP: true 61 | RATIO: [0.75, 1.333] 62 | MIXUP: 63 | ENABLE: false 64 | CUTMIX: 65 | ENABLE: false 66 | RANDOM_ERASING: 67 | ENABLE: false 68 | LABEL_SMOOTHING: 0.0 69 | AUTOAUGMENT: 70 | ENABLE: true 71 | BEFORE_CROP: true 72 | TYPE: rand-m9-n4-mstd0.5-inc1 73 | NUM_GPUS: 8 
74 | DATA_LOADER: 75 | NUM_WORKERS: 12 76 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaformer/tadaformer_b16_k400_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaformer_b16.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: kinetics400 9 | BATCH_SIZE: 256 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: clip 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: kinetics400 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 21 | SAMPLING_MODE: segment_based 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 224 26 | TEST_CROP_SIZE: 224 27 | MEAN: [0.48145466, 0.4578275, 0.40821073] 28 | STD: [0.26862954, 0.26130258, 0.27577711] 29 | VIDEO: 30 | HEAD: 31 | NUM_CLASSES: 400 32 | DROPOUT_RATE: 0.5 33 | 34 | OUTPUT_DIR: output/tadaformer_b16_k400_16f 35 | 36 | OPTIMIZER: 37 | BASE_LR: 5e-5 38 | ADJUST_LR: false 39 | LR_POLICY: cosine_v2 40 | COSINE_END_LR: 1e-6 41 | COSINE_AFTER_WARMUP: true 42 | MAX_EPOCH: 30 43 | MOMENTUM: 0.9 44 | WEIGHT_DECAY: 0.05 45 | WARMUP_EPOCHS: 5 46 | WARMUP_START_LR: 1e-6 47 | OPTIM_METHOD: adamw 48 | DAMPENING: 0.0 49 | NESTEROV: true 50 | HEAD_LRMULT: 10 51 | NEW_PARAMS: ["tada"] 52 | NEW_PARAMS_MULT: 10 53 | LAYER_WISE_LR_DECAY: 0.7 54 | AUGMENTATION: 55 | COLOR_AUG: true 56 | GRAYSCALE: 0.2 57 | COLOR_P: 0.0 58 | CONSISTENT: true 59 | SHUFFLE: true 60 | GRAY_FIRST: false 61 | IS_SPLIT: false 62 | USE_GPU: false 63 | SSV2_FLIP: true 64 | RATIO: [0.75, 1.333] 65 | MIXUP: 66 | ENABLE: false 67 | CUTMIX: 68 | ENABLE: false 69 | RANDOM_ERASING: 70 | ENABLE: false 71 | LABEL_SMOOTHING: 0.1 72 | AUTOAUGMENT: 73 | ENABLE: true 74 | BEFORE_CROP: true 75 | TYPE: rand-m9-n4-mstd0.5-inc1 76 | NUM_GPUS: 8 77 | DATA_LOADER: 78 | NUM_WORKERS: 12 79 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaformer/tadaformer_b16_ssv2_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaformer_b16.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: ssv2 9 | BATCH_SIZE: 256 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: clip 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: ssv2 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ssv2/videos_mp4/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/ssv2/labels/ 21 | SAMPLING_MODE: segment_based 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 224 26 | TEST_CROP_SIZE: 224 27 | MEAN: [0.48145466, 0.4578275, 0.40821073] 28 | STD: [0.26862954, 0.26130258, 0.27577711] 29 | VIDEO: 30 | BACKBONE: 31 | TEMP_ENHANCE: true 32 | DOUBLE_TADA: true 33 | HEAD: 34 | NUM_CLASSES: 174 35 | DROPOUT_RATE: 0.5 36 | 37 | OUTPUT_DIR: output/tadaformer_b16_ssv2_16f 38 | 39 | OPTIMIZER: 40 | BASE_LR: 5e-4 41 | ADJUST_LR: false 42 | LR_POLICY: cosine_v2 43 | COSINE_END_LR: 1e-6 44 | COSINE_AFTER_WARMUP: true 45 | MAX_EPOCH: 24 46 | MOMENTUM: 0.9 47 | WEIGHT_DECAY: 0.05 48 | WARMUP_EPOCHS: 4 49 | WARMUP_START_LR: 1e-8 50 | 
OPTIM_METHOD: adamw 51 | DAMPENING: 0.0 52 | NESTEROV: true 53 | HEAD_LRMULT: 10 54 | NEW_PARAMS: ["tada"] 55 | NEW_PARAMS_MULT: 10 56 | LAYER_WISE_LR_DECAY: 0.7 57 | AUGMENTATION: 58 | COLOR_AUG: true 59 | GRAYSCALE: 0.2 60 | COLOR_P: 0.0 61 | CONSISTENT: true 62 | SHUFFLE: true 63 | GRAY_FIRST: false 64 | IS_SPLIT: false 65 | USE_GPU: false 66 | SSV2_FLIP: true 67 | RATIO: [0.75, 1.333] 68 | MIXUP: 69 | ENABLE: false 70 | CUTMIX: 71 | ENABLE: false 72 | RANDOM_ERASING: 73 | ENABLE: false 74 | LABEL_SMOOTHING: 0.1 75 | AUTOAUGMENT: 76 | ENABLE: true 77 | BEFORE_CROP: true 78 | TYPE: rand-m9-n4-mstd0.5-inc1 79 | NUM_GPUS: 8 80 | DATA_LOADER: 81 | NUM_WORKERS: 12 82 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaformer/tadaformer_l14_k400_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaformer_l14.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: kinetics400 9 | BATCH_SIZE: 64 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: clip 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: kinetics400 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 21 | SAMPLING_MODE: segment_based 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 224 26 | TEST_CROP_SIZE: 224 27 | MEAN: [0.48145466, 0.4578275, 0.40821073] 28 | STD: [0.26862954, 0.26130258, 0.27577711] 29 | VIDEO: 30 | HEAD: 31 | NUM_CLASSES: 400 32 | DROPOUT_RATE: 0.5 33 | 34 | OUTPUT_DIR: output/tadaformer_l14_k400_16f 35 | 36 | OPTIMIZER: 37 | BASE_LR: 2e-5 38 | ADJUST_LR: false 39 | LR_POLICY: cosine_v2 40 | COSINE_END_LR: 1e-6 41 | COSINE_AFTER_WARMUP: true 42 | MAX_EPOCH: 24 43 | MOMENTUM: 0.9 44 | WEIGHT_DECAY: 0.05 45 | WARMUP_EPOCHS: 5 46 | WARMUP_START_LR: 1e-6 47 | OPTIM_METHOD: adamw 48 | DAMPENING: 0.0 49 | NESTEROV: true 50 | HEAD_LRMULT: 10 51 | NEW_PARAMS: ["tada"] 52 | NEW_PARAMS_MULT: 10 53 | LAYER_WISE_LR_DECAY: 0.85 54 | AUGMENTATION: 55 | COLOR_AUG: true 56 | GRAYSCALE: 0.2 57 | COLOR_P: 0.0 58 | CONSISTENT: true 59 | SHUFFLE: true 60 | GRAY_FIRST: false 61 | IS_SPLIT: false 62 | USE_GPU: false 63 | SSV2_FLIP: true 64 | RATIO: [0.75, 1.333] 65 | MIXUP: 66 | ENABLE: false 67 | CUTMIX: 68 | ENABLE: false 69 | RANDOM_ERASING: 70 | ENABLE: false 71 | LABEL_SMOOTHING: 0.1 72 | AUTOAUGMENT: 73 | ENABLE: true 74 | BEFORE_CROP: true 75 | TYPE: rand-m9-n4-mstd0.5-inc1 76 | NUM_GPUS: 16 77 | DATA_LOADER: 78 | NUM_WORKERS: 12 79 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaformer/tadaformer_l14_ssv2_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaformer_l14.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: ssv2 9 | BATCH_SIZE: 128 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: clip 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: ssv2 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ssv2/videos_mp4/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/ssv2/labels/ 21 | SAMPLING_MODE: segment_based 22 | NUM_INPUT_FRAMES: 16 23 | 
TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 224 26 | TEST_CROP_SIZE: 224 27 | MEAN: [0.48145466, 0.4578275, 0.40821073] 28 | STD: [0.26862954, 0.26130258, 0.27577711] 29 | VIDEO: 30 | BACKBONE: 31 | DROP_PATH: 0.2 32 | TEMP_ENHANCE: true 33 | DOUBLE_TADA: true 34 | HEAD: 35 | NUM_CLASSES: 174 36 | DROPOUT_RATE: 0.5 37 | 38 | OUTPUT_DIR: output/tadaformer_l14_ssv2_16f 39 | 40 | OPTIMIZER: 41 | BASE_LR: 2.5e-4 42 | ADJUST_LR: false 43 | LR_POLICY: cosine_v2 44 | COSINE_END_LR: 1e-6 45 | COSINE_AFTER_WARMUP: true 46 | MAX_EPOCH: 24 47 | MOMENTUM: 0.9 48 | WEIGHT_DECAY: 0.05 49 | WARMUP_EPOCHS: 4 50 | WARMUP_START_LR: 1e-8 51 | OPTIM_METHOD: adamw 52 | DAMPENING: 0.0 53 | NESTEROV: true 54 | HEAD_LRMULT: 10 55 | NEW_PARAMS: ["tada"] 56 | NEW_PARAMS_MULT: 10 57 | LAYER_WISE_LR_DECAY: 0.85 58 | AUGMENTATION: 59 | COLOR_AUG: true 60 | GRAYSCALE: 0.2 61 | COLOR_P: 0.0 62 | CONSISTENT: true 63 | SHUFFLE: true 64 | GRAY_FIRST: false 65 | IS_SPLIT: false 66 | USE_GPU: false 67 | SSV2_FLIP: true 68 | RATIO: [0.75, 1.333] 69 | MIXUP: 70 | ENABLE: false 71 | CUTMIX: 72 | ENABLE: false 73 | RANDOM_ERASING: 74 | ENABLE: false 75 | LABEL_SMOOTHING: 0.1 76 | AUTOAUGMENT: 77 | ENABLE: true 78 | BEFORE_CROP: true 79 | TYPE: rand-m9-n4-mstd0.5-inc1 80 | NUM_GPUS: 8 81 | DATA_LOADER: 82 | NUM_WORKERS: 12 83 | PIN_MEMORY: true -------------------------------------------------------------------------------- /projects/epic-kitchen-ar/README.md: -------------------------------------------------------------------------------- 1 | # Towards training stronger video vision transformers for epic-kitchens-100 action recognition (CVPR 2021 Workshop) 2 | [Ziyuan Huang](https://huang-ziyuan.github.io/), [Zhiwu Qing](https://scholar.google.com/citations?user=q9refl4AAAAJ&hl=zh-CN), Xiang Wang, Yutong Feng, [Shiwei Zhang](https://scholar.google.com/citations?user=ZO3OQ-8AAAAJ&hl=zh-CN&authuser=1), Jianwen Jiang, Zhurong Xia, Mingqian Tang, Nong Sang, and [Marcelo Ang](https://www.eng.nus.edu.sg/me/staff/ang-jr-marcelo-h/).
3 | In arXiv, 2021. [[Paper]](https://arxiv.org/pdf/2106.05058). 4 | 5 | # Running instructions 6 | Action recognition on Epic-Kitchens-100 shares the same pipeline as classification. Refer to `configs/projects/epic-kitchen-ar/vivit_fac_enc_ek100.yaml` for more details. We also include some trained weights in the [MODEL ZOO](../../MODEL_ZOO.md). 7 | 8 | For an example run, set the `DATA_ROOT_DIR`, `ANNO_DIR` and `NUM_GPUS` in `configs/projects/epic-kitchen-ar/vivit_fac_enc_ek100.yaml`, and run the command 9 | 10 | ``` 11 | python runs/run.py --cfg configs/projects/epic-kitchen-ar/ek100/vivit_fac_enc.yaml 12 | ``` 13 | 14 | # Citing this report 15 | If you find the training setting useful, please consider citing the paper as follows: 16 | ```BibTeX 17 | @article{huang2021towards, 18 | title={Towards training stronger video vision transformers for epic-kitchens-100 action recognition}, 19 | author={Huang, Ziyuan and Qing, Zhiwu and Wang, Xiang and Feng, Yutong and Zhang, Shiwei and Jiang, Jianwen and Xia, Zhurong and Tang, Mingqian and Sang, Nong and Ang Jr, Marcelo H}, 20 | journal={arXiv preprint arXiv:2106.05058}, 21 | year={2021} 22 | } 23 | ``` -------------------------------------------------------------------------------- /projects/epic-kitchen-tal/README.md: -------------------------------------------------------------------------------- 1 | 2 | # A Stronger Baseline for Ego-Centric Action Detection (CVPR 2021 Workshop) 3 | 4 | 5 | # Running instructions 6 | To train the action localization model, set the `_BASE_RUN` to point to `configs/pool/run/training/localization.yaml`. See `configs/projects/epic-kitchen-tal/bmn_epic.yaml` for more details. Alternatively, you can also find some pre-trained models in `MODEL_ZOO.md`. 7 | 8 | For detailed explanations on the approach itself, please refer to the [paper](https://arxiv.org/pdf/2106.06942). 9 | 10 | To prepare the dataset, please download the [features](), [classification results]() and [dataset annotations]().
11 | 12 | 13 | For an example run, set the `DATA_ROOT_DIR`, `ANNO_DIR`, `CLASSIFIER_ROOT_DIR` and `NUM_GPUS` in `configs/projects/epic-kitchen-tal/bmn_epic.yaml`, and run the command 14 | 15 | ``` 16 | python runs/run.py --cfg configs/projects/epic-kitchen-tal/bmn-epic/vivit-os-local.yaml 17 | ``` 18 | 19 | 20 | # Citing this report 21 | If you find this report useful for your research, please consider citing the paper as follows: 22 | ```BibTeX 23 | @article{qing2021stronger, 24 | title={A Stronger Baseline for Ego-Centric Action Detection}, 25 | author={Qing, Zhiwu and Huang, Ziyuan and Wang, Xiang and Feng, Yutong and Zhang, Shiwei and Jiang, Jianwen and Tang, Mingqian and Gao, Changxin and Ang Jr, Marcelo H and Sang, Nong}, 26 | journal={arXiv preprint arXiv:2106.06942}, 27 | year={2021} 28 | } 29 | ``` 30 | -------------------------------------------------------------------------------- /projects/mosi/MoSI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alibaba-mmai-research/TAdaConv/75b7839b37fc94d98d4fe5f2aff4b3df4e347dfb/projects/mosi/MoSI.png -------------------------------------------------------------------------------- /projects/mosi/README.md: -------------------------------------------------------------------------------- 1 | # Self-supervised Motion Learning from Static Images (CVPR 2021) 2 | [Ziyuan Huang](https://huang-ziyuan.github.io/), [Shiwei Zhang](https://scholar.google.com/citations?user=ZO3OQ-8AAAAJ&hl=zh-CN&authuser=1), Jianwen Jiang, Mingqian Tang, 3 | [Rong Jin](https://www.cse.msu.edu/~rongjin/), [Marcelo Ang](https://www.eng.nus.edu.sg/me/staff/ang-jr-marcelo-h/),
4 | In CVPR, 2021. 5 | 6 | [[Paper](https://openaccess.thecvf.com/content/CVPR2021/papers/Huang_Self-Supervised_Motion_Learning_From_Static_Images_CVPR_2021_paper.pdf)] 7 | 8 | # Running instructions 9 | To train the model with MoSI, set the `_BASE_RUN` to point to `configs/pool/run/training/mosi.yaml`. See `configs/projects/mosi/mosi_*.yaml` for more details. Alternatively, you can also find some pre-trained models in `MODEL_ZOO.md`. 10 | 11 | For detailed explanations on the approach itself, please refer to the [paper](https://openaccess.thecvf.com/content/CVPR2021/papers/Huang_Self-Supervised_Motion_Learning_From_Static_Images_CVPR_2021_paper.pdf). 12 | 13 | For an example run, set the `DATA_ROOT_DIR`, `ANNO_DIR` and `NUM_GPUS` in `configs/projects/mosi/mosi_r2d3ds_hmdb.yaml`, and run the command 14 | 15 | ``` 16 | python runs/run.py --cfg configs/projects/mosi/pt-hmdb/r2d3ds.yaml 17 | ``` 18 | 
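MoSI pre-trains the backbone by synthesising pseudo motion from static images and asking the network to classify the motion it is shown. The toy function below only illustrates the sliding-crop idea behind such pseudo motion (a crop window moving across a single image in a known direction, which then serves as the self-supervised label); the actual label space, crop parameters and sampling logic live in the repository's pretrain pipeline and differ from this sketch.

```python
import numpy as np

def pseudo_motion_clip(image: np.ndarray, num_frames: int = 8,
                       crop: int = 112, direction: str = "right") -> np.ndarray:
    """Build a clip from one static image by sliding a crop window in a known direction."""
    h, w = image.shape[:2]
    max_y, max_x = h - crop, w - crop
    frames = []
    for t in np.linspace(0.0, 1.0, num_frames):
        if direction == "right":
            x, y = int(t * max_x), max_y // 2
        elif direction == "down":
            x, y = max_x // 2, int(t * max_y)
        else:
            raise ValueError(f"unsupported direction: {direction}")
        frames.append(image[y:y + crop, x:x + crop])
    return np.stack(frames)  # (num_frames, crop, crop, channels)

clip = pseudo_motion_clip(np.zeros((224, 224, 3), dtype=np.uint8), direction="down")
print(clip.shape)  # (8, 112, 112, 3)
```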
24 | 25 | # Citing MoSI 26 | If you find MoSI useful for your research, please consider citing the paper as follows: 27 | ```BibTeX 28 | @inproceedings{mosi2021, 29 | title={Self-supervised motion learning from static images}, 30 | author={Huang, Ziyuan and Zhang, Shiwei and Jiang, Jianwen and Tang, Mingqian and Jin, Rong and Ang, Marcelo H}, 31 | booktitle={{CVPR}}, 32 | pages={1276--1285}, 33 | year={2021} 34 | } 35 | ``` -------------------------------------------------------------------------------- /projects/tada/README.md: -------------------------------------------------------------------------------- 1 | # TAda! Temporally-Adaptive Convolutions for Video Understanding (ICLR 2022) 2 | [Ziyuan Huang](https://huang-ziyuan.github.io/), [Shiwei Zhang](https://scholar.google.com/citations?user=ZO3OQ-8AAAAJ&hl=zh-CN&authuser=1), [Liang Pan](https://scholar.google.com/citations?user=lSDISOcAAAAJ&hl=zh-CN&authuser=1), [Zhiwu Qing](https://scholar.google.com/citations?user=q9refl4AAAAJ&hl=zh-CN&authuser=1), 3 | Mingqian Tang, [Ziwei Liu](https://liuziwei7.github.io/), [Marcelo Ang](https://www.eng.nus.edu.sg/me/staff/ang-jr-marcelo-h/),
4 | In ICLR, 2022. 5 | 6 | [[Paper](https://arxiv.org/pdf/2110.06178)][[Project homepage](https://tadaconv-iclr2022.github.io)] 7 | 8 | # Running instructions 9 | To train TAda2D networks, set the `_BASE_MODEL` to point to `configs/pool/backbone/tada2d.yaml`. See `configs/projects/tada/tada2d_*.yaml` for more details. 10 | TAda2D networks trained on Kinetics and Something-Something can be found in [`MODEL_ZOO.md`](../../MODEL_ZOO.md). 11 | 12 | For an example run, set the `DATA_ROOT_DIR`, `ANNO_DIR` and `NUM_GPUS` in `configs/projects/tada/tada2d_k400.yaml`, and run the command 13 | 14 | ``` 15 | python runs/run.py --cfg configs/projects/tada/k400/tada2d_8x8.yaml 16 | ``` 17 | 18 |
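The operator behind TAda2D is the temporally-adaptive convolution: a single shared kernel whose weights are calibrated separately for every frame by a lightweight branch that looks at the frame's descriptor. The module below is a deliberately tiny toy of that idea, not the repository's TAdaConv (which also uses temporal context and a factorised calibration; see the `tadaconv/models` package); it only shows how a per-frame, per-sample channel scale can be folded into a grouped 2D convolution.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyTAdaConv2d(nn.Module):
    """Toy temporally-adaptive conv: one shared base kernel, rescaled per frame and per sample."""
    def __init__(self, channels: int, kernel_size: int = 3):
        super().__init__()
        self.base_weight = nn.Parameter(
            torch.randn(channels, channels, kernel_size, kernel_size) * 0.02)
        # Tiny calibration branch: per-frame channel descriptor -> per-channel scale in (0, 1).
        self.calibrate = nn.Sequential(nn.Linear(channels, channels), nn.Sigmoid())
        self.padding = kernel_size // 2

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, channels, time, height, width)
        b, c, t, h, w = x.shape
        out = []
        for i in range(t):
            frame = x[:, :, i]                      # (B, C, H, W)
            desc = frame.mean(dim=(2, 3))           # global average pool -> (B, C)
            alpha = self.calibrate(desc)            # calibration scale per sample and channel
            # Rescale the shared kernel, then run all samples as one grouped convolution.
            weight = self.base_weight.unsqueeze(0) * alpha.view(b, 1, c, 1, 1)
            merged = frame.reshape(1, b * c, h, w)
            weight = weight.reshape(b * c, c, *self.base_weight.shape[2:])
            y = F.conv2d(merged, weight, padding=self.padding, groups=b)
            out.append(y.reshape(b, c, h, w))
        return torch.stack(out, dim=2)              # (B, C, T, H, W)

video = torch.randn(2, 8, 4, 32, 32)                # 2 clips, 8 channels, 4 frames
print(ToyTAdaConv2d(8)(video).shape)                 # torch.Size([2, 8, 4, 32, 32])
```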
23 | 24 | 25 | # Model Zoo 26 | 27 | | Dataset | architecture | #frames x sampling rate | top1 | top5| checkpoint | config | 28 | | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | 29 | | K400 | TAda2D-R50 | 8 x 8 | 76.7 | 92.6 | [[google drive](https://drive.google.com/file/d/1YsbTKLoDwxtStAsP5oxUMbIsw85NvY0O/view?usp=sharing)][[baidu](https://pan.baidu.com/s/1rPPZtVDlEoftkg-r_Di59w)(code:p06d)] | [tada2d_8x8.yaml](../../configs/projects/tada/k400/tada2d_8x8.yaml) | 30 | | K400 | TAda2D-R50 | 16 x 5 | 77.4 | 93.1 | [[google drive](https://drive.google.com/file/d/1UQDurxakmnDxa5D2tBuTqTH60BVyW3XM/view?usp=sharing)][[baidu](https://pan.baidu.com/s/1MzFCZU1G1JR2ur9gWd3hCg)(code:6k8h)] | [tada2d_16x5.yaml](../../configs/projects/tada/k400/tada2d_16x5.yaml) | 31 | | SSV2 | TAda2D-R50 | 8 | 64.0 | 88.0 | [[google drive](https://drive.google.com/file/d/16y6dDf-hcMmJ2jDCV9tRla8aRJZKJXSk/view?usp=sharing)][[baidu](https://pan.baidu.com/s/1CWy35SlWMbKnYqZXESndKg)(code:dlil)] | [tada2d_8f.yaml](../../configs/projects/tada/ssv2/tada2d_8f.yaml) | 32 | | SSV2 | TAda2D-R50 | 16 | 65.6 | 89.1 | [[google drive](https://drive.google.com/file/d/1xwCxuFW6DZ0xpEsp_tFJYQRGuHPJe4uS/view?usp=sharing)][[baidu](https://pan.baidu.com/s/1GKUKyDytaKKeCBAerh-4IQ)(code:f857)] | [tada2d_16f.yaml](../../configs/projects/tada/ssv2/tada2d_16f.yaml) | 33 | 34 | # Citing TAda! 35 | If you find TAdaConv or TAda2D useful for your research, please consider citing the paper as follows: 36 | ```BibTeX 37 | @inproceedings{huang2021tada, 38 | title={TAda! Temporally-Adaptive Convolutions for Video Understanding}, 39 | author={Huang, Ziyuan and Zhang, Shiwei and Pan, Liang and Qing, Zhiwu and Tang, Mingqian and Liu, Ziwei and Ang Jr, Marcelo H}, 40 | booktitle={{ICLR}}, 41 | year={2022} 42 | } 43 | ``` -------------------------------------------------------------------------------- /projects/tada/TAda2D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alibaba-mmai-research/TAdaConv/75b7839b37fc94d98d4fe5f2aff4b3df4e347dfb/projects/tada/TAda2D.png -------------------------------------------------------------------------------- /projects/tadaconvv2/TAdaConvV2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alibaba-mmai-research/TAdaConv/75b7839b37fc94d98d4fe5f2aff4b3df4e347dfb/projects/tadaconvv2/TAdaConvV2.png -------------------------------------------------------------------------------- /runs/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 
3 | 4 | """Entry file for training, evaluating and testing a video model.""" 5 | 6 | import os 7 | import sys 8 | import time 9 | sys.path.append(os.path.abspath(os.curdir)) 10 | 11 | from tadaconv.utils.launcher import launch_task 12 | 13 | from test import test 14 | from train import train 15 | from test_epic_localization import test_epic_localization 16 | from submission_test import submission_test 17 | 18 | from tadaconv.utils.config import Config 19 | 20 | def _prepare_data(cfg): 21 | if cfg.TASK_TYPE in ['classification']: 22 | train_func = train 23 | test_func = test 24 | elif cfg.TASK_TYPE in ['localization']: 25 | train_func = train 26 | test_func = test_epic_localization 27 | elif cfg.TASK_TYPE in ["submission"]: 28 | cfg.TRAIN.ENABLE = False 29 | cfg.TEST.ENABLE = False 30 | train_func = None 31 | test_func = None 32 | submission_func = submission_test 33 | else: 34 | raise ValueError("unknown TASK_TYPE {}".format(cfg.TASK_TYPE)) 35 | 36 | run_list = [] 37 | if cfg.TRAIN.ENABLE: 38 | # Training process is performed by the entry function defined above. 39 | run_list.append([cfg.deep_copy(), train_func]) 40 | 41 | if cfg.TEST.ENABLE: 42 | # Test is performed by the entry function defined above. 43 | run_list.append([cfg.deep_copy(), test_func]) 44 | if cfg.TEST.AUTOMATIC_MULTI_SCALE_TEST: 45 | """ 46 | By default, test_func performs single view test. 47 | AUTOMATIC_MULTI_SCALE_TEST automatically performs multi-view test after the single view test. 48 | """ 49 | cfg.LOG_MODEL_INFO = False 50 | cfg.LOG_CONFIG_INFO = False 51 | 52 | cfg.TEST.NUM_ENSEMBLE_VIEWS = 10 53 | cfg.TEST.NUM_SPATIAL_CROPS = 1 54 | 55 | if "kinetics" in cfg.TEST.DATASET or "epickitchen" in cfg.TEST.DATASET: 56 | cfg.TEST.NUM_SPATIAL_CROPS = 3 57 | if "imagenet" in cfg.TEST.DATASET and not cfg.PRETRAIN.ENABLE: 58 | cfg.TEST.NUM_ENSEMBLE_VIEWS = 1 59 | cfg.TEST.NUM_SPATIAL_CROPS = 3 60 | if "ssv2" in cfg.TEST.DATASET: 61 | cfg.TEST.NUM_ENSEMBLE_VIEWS = 2 62 | cfg.TEST.NUM_SPATIAL_CROPS = 3 63 | cfg.TEST.LOG_FILE = "val_{}clipsx{}crops.log".format( 64 | cfg.TEST.NUM_ENSEMBLE_VIEWS, cfg.TEST.NUM_SPATIAL_CROPS 65 | ) 66 | run_list.append([cfg.deep_copy(), test_func]) 67 | 68 | if cfg.SUBMISSION.ENABLE: 69 | # currently only supports epic kitchen submission 70 | cfg.LOG_MODEL_INFO = False 71 | cfg.TEST.NUM_ENSEMBLE_VIEWS = 10 72 | cfg.TEST.NUM_SPATIAL_CROPS = 3 73 | 74 | cfg.TEST.LOG_FILE = "test_{}clipsx{}crops.log".format( 75 | cfg.TEST.NUM_ENSEMBLE_VIEWS, cfg.TEST.NUM_SPATIAL_CROPS 76 | ) 77 | run_list.append([cfg.deep_copy(), submission_func]) 78 | 79 | return run_list 80 | 81 | def main(): 82 | """ 83 | Entry function for spawning all the function processes. 
84 | """ 85 | cfg = Config(load=True) 86 | 87 | # get the list of configs and functions for running 88 | run_list = _prepare_data(cfg) 89 | 90 | for run in run_list: 91 | launch_task(cfg=run[0], init_method=run[0].get_args().init_method, func=run[1]) 92 | 93 | print("Finish running with config: {}".format(cfg.args.cfg_file)) 94 | 95 | 96 | if __name__ == "__main__": 97 | main() 98 | -------------------------------------------------------------------------------- /tadaconv/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alibaba-mmai-research/TAdaConv/75b7839b37fc94d98d4fe5f2aff4b3df4e347dfb/tadaconv/datasets/__init__.py -------------------------------------------------------------------------------- /tadaconv/datasets/base/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | from .ucf101 import Ucf101 5 | from .hmdb51 import Hmdb51 6 | from .kinetics400 import Kinetics400 7 | from .ssv2 import Ssv2 8 | from .imagenet import Imagenet 9 | from .epickitchen100_feature import Epickitchen100localization 10 | from .epickitchen100 import Epickitchen100 11 | -------------------------------------------------------------------------------- /tadaconv/datasets/base/builder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Builder for the dataloader.""" 5 | 6 | import itertools 7 | import numpy as np 8 | import torch 9 | import tadaconv.utils.misc as misc 10 | from tadaconv.utils.sampler import MultiFoldDistributedSampler 11 | from torch.utils.data._utils.collate import default_collate 12 | from torch.utils.data.distributed import DistributedSampler 13 | from torch.utils.data.sampler import RandomSampler 14 | from tadaconv.utils.val_dist_sampler import MultiSegValDistributedSampler 15 | from tadaconv.datasets.utils.collate_functions import COLLATE_FN_REGISTRY 16 | 17 | 18 | from tadaconv.utils.registry import Registry 19 | 20 | DATASET_REGISTRY = Registry("DATASET") 21 | 22 | def get_sampler(cfg, dataset, split, shuffle): 23 | """ 24 | Returns the sampler object for the dataset. 25 | Args: 26 | dataset (Dataset): constructed dataset. 27 | split (str): which split is the dataset for. 28 | shuffle (bool): whether or not to shuffle the dataset. 29 | Returns: 30 | sampler (Sampler): dataset sampler. 31 | """ 32 | if misc.get_num_gpus(cfg) > 1: 33 | if split == "train" and cfg.TRAIN.NUM_FOLDS > 1: 34 | return MultiFoldDistributedSampler( 35 | dataset, cfg.TRAIN.NUM_FOLDS 36 | ) 37 | elif cfg.USE_MULTISEG_VAL_DIST and cfg.TRAIN.ENABLE is False: 38 | return MultiSegValDistributedSampler(dataset, shuffle=False) 39 | else: 40 | return DistributedSampler( 41 | dataset, 42 | shuffle=shuffle 43 | ) 44 | else: 45 | return None 46 | 47 | def build_loader(cfg, split): 48 | """ 49 | Constructs the data loader for the given dataset. 50 | Args: 51 | cfg (Configs): global config object. details in utils/config.py 52 | split (str): the split of the data loader. Options include `train`, 53 | `val`, `test`, and `submission`. 54 | Returns: 55 | loader object. 
56 | """ 57 | assert split in ["train", "val", "test", "submission"] 58 | if split in ["train"]: 59 | dataset_name = cfg.TRAIN.DATASET 60 | batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 61 | shuffle = True 62 | drop_last = True 63 | elif split in ["val"]: 64 | dataset_name = cfg.TEST.DATASET 65 | batch_size = int(cfg.TEST.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 66 | shuffle = False 67 | drop_last = False 68 | elif split in ["test", "submission"]: 69 | dataset_name = cfg.TEST.DATASET 70 | batch_size = int(cfg.TEST.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 71 | shuffle = False 72 | drop_last = False 73 | 74 | # Construct the dataset 75 | dataset = build_dataset(dataset_name, cfg, split) 76 | 77 | # Create a sampler for multi-process training 78 | sampler = get_sampler(cfg, dataset, split, shuffle) 79 | # Create a loader 80 | if hasattr(cfg.DATA_LOADER, "COLLATE_FN") and cfg.DATA_LOADER.COLLATE_FN is not None: 81 | collate_fn = COLLATE_FN_REGISTRY.get(cfg.DATA_LOADER.COLLATE_FN)(cfg) 82 | else: 83 | collate_fn = None 84 | loader = torch.utils.data.DataLoader( 85 | dataset, 86 | batch_size=batch_size, 87 | shuffle=(False if sampler else shuffle), 88 | sampler=sampler, 89 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 90 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 91 | drop_last=drop_last, 92 | collate_fn=collate_fn 93 | ) 94 | return loader 95 | 96 | 97 | def shuffle_dataset(loader, cur_epoch): 98 | """" 99 | Shuffles the sampler for the dataset. 100 | Args: 101 | loader (loader): data loader to perform shuffle. 102 | cur_epoch (int): number of the current epoch. 103 | """ 104 | sampler = loader.sampler 105 | assert isinstance( 106 | sampler, (RandomSampler, DistributedSampler, MultiFoldDistributedSampler) 107 | ), "Sampler type '{}' not supported".format(type(sampler)) 108 | # RandomSampler handles shuffling automatically 109 | if isinstance(sampler, (DistributedSampler, MultiFoldDistributedSampler)): 110 | # DistributedSampler shuffles data based on epoch 111 | sampler.set_epoch(cur_epoch) 112 | 113 | def build_dataset(dataset_name, cfg, split): 114 | """ 115 | Builds a dataset according to the "dataset_name". 116 | Args: 117 | dataset_name (str): the name of the dataset to be constructed. 118 | cfg (Config): global config object. 119 | split (str): the split of the data loader. 120 | Returns: 121 | Dataset (Dataset): a dataset object constructed for the specified dataset_name. 122 | """ 123 | name = dataset_name.capitalize() 124 | return DATASET_REGISTRY.get(name)(cfg, split) 125 | -------------------------------------------------------------------------------- /tadaconv/datasets/base/epickitchen100.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Epic-Kitchens dataset. 
""" 5 | 6 | import os 7 | import random 8 | import torch 9 | import torch.utils.data 10 | import tadaconv.utils.logging as logging 11 | 12 | import time 13 | import oss2 as oss 14 | 15 | import tadaconv.utils.bucket as bu 16 | from tadaconv.datasets.base.builder import DATASET_REGISTRY 17 | from tadaconv.datasets.base.base_dataset import BaseVideoDataset 18 | 19 | logger = logging.get_logger(__name__) 20 | 21 | @DATASET_REGISTRY.register() 22 | class Epickitchen100(BaseVideoDataset): 23 | def __init__(self, cfg, split): 24 | super(Epickitchen100, self).__init__(cfg, split) 25 | if (self.split == "test" or self.split == "submission") and self.cfg.PRETRAIN.ENABLE == False: 26 | self._pre_transformation_config_required = True 27 | 28 | def _get_dataset_list_name(self): 29 | """ 30 | Returns the list for the dataset. 31 | Returns: 32 | dataset_list_name (str) 33 | """ 34 | if self.split == "train": 35 | if self.cfg.TRAIN.TRAIN_VAL_COMBINE: 36 | train_list = "train_val" 37 | else: 38 | train_list = "train" 39 | name = "EPIC_100_{}.csv".format( 40 | train_list if self.split == "train" else "validation" if not self.split == "submission" else "test_timestamps", 41 | ) 42 | logger.info("Reading video list from file: {}".format(name)) 43 | return name 44 | 45 | def _get_sample_info(self, index): 46 | """ 47 | Returns the sample info corresponding to the index. 48 | Args: 49 | index (int): target index 50 | Returns: 51 | sample_info (dict): contains different informations to be used later 52 | "name": the name of the video 53 | "path": the path of the video for the specified index 54 | "verb_class": verb label of the video 55 | "noun_class": noun label of the video 56 | """ 57 | if not self.split == "submission": 58 | video_name = self._samples[index][0] 59 | verb_class = self._samples[index][10] 60 | noun_class = self._samples[index][12] 61 | video_path = os.path.join(self.data_root_dir, video_name+".MP4") 62 | else: 63 | # if the split is submission, then no label is available 64 | # we simply set the verb class and the noun class to zero 65 | video_name = self._samples[index][0] 66 | verb_class = 0 67 | noun_class = 0 68 | video_path = os.path.join(self.data_root_dir, video_name+".MP4") 69 | 70 | if self.cfg.DATA.MULTI_LABEL or not hasattr(self.cfg.DATA, "TRAIN_VERSION"): 71 | supervised_label = { 72 | "verb_class": verb_class, 73 | "noun_class": noun_class 74 | } 75 | else: 76 | if self.cfg.DATA.TRAIN_VERSION == "only_train_verb": 77 | supervised_label = verb_class 78 | elif self.cfg.DATA.TRAIN_VERSION == "only_train_noun": 79 | supervised_label = noun_class 80 | 81 | sample_info = { 82 | "name": video_name, 83 | "path": video_path, 84 | "supervised_label": supervised_label 85 | } 86 | return sample_info 87 | 88 | def _pre_transformation_config(self): 89 | """ 90 | Set transformation parameters if required. 91 | """ 92 | self.resize_video.set_spatial_index(self.spatial_idx) 93 | 94 | def _custom_sampling(self, vid_length, vid_fps, clip_idx, num_clips, num_frames, interval=2, random_sample=True): 95 | pass # making python happy 96 | 97 | def _get_ssl_label(self): 98 | pass # making python happy -------------------------------------------------------------------------------- /tadaconv/datasets/base/hmdb51.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ HMDB51 dataset. 
""" 5 | 6 | import os 7 | import random 8 | import time 9 | 10 | import oss2 as oss 11 | import tadaconv.utils.bucket as bu 12 | import tadaconv.utils.logging as logging 13 | import torch 14 | import torch.utils.data 15 | from tadaconv.datasets.base.base_dataset import BaseVideoDataset 16 | from tadaconv.datasets.base.builder import DATASET_REGISTRY 17 | 18 | logger = logging.get_logger(__name__) 19 | 20 | @DATASET_REGISTRY.register() 21 | class Hmdb51(BaseVideoDataset): 22 | def __init__(self, cfg, split): 23 | super(Hmdb51, self).__init__(cfg, split) 24 | if self.split == "test" and self.cfg.PRETRAIN.ENABLE == False: 25 | self._pre_transformation_config_required = True 26 | 27 | 28 | def _get_dataset_list_name(self): 29 | """ 30 | Returns the list for the dataset. 31 | Returns: 32 | name (str): name of the list to be read 33 | """ 34 | name = "hmdb51_{}_list.txt".format( 35 | "train" if "train" in self.split else "test", 36 | ) 37 | logger.info("Reading video list from file: {}".format(name)) 38 | return name 39 | 40 | def _get_sample_info(self, index): 41 | """ 42 | Returns the sample info corresponding to the index. 43 | Args: 44 | index (int): target index 45 | Returns: 46 | sample_info (dict): contains different informations to be used later 47 | "path": indicating the target's path w.r.t. index 48 | "supervised_label": indicating the class of the target 49 | """ 50 | video_path, class_, = self._samples[index].strip().split(" ") 51 | class_ = int(class_) 52 | video_path = os.path.join(self.data_root_dir, video_path) 53 | sample_info = { 54 | "path": video_path, 55 | "supervised_label": class_, 56 | } 57 | return sample_info 58 | 59 | def _pre_transformation_config(self): 60 | """ 61 | Set transformation parameters if required. 62 | """ 63 | self.resize_video.set_spatial_index(self.spatial_idx) 64 | 65 | def _custom_sampling(self, vid_length, vid_fps, clip_idx, num_clips, num_frames, interval=2, random_sample=True): 66 | return self._interval_based_sampling(vid_length, vid_fps, clip_idx, num_clips, num_frames, interval) 67 | 68 | def _get_ssl_label(self): 69 | pass # making python happy 70 | -------------------------------------------------------------------------------- /tadaconv/datasets/base/kinetics400.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Kinetics400 dataset. """ 5 | 6 | import os 7 | import random 8 | import torch 9 | import torch.utils.data 10 | import tadaconv.utils.logging as logging 11 | 12 | import time 13 | import oss2 as oss 14 | 15 | import tadaconv.utils.bucket as bu 16 | from tadaconv.datasets.base.builder import DATASET_REGISTRY 17 | from tadaconv.datasets.base.base_dataset import BaseVideoDataset 18 | 19 | logger = logging.get_logger(__name__) 20 | 21 | @DATASET_REGISTRY.register() 22 | class Kinetics400(BaseVideoDataset): 23 | def __init__(self, cfg, split): 24 | super(Kinetics400, self).__init__(cfg, split) 25 | if self.split == "test" and self.cfg.PRETRAIN.ENABLE == False: 26 | self._pre_transformation_config_required = True 27 | 28 | def _get_dataset_list_name(self): 29 | """ 30 | Returns the list for the dataset. 
31 | Returns: 32 | name (str): name of the list to be read 33 | """ 34 | name = "kinetics400_{}_list.txt".format( 35 | self.split, 36 | ) 37 | logger.info("Reading video list from file: {}".format(name)) 38 | return name 39 | 40 | def _get_sample_info(self, index): 41 | """ 42 | Returns the sample info corresponding to the index. 43 | Args: 44 | index (int): target index 45 | Returns: 46 | sample_info (dict): contains different informations to be used later 47 | "path": indicating the target's path w.r.t. index 48 | "supervised_label": indicating the class of the target 49 | """ 50 | video_path, class_, = self._samples[index].strip().split(" ") 51 | class_ = int(class_) 52 | video_path = os.path.join(self.data_root_dir, video_path) 53 | sample_info = { 54 | "path": video_path, 55 | "supervised_label": class_, 56 | } 57 | return sample_info 58 | 59 | def _pre_transformation_config(self): 60 | """ 61 | Set transformation parameters if required. 62 | """ 63 | self.resize_video.set_spatial_index(self.spatial_idx) 64 | 65 | def _custom_sampling(self, vid_length, vid_fps, clip_idx, num_clips, num_frames, interval=2, random_sample=True): 66 | return self._interval_based_sampling(vid_length, vid_fps, clip_idx, num_clips, num_frames, interval) 67 | 68 | def _get_ssl_label(self): 69 | pass # making python happy -------------------------------------------------------------------------------- /tadaconv/datasets/base/ssv2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Something-Something-V2 dataset. """ 5 | 6 | import os 7 | import random 8 | import torch 9 | import torch.utils.data 10 | import tadaconv.utils.logging as logging 11 | 12 | import time 13 | import oss2 as oss 14 | 15 | import tadaconv.utils.bucket as bu 16 | from tadaconv.datasets.base.builder import DATASET_REGISTRY 17 | from tadaconv.datasets.base.base_dataset import BaseVideoDataset 18 | 19 | logger = logging.get_logger(__name__) 20 | 21 | @DATASET_REGISTRY.register() 22 | class Ssv2(BaseVideoDataset): 23 | def __init__(self, cfg, split): 24 | super(Ssv2, self).__init__(cfg, split) 25 | if self.split == "test" and self.cfg.PRETRAIN.ENABLE == False: 26 | self._pre_transformation_config_required = True 27 | 28 | def _get_dataset_list_name(self): 29 | """ 30 | Returns the list for the dataset. 31 | Returns: 32 | name (str): name of the list to be read 33 | """ 34 | name = "something-something-v2-{}-with-label.json".format( 35 | "train" if self.split == "train" else "validation", 36 | ) 37 | logger.info("Reading video list from file: {}".format(name)) 38 | return name 39 | 40 | def _get_sample_info(self, index): 41 | """ 42 | Returns the sample info corresponding to the index. 43 | Args: 44 | index (int): target index 45 | Returns: 46 | sample_info (dict): contains different informations to be used later 47 | "path": indicating the target's path w.r.t. index 48 | "supervised_label": indicating the class of the target 49 | """ 50 | class_ = self._samples[index]["label_idx"] 51 | video_path = os.path.join(self.data_root_dir, self._samples[index]["id"]+".mp4") 52 | sample_info = { 53 | "path": video_path, 54 | "supervised_label": class_, 55 | } 56 | return sample_info 57 | 58 | def _pre_transformation_config(self): 59 | """ 60 | Set transformation parameters if required. 
61 | """ 62 | self.resize_video.set_spatial_index(self.spatial_idx) 63 | 64 | def _custom_sampling(self, vid_length, vid_fps, clip_idx, num_clips, num_frames, interval=2, random_sample=True): 65 | pass # making python happy 66 | 67 | def _get_ssl_label(self): 68 | pass # making python happy -------------------------------------------------------------------------------- /tadaconv/datasets/base/ucf101.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ UCF101 dataset. """ 5 | 6 | import os 7 | import random 8 | import torch 9 | import torch.utils.data 10 | import tadaconv.utils.logging as logging 11 | 12 | import time 13 | import oss2 as oss 14 | 15 | import tadaconv.utils.bucket as bu 16 | from tadaconv.datasets.base.builder import DATASET_REGISTRY 17 | from tadaconv.datasets.base.base_dataset import BaseVideoDataset 18 | 19 | logger = logging.get_logger(__name__) 20 | 21 | @DATASET_REGISTRY.register() 22 | class Ucf101(BaseVideoDataset): 23 | def __init__(self, cfg, split): 24 | super(Ucf101, self).__init__(cfg, split) 25 | if self.split == "test" and self.cfg.PRETRAIN.ENABLE == False: 26 | self._pre_transformation_config_required = True 27 | 28 | def _get_dataset_list_name(self): 29 | """ 30 | Returns the list for the dataset. 31 | Returns: 32 | name (str): name of the list to be read 33 | """ 34 | name = "ucf101_{}_list.txt".format( 35 | "train" if "train" in self.split else "test", 36 | ) 37 | logger.info("Reading video list from file: {}".format(name)) 38 | return name 39 | 40 | def _get_sample_info(self, index): 41 | """ 42 | Returns the sample info corresponding to the index. 43 | Args: 44 | index (int): target index 45 | Returns: 46 | sample_info (dict): contains different informations to be used later 47 | "path": indicating the target's path w.r.t. index 48 | "supervised_label": indicating the class of the target 49 | """ 50 | video_path, class_, = self._samples[index].strip().split(" ") 51 | class_ = int(class_) 52 | video_path = os.path.join(self.data_root_dir, video_path) 53 | sample_info = { 54 | "path": video_path, 55 | "supervised_label": class_, 56 | } 57 | return sample_info 58 | 59 | def _pre_transformation_config(self): 60 | """ 61 | Set transformation parameters if required. 62 | """ 63 | self.resize_video.set_spatial_index(self.spatial_idx) 64 | 65 | def _custom_sampling(self, vid_length, vid_fps, clip_idx, num_clips, num_frames, interval=2, random_sample=True): 66 | return self._interval_based_sampling(vid_length, vid_fps, clip_idx, num_clips, num_frames, interval) 67 | 68 | def _get_ssl_label(self): 69 | pass # making python happy 70 | -------------------------------------------------------------------------------- /tadaconv/datasets/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alibaba-mmai-research/TAdaConv/75b7839b37fc94d98d4fe5f2aff4b3df4e347dfb/tadaconv/datasets/utils/__init__.py -------------------------------------------------------------------------------- /tadaconv/datasets/utils/collate_functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Collate functions. 
""" 5 | 6 | import random 7 | from tadaconv.utils.registry import Registry 8 | from torch.utils.data._utils.collate import default_collate 9 | import torch.nn.functional as F 10 | 11 | COLLATE_FN_REGISTRY = Registry() 12 | 13 | @COLLATE_FN_REGISTRY.register() 14 | class ZeroShotCollate(object): 15 | def __init__(self, cfg): 16 | self.cfg = cfg 17 | 18 | def __call__(self, batch): 19 | batch = default_collate(batch) 20 | batch[0]["text_embedding"] = batch[0]["text_embedding"][0].unsqueeze(0) 21 | return batch -------------------------------------------------------------------------------- /tadaconv/datasets/utils/preprocess_ssv2.py: -------------------------------------------------------------------------------- 1 | 2 | from email.policy import default 3 | import os 4 | import sys 5 | import json 6 | import tqdm 7 | import argparse 8 | 9 | 10 | # ---- config 11 | 12 | anno_path = "" # where you put your annotation files 13 | data_path = "" # where you put original webm videos 14 | data_out_path = "" # where to put the converted mp4 videos 15 | 16 | def main( 17 | anno_conversion, data_conversion, num_splits, split_id, split, 18 | anno_path, data_path, data_out_path 19 | ): 20 | 21 | # ---- anno conversion 22 | 23 | if anno_conversion: 24 | 25 | with open(os.path.join(anno_path, "something-something-v2-labels.json"), "r") as f: 26 | labels = json.load(f) 27 | 28 | print(f"Converting file: {os.path.join(anno_path, 'something-something-v2-train.json')}.") 29 | trainset_samples = [] 30 | with open(os.path.join(anno_path, "something-something-v2-train.json"), "r") as f: 31 | lines = json.load(f) 32 | for line in lines: 33 | line['label_idx'] = int(labels[line['template'].replace('[', '').replace(']', '') ]) 34 | trainset_samples.append(line) 35 | 36 | with open(os.path.join(anno_path, "something-something-v2-train-with-label.json"), "w") as f: 37 | json.dump(trainset_samples, f, indent=4) 38 | 39 | print(f"Converting file: {os.path.join(anno_path, 'something-something-v2-validation.json')}.") 40 | val_samples = [] 41 | with open(os.path.join(anno_path, "something-something-v2-validation.json"), "r") as f: 42 | lines = json.load(f) 43 | for line in lines: 44 | line['label_idx'] = int(labels[line['template'].replace('[', '').replace(']', '') ]) 45 | val_samples.append(line) 46 | 47 | 48 | with open(os.path.join(anno_path, "something-something-v2-validation-with-label.json"), "w") as f: 49 | json.dump(val_samples, f, indent=4) 50 | 51 | # ---- convert files 52 | 53 | if data_conversion: 54 | 55 | if not os.path.exists(data_out_path): 56 | os.mkdir(data_out_path) 57 | 58 | if not anno_conversion: 59 | print("Loading train samples") 60 | trainset_samples = [] 61 | with open(os.path.join(anno_path, "something-something-v2-train.json"), "r") as f: 62 | lines = json.load(f) 63 | for line in lines: 64 | trainset_samples.append(line) 65 | print("Loading val samples") 66 | val_samples = [] 67 | with open(os.path.join(anno_path, "something-something-v2-validation.json"), "r") as f: 68 | lines = json.load(f) 69 | for line in lines: 70 | val_samples.append(line) 71 | print(len(trainset_samples)) 72 | print(len(val_samples)) 73 | 74 | if split_id < num_splits-1: 75 | trainset_samples_torun = trainset_samples[ 76 | split_id * round(len(trainset_samples)/num_splits): (split_id+1) * round(len(trainset_samples)/num_splits) 77 | ] 78 | val_samples_torun = val_samples[ 79 | split_id * round(len(val_samples)/num_splits): (split_id+1) * round(len(val_samples)/num_splits) 80 | ] 81 | else: 82 | trainset_samples_torun = 
trainset_samples[ 83 | split_id * round(len(trainset_samples)/num_splits): 84 | ] 85 | val_samples_torun = val_samples[ 86 | split_id * round(len(val_samples)/num_splits): 87 | ] 88 | 89 | if split in ['all', 'train']: 90 | print("converting train samples") 91 | for i, sample in enumerate(tqdm.tqdm(trainset_samples_torun)): 92 | name = sample['id'] 93 | input_file = f'{name}.webm' 94 | output_file = f'{name}.mp4' 95 | cmd = f"ffmpeg -i {data_path}/{input_file} -vf 'pad=ceil(iw/2)*2:ceil(ih/2)*2' {data_out_path}/{output_file} -loglevel error -y" 96 | os.system(cmd) 97 | 98 | if split in ['all', 'val']: 99 | print("converting val samples") 100 | for i, sample in enumerate(tqdm.tqdm(val_samples_torun)): 101 | name = sample['id'] 102 | input_file = f'{name}.webm' 103 | output_file = f'{name}.mp4' 104 | cmd = f"ffmpeg -i {data_path}/{input_file} -vf 'pad=ceil(iw/2)*2:ceil(ih/2)*2' {data_out_path}/{output_file} -loglevel error -y" 105 | os.system(cmd) 106 | 107 | if __name__ == "__main__": 108 | parser = argparse.ArgumentParser(description='Process SSV2 annos and data.') 109 | parser.add_argument('--anno', action='store_true') 110 | parser.add_argument('--data', action='store_true') 111 | parser.add_argument('--num_splits', type=int, default=1) 112 | parser.add_argument('--split_id', type=int, default=0) 113 | parser.add_argument('--split', type=str, default="all") 114 | parser.add_argument('--anno_path', type=str, default=anno_path) 115 | parser.add_argument('--data_path', type=str, default=data_path) 116 | parser.add_argument('--data_out_path', type=str, default=data_out_path) 117 | args = parser.parse_args() 118 | main( 119 | args.anno, args.data, args.num_splits, args.split_id, args.split, 120 | args.anno_path, args.data_path, args.data_out_path 121 | ) -------------------------------------------------------------------------------- /tadaconv/datasets/utils/random_erasing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ 5 | Random erasing classes. 6 | This file is modified from https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/random_erasing.py. 7 | """ 8 | 9 | import random 10 | import math 11 | import torch 12 | 13 | 14 | def _get_pixels(per_pixel, rand_color, patch_size, dtype=torch.float32, device='cuda'): 15 | # NOTE I've seen CUDA illegal memory access errors being caused by the normal_() 16 | # paths, flip the order so normal is run on CPU if this becomes a problem 17 | # Issue has been fixed in master https://github.com/pytorch/pytorch/issues/19508 18 | if per_pixel: 19 | return torch.empty(patch_size, dtype=dtype, device=device).normal_() 20 | elif rand_color: 21 | return torch.empty((patch_size[0], 1, 1, 1), dtype=dtype, device=device).normal_() 22 | else: 23 | return torch.zeros((patch_size[0], 1, 1, 1), dtype=dtype, device=device) 24 | 25 | 26 | class RandomErasing: 27 | """ Randomly selects a rectangle region in an image and erases its pixels. 28 | 'Random Erasing Data Augmentation' by Zhong et al. 29 | See https://arxiv.org/pdf/1708.04896.pdf 30 | 31 | This variant of RandomErasing is intended to be applied to either a batch 32 | or single image tensor after it has been normalized by dataset mean and std. 33 | Args: 34 | probability: Probability that the Random Erasing operation will be performed. 35 | min_area: Minimum percentage of erased area wrt input image area. 
36 | max_area: Maximum percentage of erased area wrt input image area. 37 | min_aspect: Minimum aspect ratio of erased area. 38 | mode: pixel color mode, one of 'const', 'rand', or 'pixel' 39 | 'const' - erase block is constant color of 0 for all channels 40 | 'rand' - erase block is same per-channel random (normal) color 41 | 'pixel' - erase block is per-pixel random (normal) color 42 | max_count: maximum number of erasing blocks per image, area per box is scaled by count. 43 | per-image count is randomly chosen between 1 and this value. 44 | """ 45 | 46 | def __init__(self, cfg,): 47 | """ 48 | Args: 49 | cfg (Config): global config object. 50 | """ 51 | self.enable = cfg.AUGMENTATION.RANDOM_ERASING.ENABLE 52 | self.probability = cfg.AUGMENTATION.RANDOM_ERASING.PROB 53 | self.min_area, self.max_area = cfg.AUGMENTATION.RANDOM_ERASING.AREA_RANGE 54 | 55 | min_aspect = cfg.AUGMENTATION.RANDOM_ERASING.MIN_ASPECT 56 | max_aspect = 1 / min_aspect 57 | self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) 58 | 59 | self.min_count, self.max_count = cfg.AUGMENTATION.RANDOM_ERASING.COUNT 60 | self.num_splits = cfg.AUGMENTATION.RANDOM_ERASING.NUM_SPLITS 61 | mode = cfg.AUGMENTATION.RANDOM_ERASING.MODE.lower() 62 | self.rand_color = False 63 | self.per_pixel = False 64 | if mode == 'rand': 65 | self.rand_color = True # per block random normal 66 | elif mode == 'pixel': 67 | self.per_pixel = True # per pixel random normal 68 | else: 69 | assert not mode or mode == 'const' 70 | 71 | def _erase(self, img, chan, num_frames, img_h, img_w, dtype): 72 | if random.random() > self.probability: 73 | return 74 | area = img_h * img_w 75 | count = self.min_count if self.min_count == self.max_count else \ 76 | random.randint(self.min_count, self.max_count) 77 | for _ in range(count): 78 | for attempt in range(10): 79 | target_area = random.uniform(self.min_area, self.max_area) * area / count 80 | aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) 81 | h = int(round(math.sqrt(target_area * aspect_ratio))) 82 | w = int(round(math.sqrt(target_area / aspect_ratio))) 83 | if w < img_w and h < img_h: 84 | top = random.randint(0, img_h - h) 85 | left = random.randint(0, img_w - w) 86 | img[:, :, top:top + h, left:left + w] = _get_pixels( 87 | self.per_pixel, self.rand_color, (chan, num_frames, h, w), 88 | dtype=dtype, device=img.device) 89 | break 90 | 91 | def __call__(self, input): 92 | if self.enable: 93 | if len(input.size()) == 4: 94 | self._erase(input, *input.size(), input.dtype) 95 | else: 96 | batch_size, chan, num_frames, img_h, img_w = input.size() 97 | # skip first slice of batch if num_splits is set (for clean portion of samples) 98 | batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 99 | for i in range(batch_start, batch_size): 100 | self._erase(input[i], chan, num_frames, img_h, img_w, input.dtype) 101 | return input -------------------------------------------------------------------------------- /tadaconv/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alibaba-mmai-research/TAdaConv/75b7839b37fc94d98d4fe5f2aff4b3df4e347dfb/tadaconv/models/__init__.py -------------------------------------------------------------------------------- /tadaconv/models/base/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 
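# Note: importing the module zoo and the base modules below is what populates the
# registries (e.g. BRANCH_REGISTRY, STEM_REGISTRY, HEAD_REGISTRY), since each module
# registers itself at import time via the corresponding register() decorator.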
3 | 4 | import tadaconv.models.module_zoo 5 | from tadaconv.models.base.base_blocks import BaseHead, Base3DStem 6 | import tadaconv.models.base.transformer 7 | import tadaconv.models.base.slowfast -------------------------------------------------------------------------------- /tadaconv/models/base/builder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Builder for video models. """ 5 | 6 | import sys 7 | import torch 8 | import torch.nn as nn 9 | 10 | import traceback 11 | 12 | import tadaconv.utils.logging as logging 13 | 14 | from tadaconv.models.base.models import BaseVideoModel, MODEL_REGISTRY 15 | from tadaconv.models.utils.model_ema import ModelEmaV2 16 | 17 | logger = logging.get_logger(__name__) 18 | 19 | def build_model(cfg, gpu_id=None): 20 | """ 21 | Builds the video model. 22 | Args: 23 | cfg (Config): global config object that provides specifics to construct the model. 24 | gpu_id (Optional[int]): specify the gpu index to build model. 25 | Returns: 26 | model: constructed model 27 | model_ema: copied model for ema 28 | """ 29 | # Construct the model 30 | if MODEL_REGISTRY.get(cfg.MODEL.NAME) == None: 31 | # attempt to find standard models 32 | model = BaseVideoModel(cfg) 33 | else: 34 | # if the model is explicitly defined, 35 | # it is directly constructed from the model pool 36 | model = MODEL_REGISTRY.get(cfg.MODEL.NAME)(cfg) 37 | 38 | if torch.cuda.is_available(): 39 | assert ( 40 | cfg.NUM_GPUS <= torch.cuda.device_count() 41 | ), "Cannot use more GPU devices than available" 42 | else: 43 | assert ( 44 | cfg.NUM_GPUS == 0 45 | ), "Cuda is not available. Please set `NUM_GPUS: 0 for running on CPUs." 46 | 47 | if cfg.NUM_GPUS: 48 | if gpu_id is None: 49 | # Determine the GPU used by the current process 50 | cur_device = torch.cuda.current_device() 51 | else: 52 | cur_device = gpu_id 53 | model = model.cuda(device=cur_device) 54 | 55 | model_ema = None 56 | if cfg.MODEL.EMA.ENABLE: 57 | model_ema = ModelEmaV2(model, decay=cfg.MODEL.EMA.DECAY) 58 | 59 | try: 60 | # convert batchnorm to be synchronized across 61 | # different GPUs if needed 62 | sync_bn = cfg.BN.SYNC 63 | if sync_bn == True and cfg.NUM_GPUS * cfg.NUM_SHARDS > 1: 64 | model = nn.SyncBatchNorm.convert_sync_batchnorm(model) 65 | except: 66 | sync_bn = None 67 | 68 | # Use multi-process data parallel model in the multi-gpu setting 69 | if cfg.NUM_GPUS*cfg.NUM_SHARDS > 1: 70 | # Make model replica operate on the current device 71 | if cfg.PAI: 72 | # Support distributed training on the cluster 73 | model = torch.nn.parallel.DistributedDataParallel( 74 | module=model 75 | ) 76 | else: 77 | model = torch.nn.parallel.DistributedDataParallel( 78 | module=model, device_ids=[cur_device], output_device=cur_device 79 | ) 80 | 81 | return model, model_ema -------------------------------------------------------------------------------- /tadaconv/models/base/models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | import torch 5 | import torch.nn as nn 6 | from tadaconv.utils.registry import Registry 7 | from tadaconv.models.base.backbone import BACKBONE_REGISTRY 8 | from tadaconv.models.base.base_blocks import HEAD_REGISTRY 9 | 10 | MODEL_REGISTRY = Registry("Model") 11 | 12 | class BaseVideoModel(nn.Module): 13 | """ 14 | Standard video model. 
15 | The model is divided into the backbone and the head, where the backbone 16 | extracts features and the head performs classification. 17 | 18 | The backbones can be defined in model/base/backbone.py or anywhere else 19 | as long as the backbone is registered by the BACKBONE_REGISTRY. 20 | The heads can be defined in model/module_zoo/heads/ or anywhere else 21 | as long as the head is registered by the HEAD_REGISTRY. 22 | 23 | The registries automatically finds the registered modules and construct 24 | the base video model. 25 | """ 26 | def __init__(self, cfg): 27 | """ 28 | Args: 29 | cfg (Config): global config object. 30 | """ 31 | super(BaseVideoModel, self).__init__() 32 | self.cfg = cfg 33 | 34 | # the backbone is created according to meta-architectures 35 | # defined in models/base/backbone.py 36 | self.backbone = BACKBONE_REGISTRY.get(cfg.VIDEO.BACKBONE.META_ARCH)(cfg=cfg) 37 | 38 | # the head is created according to the heads 39 | # defined in models/module_zoo/heads 40 | self.head = HEAD_REGISTRY.get(cfg.VIDEO.HEAD.NAME)(cfg=cfg) 41 | 42 | def forward(self, x): 43 | x = self.backbone(x) 44 | x = self.head(x) 45 | return x 46 | 47 | def train(self, mode=True): 48 | r"""Sets the module in training mode. 49 | 50 | This has any effect only on certain modules. See documentations of 51 | particular modules for details of their behaviors in training/evaluation 52 | mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, 53 | etc. 54 | 55 | Args: 56 | mode (bool): whether to set training mode (``True``) or evaluation 57 | mode (``False``). Default: ``True``. 58 | 59 | Returns: 60 | Module: self 61 | """ 62 | self.training = mode 63 | super(BaseVideoModel, self).train(mode) 64 | for module in self.modules(): 65 | if isinstance(module, (nn.BatchNorm2d, nn.BatchNorm3d, nn.LayerNorm)) and self.cfg.BN.FREEZE: 66 | module.train(False) 67 | return self 68 | 69 | @MODEL_REGISTRY.register() 70 | class MoSINet(BaseVideoModel): 71 | def __init__(self, cfg): 72 | super(MoSINet, self).__init__(cfg) 73 | 74 | def forward(self, x): 75 | if isinstance(x, dict): 76 | x_data = x["video"] 77 | else: 78 | x_data = x 79 | b, n, c, t, h, w = x_data.shape 80 | x_data = x_data.reshape(b*n, c, t, h, w) 81 | res, logits = super(MoSINet, self).forward(x_data) 82 | pred = {} 83 | if isinstance(res, dict): 84 | for k, v in res.items(): 85 | pred[k] = v 86 | else: 87 | pred["move_joint"] = res 88 | return pred, logits -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | from tadaconv.models.module_zoo.heads import * 5 | from tadaconv.models.module_zoo.stems import * 6 | from tadaconv.models.module_zoo.branches import * 7 | from tadaconv.models.module_zoo.ops import * -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/branches/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 
3 | 4 | from tadaconv.models.module_zoo.branches.r2plus1d_branch import R2Plus1DBranch 5 | from tadaconv.models.module_zoo.branches.r2d3d_branch import R2D3DBranch 6 | from tadaconv.models.module_zoo.branches.csn_branch import CSNBranch 7 | from tadaconv.models.module_zoo.branches.slowfast_branch import SlowfastBranch 8 | from tadaconv.models.module_zoo.branches.s3dg_branch import STConv3d 9 | from tadaconv.models.module_zoo.branches.non_local import NonLocal 10 | from tadaconv.models.module_zoo.branches.tada_branch import TAda2DBlock 11 | from tadaconv.models.module_zoo.branches.tadaformer import TAdaFormerBlock 12 | from tadaconv.models.module_zoo.branches.tadaconvnextv2 import TAdaConvNeXtV2Block -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/branches/csn_branch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ CSN Branch. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from tadaconv.models.base.base_blocks import BaseBranch, Base3DStem, BaseHead 10 | from tadaconv.models.base.base_blocks import BRANCH_REGISTRY 11 | 12 | @BRANCH_REGISTRY.register() 13 | class CSNBranch(BaseBranch): 14 | """ 15 | The ir-CSN branch. 16 | 17 | See Du Tran et al. 18 | Video Classification with Channel-Separated Convolutional Networks. 19 | """ 20 | def __init__(self, cfg, block_idx): 21 | """ 22 | Args: 23 | cfg (Config): global config object. 24 | block_idx (list): list of [stage_id, block_id], both starting from 0. 25 | """ 26 | super(CSNBranch, self).__init__(cfg, block_idx) 27 | 28 | def _construct_bottleneck(self): 29 | self.a = nn.Conv3d( 30 | in_channels = self.dim_in, 31 | out_channels = self.num_filters//self.expansion_ratio, 32 | kernel_size = 1, 33 | stride = 1, 34 | padding = 0, 35 | bias = False 36 | ) 37 | self.a_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 38 | self.a_relu = nn.ReLU(inplace=True) 39 | 40 | self.b = nn.Conv3d( 41 | in_channels = self.num_filters//self.expansion_ratio, 42 | out_channels = self.num_filters//self.expansion_ratio, 43 | kernel_size = self.kernel_size, 44 | stride = self.stride, 45 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 46 | bias = False, 47 | groups = self.num_filters//self.expansion_ratio, 48 | ) 49 | self.b_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 50 | self.b_relu = nn.ReLU(inplace=True) 51 | 52 | self.c = nn.Conv3d( 53 | in_channels = self.num_filters//self.expansion_ratio, 54 | out_channels = self.num_filters, 55 | kernel_size = 1, 56 | stride = 1, 57 | padding = 0, 58 | bias = False 59 | ) 60 | self.c_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 61 | 62 | def forward(self, x): 63 | if self.transformation == 'bottleneck': 64 | x = self.a(x) 65 | x = self.a_bn(x) 66 | x = self.a_relu(x) 67 | 68 | x = self.b(x) 69 | x = self.b_bn(x) 70 | x = self.b_relu(x) 71 | 72 | x = self.c(x) 73 | x = self.c_bn(x) 74 | return x 75 | -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/branches/non_local.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ NonLocal block. 
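    Given an input of shape (N, C, T, H, W), the block projects it to query, key and
    value with 1x1x1 convolutions (to C/2 channels), computes scaled dot-product
    attention over all T*H*W positions, re-projects the aggregated values back to the
    input channel dimension, and adds the result to the input as a residual connection.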
""" 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from tadaconv.models.base.base_blocks import BaseBranch, BRANCH_REGISTRY 10 | 11 | @BRANCH_REGISTRY.register() 12 | class NonLocal(BaseBranch): 13 | """ 14 | Non-local block. 15 | 16 | See Xiaolong Wang et al. 17 | Non-local Neural Networks. 18 | """ 19 | 20 | def __init__(self, cfg, block_idx): 21 | super(NonLocal, self).__init__(cfg, block_idx) 22 | 23 | self.dim_middle = self.dim_in // 2 24 | 25 | self.qconv = nn.Conv3d( 26 | self.dim_in, 27 | self.dim_middle, 28 | kernel_size=1, 29 | stride=1, 30 | padding=0 31 | ) 32 | 33 | self.kconv = nn.Conv3d( 34 | self.dim_in, 35 | self.dim_middle, 36 | kernel_size=1, 37 | stride=1, 38 | padding=0 39 | ) 40 | 41 | self.vconv = nn.Conv3d( 42 | self.dim_in, 43 | self.dim_middle, 44 | kernel_size=1, 45 | stride=1, 46 | padding=0 47 | ) 48 | 49 | self.out_conv = nn.Conv3d( 50 | self.dim_middle, 51 | self.num_filters, 52 | kernel_size=1, 53 | stride=1, 54 | padding=0, 55 | ) 56 | self.out_bn = nn.BatchNorm3d(self.num_filters, eps=1e-5, momentum=self.bn_mmt) 57 | 58 | def forward(self, x): 59 | n,c,t,h,w = x.shape 60 | 61 | query = self.qconv(x).view(n, self.dim_middle, -1) 62 | key = self.kconv(x).view(n, self.dim_middle, -1) 63 | value = self.vconv(x).view(n, self.dim_middle, -1) 64 | 65 | attn = torch.einsum("nct,ncp->ntp", (query, key)) 66 | attn = attn * (self.dim_middle ** -0.5) 67 | attn = F.softmax(attn, dim=2) 68 | 69 | out = torch.einsum("ntg,ncg->nct", (attn, value)) 70 | out = out.view(n, self.dim_middle, t, h, w) 71 | out = self.out_conv(out) 72 | out = self.out_bn(out) 73 | return x + out 74 | 75 | 76 | -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/branches/r2d3d_branch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ R2D3D branch. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from tadaconv.models.base.base_blocks import BaseBranch, BaseHead 10 | from tadaconv.models.base.base_blocks import BRANCH_REGISTRY 11 | 12 | @BRANCH_REGISTRY.register() 13 | class R2D3DBranch(BaseBranch): 14 | """ 15 | The R2D3D Branch. 16 | 17 | Essentially the MCx model in 18 | Du Tran et al. 19 | A Closer Look at Spatiotemporal Convoluitions for Action Recognition. 20 | 21 | The model is used in DPC, MemDPC for self-supervised video 22 | representation learning. 23 | """ 24 | def __init__(self, cfg, block_idx): 25 | """ 26 | Args: 27 | cfg (Config): global config object. 28 | block_idx (list): list of [stage_id, block_id], both starting from 0. 
29 | """ 30 | super(R2D3DBranch, self).__init__(cfg, block_idx) 31 | 32 | def _construct_simple_block(self): 33 | self.a = nn.Conv3d( 34 | in_channels = self.dim_in, 35 | out_channels = self.num_filters, 36 | kernel_size = self.kernel_size, 37 | stride = self.stride, 38 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 39 | bias = False 40 | ) 41 | self.a_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 42 | self.a_relu = nn.ReLU(inplace=True) 43 | 44 | self.b = nn.Conv3d( 45 | in_channels = self.num_filters, 46 | out_channels = self.num_filters, 47 | kernel_size = self.kernel_size, 48 | stride = 1, 49 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 50 | bias = False 51 | ) 52 | self.b_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 53 | 54 | def _construct_bottleneck(self): 55 | self.a = nn.Conv3d( 56 | in_channels = self.dim_in, 57 | out_channels = self.num_filters//self.expansion_ratio, 58 | kernel_size = 1, 59 | stride = 1, 60 | padding = 0, 61 | bias = False 62 | ) 63 | self.a_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 64 | self.a_relu = nn.ReLU(inplace=True) 65 | 66 | self.b = nn.Conv3d( 67 | in_channels = self.num_filters//self.expansion_ratio, 68 | out_channels = self.num_filters//self.expansion_ratio, 69 | kernel_size = self.kernel_size, 70 | stride = self.stride, 71 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 72 | bias = False 73 | ) 74 | self.b_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 75 | self.b_relu = nn.ReLU(inplace=True) 76 | 77 | self.c = nn.Conv3d( 78 | in_channels = self.num_filters//self.expansion_ratio, 79 | out_channels = self.num_filters, 80 | kernel_size = 1, 81 | stride = 1, 82 | padding = 0, 83 | bias = False 84 | ) 85 | self.c_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 86 | 87 | def forward(self, x): 88 | if self.transformation == 'simple_block': 89 | x = self.a(x) 90 | x = self.a_bn(x) 91 | x = self.a_relu(x) 92 | 93 | x = self.b(x) 94 | x = self.b_bn(x) 95 | return x 96 | elif self.transformation == 'bottleneck': 97 | x = self.a(x) 98 | x = self.a_bn(x) 99 | x = self.a_relu(x) 100 | 101 | x = self.b(x) 102 | x = self.b_bn(x) 103 | x = self.b_relu(x) 104 | 105 | x = self.c(x) 106 | x = self.c_bn(x) 107 | return x 108 | 109 | -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/branches/s3dg_branch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ S3D/S3DG branch. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from tadaconv.models.base.base_blocks import ( 10 | BRANCH_REGISTRY, InceptionBaseConv3D 11 | ) 12 | 13 | class InceptionBlock3D(nn.Module): 14 | """ 15 | Element constructing the S3D/S3DG. 16 | See models/base/backbone.py L99-186. 17 | 18 | Modifed from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. 
19 | """ 20 | def __init__(self, cfg, in_planes, out_planes): 21 | super(InceptionBlock3D, self).__init__() 22 | 23 | _gating = cfg.VIDEO.BACKBONE.BRANCH.GATING 24 | 25 | assert len(out_planes) == 6 26 | assert isinstance(out_planes, list) 27 | 28 | [num_out_0_0a, 29 | num_out_1_0a, num_out_1_0b, 30 | num_out_2_0a, num_out_2_0b, 31 | num_out_3_0b] = out_planes 32 | 33 | self.branch0 = nn.Sequential( 34 | InceptionBaseConv3D(cfg, in_planes, num_out_0_0a, kernel_size=1, stride=1), 35 | ) 36 | self.branch1 = nn.Sequential( 37 | InceptionBaseConv3D(cfg, in_planes, num_out_1_0a, kernel_size=1, stride=1), 38 | BRANCH_REGISTRY.get(cfg.VIDEO.BACKBONE.BRANCH.NAME)(cfg, num_out_1_0a, num_out_1_0b, kernel_size=3, stride=1, padding=1), 39 | ) 40 | self.branch2 = nn.Sequential( 41 | InceptionBaseConv3D(cfg, in_planes, num_out_2_0a, kernel_size=1, stride=1), 42 | BRANCH_REGISTRY.get(cfg.VIDEO.BACKBONE.BRANCH.NAME)(cfg, num_out_2_0a, num_out_2_0b, kernel_size=3, stride=1, padding=1), 43 | ) 44 | self.branch3 = nn.Sequential( 45 | nn.MaxPool3d(kernel_size=(3, 3, 3), stride=1, padding=1), 46 | InceptionBaseConv3D(cfg, in_planes, num_out_3_0b, kernel_size=1, stride=1), 47 | ) 48 | 49 | self.out_channels = sum([num_out_0_0a, num_out_1_0b, num_out_2_0b, num_out_3_0b]) 50 | 51 | self.gating = _gating 52 | if _gating: 53 | self.gating_b0 = SelfGating(num_out_0_0a) 54 | self.gating_b1 = SelfGating(num_out_1_0b) 55 | self.gating_b2 = SelfGating(num_out_2_0b) 56 | self.gating_b3 = SelfGating(num_out_3_0b) 57 | 58 | 59 | def forward(self, x): 60 | x0 = self.branch0(x) 61 | x1 = self.branch1(x) 62 | x2 = self.branch2(x) 63 | x3 = self.branch3(x) 64 | if self.gating: 65 | x0 = self.gating_b0(x0) 66 | x1 = self.gating_b1(x1) 67 | x2 = self.gating_b2(x2) 68 | x3 = self.gating_b3(x3) 69 | 70 | out = torch.cat((x0, x1, x2, x3), 1) 71 | 72 | return out 73 | 74 | class SelfGating(nn.Module): 75 | def __init__(self, input_dim): 76 | super(SelfGating, self).__init__() 77 | self.fc = nn.Linear(input_dim, input_dim) 78 | 79 | def forward(self, input_tensor): 80 | """Feature gating as used in S3D-G""" 81 | spatiotemporal_average = torch.mean(input_tensor, dim=[2, 3, 4]) 82 | weights = self.fc(spatiotemporal_average) 83 | weights = torch.sigmoid(weights) 84 | return weights[:, :, None, None, None] * input_tensor 85 | 86 | @BRANCH_REGISTRY.register() 87 | class STConv3d(nn.Module): 88 | """ 89 | Element constructing the S3D/S3DG. 90 | See models/base/backbone.py L99-186. 91 | 92 | Modifed from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. 
93 | """ 94 | def __init__(self,cfg,in_planes,out_planes,kernel_size,stride,padding=0): 95 | super(STConv3d, self).__init__() 96 | if isinstance(stride, tuple): 97 | t_stride = stride[0] 98 | stride = stride[-1] 99 | else: # int 100 | t_stride = stride 101 | 102 | self.bn_mmt = cfg.BN.MOMENTUM 103 | self.bn_eps = cfg.BN.EPS 104 | self._construct_branch( 105 | cfg, 106 | in_planes, 107 | out_planes, 108 | kernel_size, 109 | stride, 110 | t_stride, 111 | padding 112 | ) 113 | 114 | def _construct_branch( 115 | self, 116 | cfg, 117 | in_planes, 118 | out_planes, 119 | kernel_size, 120 | stride, 121 | t_stride, 122 | padding=0 123 | ): 124 | self.conv1 = nn.Conv3d(in_planes, out_planes, kernel_size=(1,kernel_size,kernel_size), 125 | stride=(1,stride,stride),padding=(0,padding,padding), bias=False) 126 | self.conv2 = nn.Conv3d(out_planes,out_planes,kernel_size=(kernel_size,1,1), 127 | stride=(t_stride,1,1),padding=(padding,0,0), bias=False) 128 | 129 | self.bn1=nn.BatchNorm3d(out_planes, eps=self.bn_eps, momentum=self.bn_mmt) 130 | self.bn2=nn.BatchNorm3d(out_planes, eps=self.bn_eps, momentum=self.bn_mmt) 131 | self.relu = nn.ReLU(inplace=True) 132 | 133 | # init 134 | self.conv1.weight.data.normal_(mean=0, std=0.01) # original s3d is truncated normal within 2 std 135 | self.conv2.weight.data.normal_(mean=0, std=0.01) # original s3d is truncated normal within 2 std 136 | self.bn1.weight.data.fill_(1) 137 | self.bn1.bias.data.zero_() 138 | self.bn2.weight.data.fill_(1) 139 | self.bn2.bias.data.zero_() 140 | 141 | def forward(self,x): 142 | x=self.conv1(x) 143 | x=self.bn1(x) 144 | x=self.relu(x) 145 | x=self.conv2(x) 146 | x=self.bn2(x) 147 | x=self.relu(x) 148 | return x 149 | 150 | 151 | -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/branches/slowfast_branch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ SlowFast architectures. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from tadaconv.models.base.base_blocks import BaseBranch 10 | from tadaconv.models.base.base_blocks import BRANCH_REGISTRY 11 | from tadaconv.models.utils.init_helper import _init_convnet_weights 12 | 13 | @BRANCH_REGISTRY.register() 14 | class SlowfastBranch(BaseBranch): 15 | """ 16 | Constructs SlowFast conv branch. 17 | 18 | See Christoph Feichtenhofer et al. 19 | SlowFast Networks for Video Recognition. 
20 | """ 21 | def __init__(self, cfg, block_idx): 22 | super(SlowfastBranch, self).__init__(cfg, block_idx) 23 | 24 | def _construct_simple_block(self): 25 | self.a = nn.Conv3d( 26 | in_channels = self.dim_in, 27 | out_channels = self.num_filters, 28 | kernel_size = self.kernel_size, 29 | stride = self.stride, 30 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 31 | bias = False 32 | ) 33 | self.a_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 34 | self.a_relu = nn.ReLU(inplace=True) 35 | 36 | self.b = nn.Conv3d( 37 | in_channels = self.num_filters, 38 | out_channels = self.num_filters, 39 | kernel_size = self.kernel_size, 40 | stride = 1, 41 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 42 | bias = False 43 | ) 44 | self.b_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 45 | self.b_bn.transform_final_bn = True 46 | 47 | def _construct_bottleneck(self): 48 | self.a = nn.Conv3d( 49 | in_channels = self.dim_in, 50 | out_channels = self.num_filters//self.expansion_ratio, 51 | kernel_size = [3, 1, 1] if self.cfg.VIDEO.BACKBONE.TEMPORAL_CONV_BOTTLENECK[self.stage_id] else 1, 52 | stride = 1, 53 | padding = [1, 0, 0] if self.cfg.VIDEO.BACKBONE.TEMPORAL_CONV_BOTTLENECK[self.stage_id] else 0, 54 | bias = False 55 | ) 56 | self.a_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 57 | self.a_relu = nn.ReLU(inplace=True) 58 | 59 | self.b = nn.Conv3d( 60 | in_channels = self.num_filters//self.expansion_ratio, 61 | out_channels = self.num_filters//self.expansion_ratio, 62 | kernel_size = self.kernel_size, 63 | stride = self.stride, 64 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 65 | bias = False 66 | ) 67 | self.b_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 68 | self.b_relu = nn.ReLU(inplace=True) 69 | 70 | self.c = nn.Conv3d( 71 | in_channels = self.num_filters//self.expansion_ratio, 72 | out_channels = self.num_filters, 73 | kernel_size = 1, 74 | stride = 1, 75 | padding = 0, 76 | bias = False 77 | ) 78 | self.c_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 79 | self.c_bn.transform_final_bn = True 80 | 81 | def forward(self, x): 82 | if self.transformation == 'simple_block': 83 | x = self.a(x) 84 | x = self.a_bn(x) 85 | x = self.a_relu(x) 86 | 87 | x = self.b(x) 88 | x = self.b_bn(x) 89 | return x 90 | elif self.transformation == 'bottleneck': 91 | x = self.a(x) 92 | x = self.a_bn(x) 93 | x = self.a_relu(x) 94 | 95 | x = self.b(x) 96 | x = self.b_bn(x) 97 | x = self.b_relu(x) 98 | 99 | x = self.c(x) 100 | x = self.c_bn(x) 101 | return x -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/branches/tada_branch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ TAda Branch. 
""" 5 | 6 | import math 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.nn.modules.utils import _triple 11 | 12 | from tadaconv.models.base.base_blocks import BaseBranch, Base3DStem, BaseHead 13 | from tadaconv.models.base.base_blocks import BRANCH_REGISTRY 14 | from tadaconv.models.module_zoo.ops.tadaconv import RouteFuncMLP, TAdaConv2d 15 | 16 | @BRANCH_REGISTRY.register() 17 | class TAda2DBlock(BaseBranch): 18 | """ 19 | The TAdaConv branch with average pooling as the feature aggregation scheme. 20 | 21 | For details, see 22 | Ziyuan Huang, Shiwei Zhang, Liang Pan, Zhiwu Qing, Mingqian Tang, Ziwei Liu, and Marcelo H. Ang Jr. 23 | "TAda! Temporally-Adaptive Convolutions for Video Understanding." 24 | 25 | """ 26 | def __init__(self, cfg, block_idx): 27 | super(TAda2DBlock, self).__init__(cfg, block_idx, construct_branch=False) 28 | 29 | self._construct_branch() 30 | 31 | def _construct_bottleneck(self): 32 | self.a = nn.Conv3d( 33 | in_channels = self.dim_in, 34 | out_channels = self.num_filters//self.expansion_ratio, 35 | kernel_size = 1, 36 | stride = 1, 37 | padding = 0, 38 | bias = False 39 | ) 40 | self.a_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 41 | self.a_relu = nn.ReLU(inplace=True) 42 | 43 | self.b = TAdaConv2d( 44 | in_channels = self.num_filters//self.expansion_ratio, 45 | out_channels = self.num_filters//self.expansion_ratio, 46 | kernel_size = [1, self.kernel_size[1], self.kernel_size[2]], 47 | stride = [1, self.stride[1], self.stride[2]], 48 | padding = [0, self.kernel_size[1]//2, self.kernel_size[2]//2], 49 | bias = False 50 | ) 51 | self.b_rf = RouteFuncMLP( 52 | c_in=self.num_filters//self.expansion_ratio, 53 | ratio=self.cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_R, 54 | kernels=self.cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_K, 55 | ) 56 | self.b_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 57 | 58 | self.b_avgpool = nn.AvgPool3d( 59 | kernel_size=[ 60 | self.cfg.VIDEO.BACKBONE.BRANCH.POOL_K[0], 61 | self.cfg.VIDEO.BACKBONE.BRANCH.POOL_K[1], 62 | self.cfg.VIDEO.BACKBONE.BRANCH.POOL_K[2] 63 | ], 64 | stride=1, 65 | padding=[ 66 | self.cfg.VIDEO.BACKBONE.BRANCH.POOL_K[0]//2, 67 | self.cfg.VIDEO.BACKBONE.BRANCH.POOL_K[1]//2, 68 | self.cfg.VIDEO.BACKBONE.BRANCH.POOL_K[2]//2 69 | ], 70 | ) 71 | self.b_avgpool_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 72 | self.b_avgpool_bn.skip_init=True 73 | self.b_avgpool_bn.weight.data.zero_() 74 | self.b_avgpool_bn.bias.data.zero_() 75 | 76 | self.b_relu = nn.ReLU(inplace=True) 77 | 78 | self.c = nn.Conv3d( 79 | in_channels = self.num_filters//self.expansion_ratio, 80 | out_channels = self.num_filters, 81 | kernel_size = 1, 82 | stride = 1, 83 | padding = 0, 84 | bias = False 85 | ) 86 | self.c_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 87 | 88 | def forward(self, x): 89 | if self.transformation == 'bottleneck': 90 | x = self.a(x) 91 | x = self.a_bn(x) 92 | x = self.a_relu(x) 93 | 94 | x = self.b(x, self.b_rf(x)) 95 | x = self.b_bn(x) + self.b_avgpool_bn(self.b_avgpool(x)) 96 | x = self.b_relu(x) 97 | 98 | x = self.c(x) 99 | x = self.c_bn(x) 100 | return x -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/branches/tadaconvnextv2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 
Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ TAdaConvNeXtV2 block. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from collections import OrderedDict 10 | 11 | from tadaconv.models.module_zoo.ops.misc import QuickGELU 12 | from tadaconv.models.utils.init_helper import trunc_normal_ 13 | from tadaconv.models.base.base_blocks import BRANCH_REGISTRY, DropPath 14 | from tadaconv.models.module_zoo.ops.tadaconv_v2 import TAdaConv2dV2, RouteFuncwTransformer 15 | from tadaconv.models.module_zoo.ops.misc import LayerNorm 16 | 17 | @BRANCH_REGISTRY.register() 18 | class TAdaConvNeXtV2Block(nn.Module): 19 | r""" TAdaConvNeXtV2 Block. 20 | Args: 21 | cfg (Config): the global config object. 22 | drop_path (float): Stochastic depth rate. Default: 0.0 23 | layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 24 | """ 25 | def __init__(self, cfg, dim, drop_path=0., layer_scale_init_value=1e-6): 26 | super().__init__() 27 | self.dwconv = TAdaConv2dV2( 28 | dim, dim, kernel_size=(1,7,7), padding=(0,3,3), groups=dim, 29 | cal_dim="cout", 30 | internal_rf_func=False, 31 | internal_temp_aggr=False 32 | ) 33 | self.dwconv_rf = RouteFuncwTransformer( 34 | c_in=dim, 35 | ratio=cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_R, 36 | kernels=cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_K, 37 | with_bias_cal=self.dwconv.bias is not None, 38 | zero_init_cal=True, 39 | head_dim=cfg.VIDEO.BACKBONE.BRANCH.HEAD_DIM if hasattr(cfg.VIDEO.BACKBONE.BRANCH, "HEAD_DIM") else 48 40 | ) 41 | self.norm = LayerNorm(dim, eps=1e-6) 42 | self.avgpool = nn.AvgPool3d(kernel_size=(3,1,1),stride=(1,1,1),padding=(1,0,0)) 43 | self.norm_avgpool = LayerNorm(dim, eps=1e-6) 44 | self.norm_avgpool.weight.data.zero_() 45 | self.norm_avgpool.bias.data.zero_() 46 | self.pwconv1 = nn.Linear(dim, 4 * dim) 47 | self.act = QuickGELU() 48 | self.pwconv2 = nn.Linear(4 * dim, dim) 49 | self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), 50 | requires_grad=True) if layer_scale_init_value > 0 else None 51 | self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() 52 | 53 | def forward(self, x): 54 | input = x 55 | 56 | x = self.dwconv(x, reshape_required=False, alpha=self.dwconv_rf(x)) 57 | 58 | # temporal aggregation 59 | norm_avgpool_x = self.avgpool(x) 60 | x = x.permute(0, 2, 3, 4, 1) # (N, C, T, H, W) -> (N, T, H, W, C) 61 | norm_avgpool_x = norm_avgpool_x.permute(0, 2, 3, 4, 1) # (N, C, T, H, W) -> (N, T, H, W, C) 62 | x = self.norm(x) + self.norm_avgpool(norm_avgpool_x) 63 | 64 | x = self.pwconv1(x) 65 | x = self.act(x) 66 | x = self.pwconv2(x) 67 | if self.gamma is not None: 68 | x = self.gamma * x 69 | x = x.permute(0, 4, 1, 2, 3) # (N, T, H, W, C) -> (N, C, T, H, W) 70 | 71 | x = input + self.drop_path(x) 72 | return x -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/heads/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | from .mosi_head import MoSIHeadJoint 5 | from .slowfast_head import SlowFastHead 6 | from .transformer_head import TransformerHead 7 | from .bmn_head import BaseBMN 8 | -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/heads/transformer_head.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 
3 | 4 | """ Transformer heads. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from tadaconv.models.base.base_blocks import BaseHead 10 | from tadaconv.models.base.base_blocks import HEAD_REGISTRY 11 | 12 | from collections import OrderedDict 13 | from tadaconv.models.utils.init_helper import lecun_normal_, trunc_normal_, _init_transformer_weights 14 | 15 | @HEAD_REGISTRY.register() 16 | class TransformerHead(BaseHead): 17 | """ 18 | Construct head for video vision transformers. 19 | """ 20 | def __init__(self, cfg): 21 | """ 22 | Args: 23 | cfg (Config): global config object. 24 | """ 25 | super(TransformerHead, self).__init__(cfg) 26 | self.apply(_init_transformer_weights) 27 | 28 | def _construct_head( 29 | self, 30 | dim, 31 | num_classes, 32 | dropout_rate, 33 | activation_func, 34 | ): 35 | if self.cfg.VIDEO.HEAD.PRE_LOGITS: 36 | self.pre_logits = nn.Sequential(OrderedDict([ 37 | ('fc', nn.Linear(dim, dim)), 38 | ('act', nn.Tanh()) 39 | ])) 40 | 41 | self.linear = nn.Linear(dim, num_classes) 42 | 43 | if dropout_rate > 0.0: 44 | self.dropout = nn.Dropout(dropout_rate) 45 | 46 | if activation_func == "softmax": 47 | self.activation = nn.Softmax(dim=-1) 48 | elif activation_func == "sigmoid": 49 | self.activation = nn.Sigmoid() 50 | elif activation_func == "identity": 51 | self.activation = nn.Identity() 52 | else: 53 | raise NotImplementedError( 54 | "{} is not supported as an activation" 55 | "function.".format(activation_func) 56 | ) 57 | 58 | def forward(self, x): 59 | """ 60 | Returns: 61 | x (Tensor): classification predictions. 62 | logits (Tensor): global average pooled features. 63 | """ 64 | if hasattr(self, "dropout"): 65 | out = self.dropout(x) 66 | else: 67 | out = x 68 | if hasattr(self, "pre_logits"): 69 | out = self.pre_logits(out) 70 | out = self.linear(out) 71 | 72 | if not self.training: 73 | out = self.activation(out) 74 | return out, x 75 | 76 | @HEAD_REGISTRY.register() 77 | class TransformerHeadx2(BaseHead): 78 | """ 79 | The Transformer head for EPIC-KITCHENS dataset. 80 | """ 81 | def __init__(self, cfg): 82 | """ 83 | Args: 84 | cfg (Config): global config object. 85 | """ 86 | super(TransformerHeadx2, self).__init__(cfg) 87 | self.apply(_init_transformer_weights) 88 | 89 | def _construct_head( 90 | self, 91 | dim, 92 | num_classes, 93 | dropout_rate, 94 | activation_func, 95 | ): 96 | if self.cfg.VIDEO.HEAD.PRE_LOGITS: 97 | self.pre_logits1 = nn.Sequential(OrderedDict([ 98 | ('fc', nn.Linear(dim, dim)), 99 | ('act', nn.Tanh()) 100 | ])) 101 | self.pre_logits2 = nn.Sequential(OrderedDict([ 102 | ('fc', nn.Linear(dim, dim)), 103 | ('act', nn.Tanh()) 104 | ])) 105 | self.linear1 = nn.Linear(dim, num_classes[0], bias=True) 106 | self.linear2 = nn.Linear(dim, num_classes[1], bias=True) 107 | 108 | if dropout_rate > 0.0: 109 | self.dropout = nn.Dropout(dropout_rate) 110 | 111 | if activation_func == "softmax": 112 | self.activation = nn.Softmax(dim=-1) 113 | elif activation_func == "sigmoid": 114 | self.activation = nn.Sigmoid() 115 | elif activation_func == "identity": 116 | self.activation = nn.Identity() 117 | else: 118 | raise NotImplementedError( 119 | "{} is not supported as an activation" 120 | "function.".format(activation_func) 121 | ) 122 | 123 | def forward(self, x): 124 | """ 125 | Returns: 126 | x (dict): dictionary of classification predictions, 127 | with keys "verb_class" and "noun_class" indicating 128 | the predictions on the verb and noun. 129 | logits (Tensor): global average pooled features. 
130 | """ 131 | if hasattr(self, "dropout"): 132 | out1 = self.dropout(x) 133 | out2 = self.dropout(x) 134 | else: 135 | out1 = x 136 | out2 = x 137 | 138 | if hasattr(self, "pre_logits1"): 139 | out1 = self.pre_logits1(out1) 140 | out2 = self.pre_logits2(out2) 141 | 142 | out1 = self.linear1(out1) 143 | out2 = self.linear2(out2) 144 | 145 | if not self.training: 146 | out1 = self.activation(out1) 147 | out2 = self.activation(out2) 148 | return {"verb_class": out1, "noun_class": out2}, x -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/ops/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | from .tadaconv import TAdaConv2d 5 | from .tadaconv_v2 import TAdaConv2dV2 6 | from .misc import LayerNorm -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/ops/misc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Micellaneous operations. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | class LayerNorm(nn.Module): 11 | r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. 12 | The ordering of the dimensions in the inputs. channels_last corresponds to inputs with 13 | shape (batch_size, height, width, channels) while channels_first corresponds to inputs 14 | with shape (batch_size, channels, height, width). 15 | """ 16 | def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): 17 | super().__init__() 18 | self.weight = nn.Parameter(torch.ones(normalized_shape)) 19 | self.bias = nn.Parameter(torch.zeros(normalized_shape)) 20 | self.eps = eps 21 | self.data_format = data_format 22 | if self.data_format not in ["channels_last", "channels_first"]: 23 | raise NotImplementedError 24 | self.normalized_shape = (normalized_shape, ) 25 | 26 | def forward(self, x): 27 | if self.data_format == "channels_last": 28 | return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) 29 | elif self.data_format == "channels_first": 30 | u = x.mean(1, keepdim=True) 31 | s = (x - u).pow(2).mean(1, keepdim=True) 32 | x = (x - u) / torch.sqrt(s + self.eps) 33 | if len(x.shape) == 5: 34 | x = self.weight[:, None, None, None] * x + self.bias[:, None, None, None] 35 | elif len(x.shape) == 3: 36 | x = self.weight[:, None] * x + self.bias[:, None] 37 | return x 38 | 39 | class QuickGELU(nn.Module): 40 | def forward(self, x: torch.Tensor): 41 | return x * torch.sigmoid(1.702 * x) -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/stems/__init__.py: -------------------------------------------------------------------------------- 1 | from .downsample_stem import DownSampleStem 2 | from .r2plus1d_stem import R2Plus1DStem 3 | from .embedding_stem import PatchEmbedStem, TubeletEmbeddingStem -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/stems/downsample_stem.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Downsample Stem. 
""" 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from tadaconv.models.base.base_blocks import Base3DStem 10 | from tadaconv.models.base.base_blocks import STEM_REGISTRY 11 | 12 | @STEM_REGISTRY.register() 13 | class DownSampleStem(Base3DStem): 14 | """ 15 | Inherits base 3D stem and adds a maxpool as downsampling. 16 | """ 17 | def __init__(self, cfg): 18 | super(DownSampleStem, self).__init__(cfg) 19 | self.maxpool = nn.MaxPool3d( 20 | kernel_size = (1, 3, 3), 21 | stride = (1, 2, 2), 22 | padding = (0, 1, 1) 23 | ) 24 | 25 | def forward(self, x): 26 | x = self.a(x) 27 | x = self.a_bn(x) 28 | x = self.a_relu(x) 29 | x = self.maxpool(x) 30 | return x 31 | 32 | -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/stems/embedding_stem.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Embedding stems. """ 5 | 6 | import math 7 | import torch 8 | from torch import nn, einsum 9 | import torch.nn.functional as F 10 | from einops import rearrange, repeat 11 | from tadaconv.models.base.backbone import BACKBONE_REGISTRY 12 | from tadaconv.models.base.base_blocks import ( 13 | STEM_REGISTRY, BRANCH_REGISTRY, HEAD_REGISTRY, DropPath, BaseHead 14 | ) 15 | 16 | @STEM_REGISTRY.register() 17 | class PatchEmbedStem(nn.Module): 18 | """ 19 | Video to Patch Embedding. 20 | """ 21 | def __init__(self, cfg): 22 | """ 23 | Args: 24 | cfg (Config): global config object. 25 | """ 26 | super().__init__() 27 | image_size = cfg.DATA.TRAIN_CROP_SIZE if cfg is not None else 224 # default 224 28 | channels = cfg.DATA.NUM_INPUT_CHANNELS if cfg is not None else 3 # default 3 29 | num_frames = cfg.DATA.NUM_INPUT_FRAMES if cfg is not None else 16 30 | patch_size = cfg.VIDEO.BACKBONE.PATCH_SIZE if cfg is not None else 16 # default 16 31 | dim = cfg.VIDEO.BACKBONE.NUM_FEATURES if cfg is not None else 768 # default 768 32 | 33 | num_patches_per_image = (image_size // patch_size) ** 2 34 | num_patches = num_patches_per_image * num_frames 35 | 36 | self.image_size = image_size 37 | self.patch_size = patch_size 38 | self.num_frames = num_frames 39 | self.num_patches = num_patches 40 | 41 | self.conv1 = nn.Conv3d( 42 | in_channels =channels, 43 | out_channels =dim, 44 | kernel_size =[1, patch_size, patch_size], 45 | stride =[1, patch_size, patch_size], 46 | ) 47 | 48 | def forward(self, x): 49 | b, c, t, h, w, p = *x.shape, self.patch_size 50 | assert h % p == 0 and w % p == 0, f'height {h} and width {w} of video must be divisible by the patch size {p}' 51 | x = self.conv1(x) 52 | # b, c, t, h, w -> b, c, p (p: num patches) 53 | x = x.reshape(x.shape[0], x.shape[1], -1) 54 | # b, c, p -> b, p, c 55 | x = x.permute(0, 2, 1) 56 | return x 57 | 58 | @STEM_REGISTRY.register() 59 | class TubeletEmbeddingStem(nn.Module): 60 | """ 61 | Video to Tubelet Embedding. 62 | """ 63 | def __init__(self, cfg): 64 | """ 65 | Args: 66 | cfg (Config): global config object. 
67 | """ 68 | super().__init__() 69 | image_size = cfg.DATA.TRAIN_CROP_SIZE if cfg is not None else 224 # default 224 70 | channels = cfg.DATA.NUM_INPUT_CHANNELS if cfg is not None else 3 # default 3 71 | num_frames = cfg.DATA.NUM_INPUT_FRAMES if cfg is not None else 16 72 | patch_size = cfg.VIDEO.BACKBONE.PATCH_SIZE if cfg is not None else 16 # default 16 73 | dim = cfg.VIDEO.BACKBONE.NUM_FEATURES if cfg is not None else 768 # default 768 74 | tubelet_size = cfg.VIDEO.BACKBONE.TUBELET_SIZE if cfg is not None else 2 75 | 76 | num_patches_per_image = (image_size // patch_size) ** 2 77 | num_patches = num_patches_per_image * num_frames 78 | 79 | self.image_size = image_size 80 | self.patch_size = patch_size 81 | self.num_frames = num_frames 82 | self.num_patches = num_patches 83 | 84 | self.conv1 = nn.Conv3d( 85 | in_channels =channels, 86 | out_channels =dim, 87 | kernel_size =[tubelet_size, patch_size, patch_size], 88 | stride =[tubelet_size, patch_size, patch_size], 89 | ) 90 | 91 | def forward(self, x): 92 | b, c, t, h, w, p = *x.shape, self.patch_size 93 | assert h % p == 0 and w % p == 0, f'height {h} and width {w} of video must be divisible by the patch size {p}' 94 | x = self.conv1(x) 95 | # b, c, t, h, w -> b, c, p (p: num patches) 96 | x = x.reshape(x.shape[0], x.shape[1], -1) 97 | # b, c, p -> b, p, c 98 | x = x.permute(0, 2, 1) 99 | return x -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/stems/r2plus1d_stem.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ R2Plus1D stem. """ 5 | 6 | import math 7 | import torch 8 | import torch.nn as nn 9 | 10 | from tadaconv.models.base.base_blocks import Base3DStem 11 | from tadaconv.models.base.base_blocks import STEM_REGISTRY 12 | 13 | @STEM_REGISTRY.register() 14 | class R2Plus1DStem(Base3DStem): 15 | """ 16 | R(2+1)D Stem. 
17 | """ 18 | def __init__( 19 | self, 20 | cfg 21 | ): 22 | super(R2Plus1DStem, self).__init__(cfg) 23 | 24 | def _construct_block( 25 | self, 26 | cfg, 27 | dim_in, 28 | num_filters, 29 | kernel_sz, 30 | stride, 31 | bn_eps=1e-5, 32 | bn_mmt=0.1 33 | ): 34 | 35 | mid_dim = int( 36 | math.floor((kernel_sz[0] * kernel_sz[1] * kernel_sz[2] * dim_in * num_filters) / \ 37 | (kernel_sz[1] * kernel_sz[2] * dim_in + kernel_sz[0] * num_filters))) 38 | 39 | self.a1 = nn.Conv3d( 40 | in_channels = dim_in, 41 | out_channels = mid_dim, 42 | kernel_size = [1, kernel_sz[1], kernel_sz[2]], 43 | stride = [1, stride[1], stride[2]], 44 | padding = [0, kernel_sz[1]//2, kernel_sz[2]//2], 45 | bias = False 46 | ) 47 | self.a1_bn = nn.BatchNorm3d(mid_dim, eps=bn_eps, momentum=bn_mmt) 48 | self.a1_relu = nn.ReLU(inplace=True) 49 | 50 | self.a2 = nn.Conv3d( 51 | in_channels = mid_dim, 52 | out_channels = num_filters, 53 | kernel_size = [kernel_sz[0], 1, 1], 54 | stride = [stride[0], 1, 1], 55 | padding = [kernel_sz[0]//2, 0, 0], 56 | bias = False 57 | ) 58 | self.a2_bn = nn.BatchNorm3d(num_filters, eps=bn_eps, momentum=bn_mmt) 59 | self.a2_relu = nn.ReLU(inplace=True) 60 | 61 | def forward(self, x): 62 | x = self.a1(x) 63 | x = self.a1_bn(x) 64 | x = self.a1_relu(x) 65 | 66 | x = self.a2(x) 67 | x = self.a2_bn(x) 68 | x = self.a2_relu(x) 69 | return x -------------------------------------------------------------------------------- /tadaconv/models/utils/lars.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # From https://github.com/open-mmlab/OpenSelfSup/blob/1db69ecebbc129e8fa90cdcea6f2082f0a4e3d17/openselfsup/utils/optimizers.py 3 | 4 | import torch 5 | from torch.optim.optimizer import Optimizer, required 6 | from torch.optim import * 7 | 8 | 9 | class LARS(Optimizer): 10 | r"""Implements layer-wise adaptive rate scaling for SGD. 11 | 12 | Args: 13 | params (iterable): iterable of parameters to optimize or dicts defining 14 | parameter groups 15 | lr (float): base learning rate (\gamma_0) 16 | momentum (float, optional): momentum factor (default: 0) ("m") 17 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 18 | ("\beta") 19 | dampening (float, optional): dampening for momentum (default: 0) 20 | eta (float, optional): LARS coefficient 21 | nesterov (bool, optional): enables Nesterov momentum (default: False) 22 | 23 | Based on Algorithm 1 of the following paper by You, Gitman, and Ginsburg. 
24 | Large Batch Training of Convolutional Networks: 25 | https://arxiv.org/abs/1708.03888 26 | 27 | Example: 28 | >>> optimizer = LARS(model.parameters(), lr=0.1, momentum=0.9, 29 | >>> weight_decay=1e-4, eta=1e-3) 30 | >>> optimizer.zero_grad() 31 | >>> loss_fn(model(input), target).backward() 32 | >>> optimizer.step() 33 | """ 34 | 35 | def __init__(self, 36 | params, 37 | lr=required, 38 | momentum=0, 39 | dampening=0, 40 | weight_decay=0, 41 | eta=0.001, 42 | nesterov=False): 43 | if lr is not required and lr < 0.0: 44 | raise ValueError("Invalid learning rate: {}".format(lr)) 45 | if momentum < 0.0: 46 | raise ValueError("Invalid momentum value: {}".format(momentum)) 47 | if weight_decay < 0.0: 48 | raise ValueError( 49 | "Invalid weight_decay value: {}".format(weight_decay)) 50 | if eta < 0.0: 51 | raise ValueError("Invalid LARS coefficient value: {}".format(eta)) 52 | 53 | defaults = dict( 54 | lr=lr, momentum=momentum, dampening=dampening, 55 | weight_decay=weight_decay, nesterov=nesterov, eta=eta) 56 | if nesterov and (momentum <= 0 or dampening != 0): 57 | raise ValueError("Nesterov momentum requires a momentum and zero dampening") 58 | 59 | super(LARS, self).__init__(params, defaults) 60 | 61 | def __setstate__(self, state): 62 | super(LARS, self).__setstate__(state) 63 | for group in self.param_groups: 64 | group.setdefault('nesterov', False) 65 | 66 | @torch.no_grad() 67 | def step(self, closure=None): 68 | """Performs a single optimization step. 69 | 70 | Args: 71 | closure (callable, optional): A closure that reevaluates the model 72 | and returns the loss. 73 | """ 74 | loss = None 75 | if closure is not None: 76 | with torch.enable_grad(): 77 | loss = closure() 78 | 79 | for group in self.param_groups: 80 | weight_decay = group['weight_decay'] 81 | momentum = group['momentum'] 82 | dampening = group['dampening'] 83 | eta = group['eta'] 84 | nesterov = group['nesterov'] 85 | lr = group['lr'] 86 | lars_exclude = group.get('lars_exclude', False) 87 | 88 | for p in group['params']: 89 | if p.grad is None: 90 | continue 91 | 92 | d_p = p.grad 93 | 94 | if lars_exclude: 95 | local_lr = 1. 96 | else: 97 | weight_norm = torch.norm(p).item() 98 | grad_norm = torch.norm(d_p).item() 99 | # Compute local learning rate for this layer 100 | local_lr = eta * weight_norm / \ 101 | (grad_norm + weight_decay * weight_norm) 102 | 103 | actual_lr = local_lr * lr 104 | d_p = d_p.add(p, alpha=weight_decay).mul(actual_lr) 105 | if momentum != 0: 106 | param_state = self.state[p] 107 | if 'momentum_buffer' not in param_state: 108 | buf = param_state['momentum_buffer'] = \ 109 | torch.clone(d_p).detach() 110 | else: 111 | buf = param_state['momentum_buffer'] 112 | buf.mul_(momentum).add_(d_p, alpha=1 - dampening) 113 | if nesterov: 114 | d_p = d_p.add(buf, alpha=momentum) 115 | else: 116 | d_p = buf 117 | p.add_(-d_p) 118 | 119 | return loss -------------------------------------------------------------------------------- /tadaconv/models/utils/lr_policy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # From https://github.com/facebookresearch/SlowFast/blob/master/slowfast/utils/lr_policy.py 4 | 5 | """Learning rate policy.""" 6 | 7 | import math 8 | 9 | 10 | def get_lr_at_epoch(cfg, cur_epoch): 11 | """ 12 | Retrieve the learning rate of the current epoch with the option to perform 13 | warm up in the beginning of the training stage. 
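# Numeric illustration of the layer-wise scaling in LARS.step() above: the local
# learning rate is eta * ||w|| / (||grad|| + weight_decay * ||w||), so a layer whose
# gradient is small relative to its weight norm takes a proportionally larger step.
# The tensors and hyper-parameters below are arbitrary.
import torch

w = torch.full((10,), 2.0)                   # ||w|| = sqrt(40)  ~ 6.32
g = torch.full((10,), 0.1)                   # ||g|| = sqrt(0.1) ~ 0.32
eta, weight_decay, base_lr = 0.001, 1e-4, 0.1
local_lr = eta * w.norm() / (g.norm() + weight_decay * w.norm())
actual_lr = float(local_lr) * base_lr        # ~ 0.02 * 0.1 = 2.0e-3
assert 1.9e-3 < actual_lr < 2.1e-3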
14 | Args: 15 | cfg (Config): global config object. 16 | cur_epoch (float): the number of epoch of the current training stage. 17 | """ 18 | lr = get_lr_func(cfg.OPTIMIZER.LR_POLICY)(cfg, cur_epoch) 19 | # Perform warm up. 20 | if cur_epoch < cfg.OPTIMIZER.WARMUP_EPOCHS: 21 | lr_start = cfg.OPTIMIZER.WARMUP_START_LR 22 | lr_end = get_lr_func(cfg.OPTIMIZER.LR_POLICY)( 23 | cfg, cfg.OPTIMIZER.WARMUP_EPOCHS 24 | ) 25 | alpha = (lr_end - lr_start) / cfg.OPTIMIZER.WARMUP_EPOCHS 26 | lr = cur_epoch * alpha + lr_start 27 | return lr 28 | 29 | 30 | def lr_func_cosine(cfg, cur_epoch): 31 | """ 32 | Retrieve the learning rate to specified values at specified epoch with the 33 | cosine learning rate schedule. Details can be found in: 34 | Ilya Loshchilov, and Frank Hutter 35 | SGDR: Stochastic Gradient Descent With Warm Restarts. 36 | Args: 37 | cfg (Config): global config object. 38 | cur_epoch (float): the number of epoch of the current training stage. 39 | """ 40 | return ( 41 | cfg.OPTIMIZER.BASE_LR 42 | * (math.cos(math.pi * cur_epoch / cfg.OPTIMIZER.MAX_EPOCH) + 1.0) 43 | * 0.5 44 | ) 45 | 46 | def lr_func_cosine_v2(cfg, cur_epoch): 47 | """ 48 | Retrieve the learning rate to specified values at specified epoch with the 49 | cosine learning rate schedule. Details can be found in: 50 | Ilya Loshchilov, and Frank Hutter 51 | SGDR: Stochastic Gradient Descent With Warm Restarts. 52 | Args: 53 | cfg (CfgNode): configs. Details can be found in 54 | slowfast/config/defaults.py 55 | cur_epoch (float): the number of epoch of the current training stage. 56 | """ 57 | offset = cfg.OPTIMIZER.WARMUP_EPOCHS if cfg.OPTIMIZER.COSINE_AFTER_WARMUP else 0.0 58 | assert cfg.OPTIMIZER.COSINE_END_LR < cfg.OPTIMIZER.BASE_LR 59 | return ( 60 | cfg.OPTIMIZER.COSINE_END_LR 61 | + (cfg.OPTIMIZER.BASE_LR - cfg.OPTIMIZER.COSINE_END_LR) 62 | * ( 63 | math.cos( 64 | math.pi * (cur_epoch - offset) / (cfg.OPTIMIZER.MAX_EPOCH - offset) 65 | ) 66 | + 1.0 67 | ) 68 | * 0.5 69 | ) 70 | 71 | def lr_func_steps_with_relative_lrs(cfg, cur_epoch): 72 | """ 73 | Retrieve the learning rate to specified values at specified epoch with the 74 | steps with relative learning rate schedule. 75 | Args: 76 | cfg (Config): global config object. 77 | cur_epoch (float): the number of epoch of the current training stage. 78 | """ 79 | ind = get_step_index(cfg, cur_epoch) 80 | return cfg.OPTIMIZER.LRS[ind] * cfg.OPTIMIZER.BASE_LR 81 | 82 | 83 | def get_step_index(cfg, cur_epoch): 84 | """ 85 | Retrieves the lr step index for the given epoch. 86 | Args: 87 | cfg (Config): global config object. 88 | cur_epoch (float): the number of epoch of the current training stage. 89 | """ 90 | steps = cfg.OPTIMIZER.STEPS + [cfg.OPTIMIZER.MAX_EPOCH] 91 | for ind, step in enumerate(steps): # NoQA 92 | if cur_epoch < step: 93 | break 94 | return ind - 1 95 | 96 | 97 | def get_lr_func(lr_policy): 98 | """ 99 | Given the configs, retrieve the specified lr policy function. 100 | Args: 101 | lr_policy (string): the learning rate policy to use for the job. 
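# Standalone check of the scheduling logic above, without a cfg object: the cosine
# policy decays BASE_LR towards zero over MAX_EPOCH, and during warmup the learning
# rate is interpolated linearly from WARMUP_START_LR to the cosine value reached at
# WARMUP_EPOCHS. The hyper-parameters are arbitrary.
import math

base_lr, max_epoch = 0.1, 100.0
warmup_epochs, warmup_start_lr = 5.0, 0.01

def cosine(cur_epoch):
    return base_lr * (math.cos(math.pi * cur_epoch / max_epoch) + 1.0) * 0.5

assert abs(cosine(0.0) - base_lr) < 1e-9       # starts at BASE_LR
assert abs(cosine(max_epoch)) < 1e-9           # decays to zero
lr_end = cosine(warmup_epochs)                 # target of the warmup ramp
alpha = (lr_end - warmup_start_lr) / warmup_epochs
warmup_lr = 2.0 * alpha + warmup_start_lr      # lr at epoch 2, still warming up
assert warmup_start_lr < warmup_lr < lr_end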
102 | """ 103 | policy = "lr_func_" + lr_policy 104 | if policy not in globals(): 105 | raise NotImplementedError("Unknown LR policy: {}".format(lr_policy)) 106 | else: 107 | return globals()[policy] 108 | -------------------------------------------------------------------------------- /tadaconv/models/utils/model_ema.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/utils/model_ema.py 3 | # thanks for the nice implementation 4 | 5 | import torch 6 | import torch.nn as nn 7 | from copy import deepcopy 8 | 9 | class ModelEmaV2(nn.Module): 10 | """ Model Exponential Moving Average V2 11 | Keep a moving average of everything in the model state_dict (parameters and buffers). 12 | V2 of this module is simpler, it does not match params/buffers based on name but simply 13 | iterates in order. It works with torchscript (JIT of full model). 14 | This is intended to allow functionality like 15 | https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage 16 | A smoothed version of the weights is necessary for some training schemes to perform well. 17 | E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use 18 | RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA 19 | smoothing of weights to match results. Pay attention to the decay constant you are using 20 | relative to your update count per epoch. 21 | To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but 22 | disable validation of the EMA weights. Validation will have to be done manually in a separate 23 | process, or after the training stops converging. 24 | This class is sensitive where it is initialized in the sequence of model init, 25 | GPU assignment and distributed training wrappers. 26 | """ 27 | def __init__(self, model, decay=0.9999, device=None): 28 | super(ModelEmaV2, self).__init__() 29 | # make a copy of the model for accumulating moving average of weights 30 | self.module = deepcopy(model) 31 | self.module.eval() 32 | self.decay = decay 33 | self.device = device # perform ema on different device from model if set 34 | if self.device is not None: 35 | self.module.to(device=device) 36 | 37 | def _update(self, model, update_fn): 38 | with torch.no_grad(): 39 | for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): 40 | if self.device is not None: 41 | model_v = model_v.to(device=self.device) 42 | ema_v.copy_(update_fn(ema_v, model_v)) 43 | 44 | def update(self, model): 45 | self._update(model, update_fn=lambda e, m: self.decay * e + (1. - self.decay) * m) 46 | 47 | def set(self, model): 48 | self._update(model, update_fn=lambda e, m: m) -------------------------------------------------------------------------------- /tadaconv/models/utils/params.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Params. """ 5 | 6 | def update_3d_conv_params(cfg, conv, idx): 7 | """ 8 | Automatically decodes parameters for 3D convolution blocks according to the config and its index in the model. 9 | Args: 10 | cfg (Config): Config object that contains model parameters such as channel dimensions, whether to downsampling or not, etc. 11 | conv (BaseBranch): Branch whose parameters needs to be specified. 
12 | idx (list): List containing the index of the current block. ([stage_id, block_id]) 13 | """ 14 | # extract current block location 15 | stage_id, block_id = idx 16 | conv.stage_id = stage_id 17 | conv.block_id = block_id 18 | 19 | # extract basic info 20 | if block_id == 0: 21 | conv.dim_in = cfg.VIDEO.BACKBONE.NUM_FILTERS[stage_id-1] 22 | if hasattr(cfg.VIDEO.BACKBONE, "ADD_FUSION_CHANNEL") and cfg.VIDEO.BACKBONE.ADD_FUSION_CHANNEL: 23 | conv.dim_in = conv.dim_in * cfg.VIDEO.BACKBONE.SLOWFAST.CONV_CHANNEL_RATIO // cfg.VIDEO.BACKBONE.SLOWFAST.BETA + conv.dim_in 24 | conv.downsampling = cfg.VIDEO.BACKBONE.DOWNSAMPLING[stage_id] 25 | conv.downsampling_temporal = cfg.VIDEO.BACKBONE.DOWNSAMPLING_TEMPORAL[stage_id] 26 | else: 27 | conv.downsampling = False 28 | conv.dim_in = cfg.VIDEO.BACKBONE.NUM_FILTERS[stage_id] 29 | conv.num_filters = cfg.VIDEO.BACKBONE.NUM_FILTERS[stage_id] 30 | conv.bn_mmt = cfg.BN.MOMENTUM 31 | conv.bn_eps = cfg.BN.EPS 32 | conv.kernel_size = cfg.VIDEO.BACKBONE.KERNEL_SIZE[stage_id] 33 | conv.expansion_ratio = cfg.VIDEO.BACKBONE.EXPANSION_RATIO if hasattr(cfg.VIDEO.BACKBONE, "EXPANSION_RATIO") else None 34 | 35 | # configure downsampling 36 | if conv.downsampling: 37 | if conv.downsampling_temporal: 38 | conv.stride = [2, 2, 2] 39 | else: 40 | conv.stride = [1, 2, 2] 41 | else: 42 | conv.stride = [1, 1, 1] 43 | 44 | # define transformation 45 | if isinstance(cfg.VIDEO.BACKBONE.DEPTH, str): 46 | conv.transformation = 'bottleneck' 47 | else: 48 | if cfg.VIDEO.BACKBONE.DEPTH <= 34: 49 | conv.transformation = 'simple_block' 50 | else: 51 | conv.transformation = 'bottleneck' 52 | 53 | # calculate the input size 54 | num_downsampling_spatial = sum( 55 | cfg.VIDEO.BACKBONE.DOWNSAMPLING[:stage_id+(block_id>0)] 56 | ) 57 | if 'DownSample' in cfg.VIDEO.BACKBONE.STEM.NAME: 58 | num_downsampling_spatial += 1 59 | num_downsampling_temporal = sum( 60 | cfg.VIDEO.BACKBONE.DOWNSAMPLING_TEMPORAL[:stage_id+(block_id>0)] 61 | ) 62 | conv.h = cfg.DATA.TRAIN_CROP_SIZE // 2**num_downsampling_spatial \ 63 | + (cfg.DATA.TRAIN_CROP_SIZE//2**(num_downsampling_spatial-1))%2 64 | conv.w = conv.h 65 | conv.t = cfg.DATA.NUM_INPUT_FRAMES // 2**num_downsampling_temporal -------------------------------------------------------------------------------- /tadaconv/sslgenerators/__init__.py: -------------------------------------------------------------------------------- 1 | from .mosi.mosi_generator import MoSIGenerator -------------------------------------------------------------------------------- /tadaconv/sslgenerators/builder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Builder for self-supervised generator.""" 5 | 6 | from tadaconv.utils.registry import Registry 7 | 8 | SSL_GENERATOR_REGISTRY = Registry("SSL_Methods") 9 | 10 | def build_ssl_generator(cfg, split): 11 | """ 12 | Entry point to registered self-supervised learning methods. 13 | Returns transformed frames and the self-supervised label. 14 | Args: 15 | split (str): training, validation or test. 
16 | """ 17 | ssl_generator = SSL_GENERATOR_REGISTRY.get(cfg.PRETRAIN.GENERATOR)(cfg, split) 18 | return ssl_generator 19 | -------------------------------------------------------------------------------- /tadaconv/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alibaba-mmai-research/TAdaConv/75b7839b37fc94d98d4fe5f2aff4b3df4e347dfb/tadaconv/utils/__init__.py -------------------------------------------------------------------------------- /tadaconv/utils/bboxes_1d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def ioa_with_anchors(anchors_min, anchors_max, box_min, box_max): 5 | """ 6 | calculate the overlap proportion between the anchor and all bbox for supervise signal, 7 | Args: 8 | anchors_min (np.ndarry): 1d anchors start position, shape is N. 9 | anchors_max (np.ndarry): 1d anchors end position, shape: N. 10 | box_min (np.ndarry): 1d boxes start position, shape: N. 11 | box_max (np.ndarry): 1d boxes end position, shape: N. 12 | Returns: 13 | scores: (np.ndarry) 14 | """ 15 | len_anchors = anchors_max - anchors_min 16 | int_xmin = np.maximum(anchors_min, box_min) 17 | int_xmax = np.minimum(anchors_max, box_max) 18 | inter_len = np.maximum(int_xmax - int_xmin, 0.) 19 | scores = np.divide(inter_len, len_anchors) 20 | return scores 21 | 22 | 23 | def iou_with_anchors(anchors_min, anchors_max, box_min, box_max): 24 | """ 25 | Compute jaccard score between a box and the anchors. 26 | Args: 27 | anchors_min (np.ndarry): 1d anchors start position, shape is N. 28 | anchors_max (np.ndarry): 1d anchors end position, shape: N. 29 | box_min (np.ndarry): 1d boxes start position, shape: N. 30 | box_max (np.ndarry): 1d boxes end position, shape: N. 31 | Returns: 32 | jaccard: (np.ndarry) 33 | """ 34 | len_anchors = anchors_max - anchors_min 35 | int_xmin = np.maximum(anchors_min, box_min) 36 | int_xmax = np.minimum(anchors_max, box_max) 37 | inter_len = np.maximum(int_xmax - int_xmin, 0.) 38 | union_len = len_anchors - inter_len + box_max - box_min 39 | # print inter_len,union_len 40 | jaccard = np.divide(inter_len, union_len) 41 | return jaccard -------------------------------------------------------------------------------- /tadaconv/utils/eval_tal/eval_tal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | import sys 5 | from .eval_epic_detection import Epicdetection 6 | from tadaconv.utils import logging 7 | import numpy as np 8 | import json 9 | logger = logging.get_logger(__name__) 10 | 11 | 12 | def evaluate_detection(video_anno, detection_result_file, tiou_thresholds=np.linspace(0.5, 0.95, 10)): 13 | """ 14 | Evaluate action detection performance. 15 | Args: 16 | video_anno (str): Annotation file path. 17 | detection_result_file (str): The detection results output by your model. 18 | tiou_thresholds (np.array): Iou thresholds to be tested. 19 | """ 20 | detection = Epicdetection(video_anno, detection_result_file, 21 | tiou_thresholds=tiou_thresholds, 22 | subset='validation', verbose=True, check_status=False) 23 | detection.evaluate() 24 | -------------------------------------------------------------------------------- /tadaconv/utils/launcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ Task launcher. 
""" 4 | 5 | import os 6 | import torch 7 | from tadaconv.utils.misc import get_num_gpus 8 | 9 | def launch_task(cfg, init_method, func): 10 | """ 11 | Launches the task "func" on one or multiple devices. 12 | Args: 13 | cfg (Config): global config object. 14 | init_method (str): initialization method to launch the job with multiple 15 | devices. 16 | func (function): task to run. 17 | """ 18 | torch.cuda.empty_cache() 19 | if get_num_gpus(cfg) > 1: 20 | if cfg.PAI: 21 | # if using the PAI cluster, get info from the environment 22 | cfg.SHARD_ID = int(os.environ['RANK']) 23 | if "VISIBLE_DEVICE_LIST" in os.environ: 24 | cfg.NUM_GPUS = len(os.environ["VISIBLE_DEVICE_LIST"].split(",")) 25 | else: 26 | cfg.NUM_GPUS = torch.cuda.device_count() 27 | cfg.NUM_SHARDS = int(os.environ['WORLD_SIZE']) 28 | 29 | torch.multiprocessing.spawn( 30 | run, 31 | nprocs=cfg.NUM_GPUS, 32 | args=(func, init_method, cfg), 33 | daemon=False, 34 | ) 35 | else: 36 | func(cfg=cfg) 37 | 38 | def run( 39 | local_rank, func, init_method, cfg 40 | ): 41 | """ 42 | Runs a function from a child process. 43 | Args: 44 | local_rank (int): rank of the current process on the current machine. 45 | func (function): function to execute on each of the process. 46 | init_method (string): method to initialize the distributed training. 47 | cfg (Config): global config object. 48 | """ 49 | 50 | num_proc = cfg.NUM_GPUS # number of nodes per machine 51 | shard_id = cfg.SHARD_ID 52 | num_shards = cfg.NUM_SHARDS # number of machines 53 | backend = cfg.DIST_BACKEND # distribued backends ('nccl', 'gloo' or 'mpi') 54 | 55 | world_size = num_proc * num_shards 56 | rank = shard_id * num_proc + local_rank 57 | cfg.LOCAL_RANK = rank 58 | 59 | # dump machine info 60 | print("num_proc (NUM_GPU): {}".format(num_proc)) 61 | print("shard_id (os.environ['RANK']): {}".format(shard_id)) 62 | print("num_shards (os.environ['WORLD_SIZE']): {}".format(num_shards)) 63 | print("rank: {}".format(rank)) 64 | print("local_rank (GPU_ID): {}".format(local_rank)) 65 | 66 | try: 67 | if cfg.PAI == False: 68 | torch.distributed.init_process_group( 69 | backend=backend, 70 | init_method=init_method, 71 | world_size=world_size, 72 | rank=rank, 73 | ) 74 | else: 75 | torch.distributed.init_process_group( 76 | backend=backend, 77 | world_size=world_size, 78 | rank=rank, 79 | ) 80 | except Exception as e: 81 | raise e 82 | 83 | if "VISIBLE_DEVICE_LIST" in os.environ: 84 | torch.cuda.set_device(int(os.environ["VISIBLE_DEVICE_LIST"])) 85 | else: 86 | torch.cuda.set_device(f'cuda:{local_rank}') 87 | os.system(f"CUDA_VISIBLE_DEVICES={local_rank}") 88 | func(cfg) 89 | -------------------------------------------------------------------------------- /tadaconv/utils/logging.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ 5 | Logging. 6 | Modified from https://github.com/facebookresearch/SlowFast/blob/master/slowfast/utils/logging.py. 7 | Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 8 | """ 9 | 10 | import builtins 11 | import decimal 12 | import functools 13 | import logging 14 | import os 15 | import sys 16 | import simplejson 17 | 18 | import tadaconv.utils.distributed as du 19 | 20 | 21 | def _suppress_print(): 22 | """ 23 | Suppresses printing from the current process. 
24 | """ 25 | 26 | def print_pass(*objects, sep=" ", end="\n", file=sys.stdout, flush=False): 27 | pass 28 | 29 | builtins.print = print_pass 30 | 31 | 32 | def setup_logging(cfg, log_file): 33 | """ 34 | Sets up the logging for multiple processes. Only enable the logging for the 35 | master process, and suppress logging for the non-master processes. 36 | """ 37 | if du.is_master_proc(du.get_world_size()): 38 | # Enable logging for the master process. 39 | logging.root.handlers = [] 40 | else: 41 | # Suppress logging for non-master processes. 42 | _suppress_print() 43 | return 44 | 45 | logger = logging.getLogger() 46 | logger.setLevel(logging.INFO) 47 | logger.propagate = False 48 | plain_formatter = logging.Formatter( 49 | "[%(asctime)s][%(levelname)s] %(name)s: %(lineno)4d: %(message)s", 50 | datefmt="%m/%d %H:%M:%S", 51 | ) 52 | 53 | if du.is_master_proc(du.get_world_size()): 54 | ch = logging.StreamHandler(stream=sys.stdout) 55 | ch.setLevel(logging.DEBUG) 56 | ch.setFormatter(plain_formatter) 57 | logger.addHandler(ch) 58 | 59 | if log_file is not None and du.is_master_proc(du.get_world_size()): 60 | filename = os.path.join(cfg.OUTPUT_DIR, log_file) 61 | fh = logging.FileHandler(filename) 62 | fh.setLevel(logging.DEBUG) 63 | fh.setFormatter(plain_formatter) 64 | logger.addHandler(fh) 65 | 66 | 67 | def get_logger(name): 68 | """ 69 | Retrieve the logger with the specified name or, if name is None, return a 70 | logger which is the root logger of the hierarchy. 71 | Args: 72 | name (string): name of the logger. 73 | """ 74 | return logging.getLogger(name) 75 | 76 | 77 | def log_json_stats(stats): 78 | """ 79 | Logs json stats. 80 | Args: 81 | stats (dict): a dictionary of statistical information to log. 82 | """ 83 | stats = { 84 | k: decimal.Decimal("{:.6f}".format(v)) if isinstance(v, float) else v 85 | for k, v in stats.items() 86 | } 87 | json_stats = simplejson.dumps(stats, sort_keys=True, use_decimal=True) 88 | logger = get_logger(__name__) 89 | logger.info("{:s}".format(json_stats)) 90 | -------------------------------------------------------------------------------- /tadaconv/utils/registry.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Registry class. """ 5 | 6 | class Registry(object): 7 | """ 8 | The Registry class provides a registry for all things 9 | To initialize: 10 | REGISTRY = Registry() 11 | 12 | To register a tracker: 13 | @REGISTRY.register() 14 | class Model(): 15 | ... 16 | """ 17 | 18 | def __init__(self, table_name=""): 19 | """ 20 | Initializes the registry. 21 | Args: 22 | table_name (str): specifies the name of the registry 23 | """ 24 | self._entry_map = {} 25 | self.table_name = table_name 26 | 27 | 28 | def _register(self, name, entry): 29 | """ 30 | Registers the instance. 31 | Args: 32 | name (str): name of the entry 33 | entry (): instance of the entry, could be any type 34 | """ 35 | assert type(name) is str 36 | assert (name not in self._entry_map.keys()), "{} {} already registered.".format( 37 | self.table_name, name 38 | ) 39 | self._entry_map[name] = entry 40 | 41 | def register(self): 42 | """ 43 | Wrapper function for registering a module. 44 | """ 45 | def reg(obj): 46 | name = obj.__name__ 47 | self._register(name, obj) 48 | return obj 49 | return reg 50 | 51 | def get(self, name): 52 | """ 53 | Returns the instance specified by the name. 54 | Args: 55 | name (str): name of the specified instance. 
56 | """ 57 | if name not in self._entry_map.keys(): 58 | return None 59 | obj = self._entry_map.get(name) 60 | return obj 61 | 62 | def get_all_registered(self): 63 | """ 64 | Prints all registered class. 65 | """ 66 | return self._entry_map.keys() -------------------------------------------------------------------------------- /tadaconv/utils/sampler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Multi-fold distributed sampler.""" 5 | 6 | import math 7 | import torch 8 | import torch.distributed as dist 9 | from torch.utils.data.sampler import Sampler 10 | 11 | 12 | class MultiFoldDistributedSampler(Sampler): 13 | """Modified from DistributedSampler, which performs multi fold training for 14 | accelerating distributed training with large batches. 15 | 16 | Sampler that restricts data loading to a subset of the dataset. 17 | 18 | It is especially useful in conjunction with 19 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 20 | process can pass a DistributedSampler instance as a DataLoader sampler, 21 | and load a subset of the original dataset that is exclusive to it. 22 | 23 | .. note:: 24 | Dataset is assumed to be of constant size. 25 | 26 | Arguments: 27 | dataset: Dataset used for sampling. 28 | num_replicas (optional): Number of processes participating in 29 | distributed training. 30 | rank (optional): Rank of the current process within num_replicas. 31 | shuffle (optional): If true (default), sampler will shuffle the indices 32 | 33 | .. warning:: 34 | In distributed mode, calling the ``set_epoch`` method is needed to 35 | make shuffling work; each process will use the same random seed 36 | otherwise. 37 | 38 | Example:: 39 | 40 | >>> sampler = DistributedSampler(dataset) if is_distributed else None 41 | >>> loader = DataLoader(dataset, shuffle=(sampler is None), 42 | ... sampler=sampler) 43 | >>> for epoch in range(start_epoch, n_epochs): 44 | ... if is_distributed: 45 | """ 46 | 47 | def __init__(self, dataset, num_folds=1, num_replicas=None, rank=None, shuffle=True): 48 | """ 49 | When num_folds = 1, MultiFoldDistributedSampler degenerates to DistributedSampler. 
50 | """ 51 | if num_replicas is None: 52 | if not dist.is_available(): 53 | raise RuntimeError("Requires distributed package to be available") 54 | num_replicas = dist.get_world_size() 55 | if rank is None: 56 | if not dist.is_available(): 57 | raise RuntimeError("Requires distributed package to be available") 58 | rank = dist.get_rank() 59 | self.dataset = dataset 60 | self.num_folds = num_folds 61 | self.num_replicas = num_replicas 62 | self.rank = rank 63 | self.epoch = 0 64 | self.num_samples = int(math.ceil(len(self.dataset) * self.num_folds * 1.0 / self.num_replicas)) 65 | self.total_size = self.num_samples * self.num_replicas 66 | self.shuffle = shuffle 67 | 68 | def __iter__(self): 69 | # deterministically shuffle based on epoch 70 | indices = [] 71 | for fold_idx in range(self.num_folds): 72 | g = torch.Generator() 73 | g.manual_seed(self.epoch+fold_idx) 74 | if self.shuffle: 75 | indices += torch.randperm(len(self.dataset), generator=g).tolist() 76 | else: 77 | indices += list(range(len(self.dataset))) 78 | 79 | 80 | # add extra samples to make it evenly divisible 81 | indices += indices[:(self.total_size - len(indices))] 82 | assert len(indices) == self.total_size 83 | 84 | # subsample 85 | indices = indices[self.rank:self.total_size:self.num_replicas] 86 | assert len(indices) == self.num_samples 87 | 88 | return iter(indices) 89 | 90 | def __len__(self): 91 | return self.num_samples 92 | 93 | def set_epoch(self, epoch): 94 | self.epoch = epoch -------------------------------------------------------------------------------- /tadaconv/utils/tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def tensor2cuda(data): 5 | """ 6 | Put Tensor in iterable data into gpu. 7 | Args: 8 | data :(tensor or list or dict) 9 | """ 10 | if type(data) == torch.Tensor: 11 | return data.cuda(non_blocking=True) 12 | elif type(data) == dict: 13 | keys = list(data.keys()) 14 | for k in keys: 15 | data[k] = tensor2cuda(data[k]) 16 | elif type(data) == list: 17 | for i in range(len(data)): 18 | data[i] = tensor2cuda(data[i]) 19 | return data 20 | -------------------------------------------------------------------------------- /tadaconv/utils/timer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Timer class. """ 5 | 6 | from time import perf_counter 7 | from typing import Optional 8 | 9 | 10 | class Timer: 11 | """ 12 | A timer which computes the time elapsed since the start/reset of the timer. 13 | """ 14 | 15 | def __init__(self) -> None: 16 | self.reset() 17 | 18 | def reset(self) -> None: 19 | """ 20 | Reset the timer. 21 | """ 22 | self._start = perf_counter() 23 | self._paused: Optional[float] = None 24 | self._total_paused = 0 25 | self._count_start = 1 26 | 27 | def pause(self) -> None: 28 | """ 29 | Pause the timer. 30 | """ 31 | if self._paused is not None: 32 | raise ValueError("Trying to pause a Timer that is already paused!") 33 | self._paused = perf_counter() 34 | 35 | def is_paused(self) -> bool: 36 | """ 37 | Returns: 38 | bool: whether the timer is currently paused 39 | """ 40 | return self._paused is not None 41 | 42 | def resume(self) -> None: 43 | """ 44 | Resume the timer. 
45 | """ 46 | if self._paused is None: 47 | raise ValueError("Trying to resume a Timer that is not paused!") 48 | self._total_paused += perf_counter() - self._paused # pyre-ignore 49 | self._paused = None 50 | self._count_start += 1 51 | 52 | def seconds(self) -> float: 53 | """ 54 | Returns: 55 | (float): the total number of seconds since the start/reset of the 56 | timer, excluding the time when the timer is paused. 57 | """ 58 | if self._paused is not None: 59 | end_time: float = self._paused # type: ignore 60 | else: 61 | end_time = perf_counter() 62 | return end_time - self._start - self._total_paused 63 | 64 | def avg_seconds(self) -> float: 65 | """ 66 | Returns: 67 | (float): the average number of seconds between every start/reset and 68 | pause. 69 | """ 70 | return self.seconds() / self._count_start -------------------------------------------------------------------------------- /tadaconv/utils/val_dist_sampler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Multi-fold distributed sampler.""" 5 | 6 | import math 7 | import torch 8 | import tadaconv.utils.distributed as dist 9 | from torch.utils.data.distributed import DistributedSampler 10 | 11 | import tadaconv.utils.logging as logging 12 | logger = logging.get_logger(__name__) 13 | 14 | 15 | class MultiSegValDistributedSampler(DistributedSampler): 16 | """Modified from DistributedSampler, which performs multi fold training for 17 | accelerating distributed training with large batches. 18 | 19 | Sampler that restricts data loading to a subset of the dataset. 20 | 21 | It is especially useful in conjunction with 22 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 23 | process can pass a DistributedSampler instance as a DataLoader sampler, 24 | and load a subset of the original dataset that is exclusive to it. 25 | 26 | .. note:: 27 | Dataset is assumed to be of constant size. 28 | 29 | Arguments: 30 | dataset: Dataset used for sampling. 31 | num_replicas (optional): Number of processes participating in 32 | distributed training. 33 | rank (optional): Rank of the current process within num_replicas. 34 | shuffle (optional): If true (default), sampler will shuffle the indices 35 | 36 | .. warning:: 37 | In distributed mode, calling the ``set_epoch`` method is needed to 38 | make shuffling work; each process will use the same random seed 39 | otherwise. 40 | 41 | Example:: 42 | 43 | >>> sampler = MultiSegValDistributedSampler(dataset) if is_distributed else None 44 | >>> loader = DataLoader(dataset, shuffle=(sampler is None), 45 | ... sampler=sampler) 46 | >>> for epoch in range(start_epoch, n_epochs): 47 | ... if is_distributed: 48 | """ 49 | 50 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 51 | """ 52 | We divide each video in epic dataset into multiple sliding windows. 53 | Each sliding window is a sample in validation process for efficient. 54 | This function will assign the sliding windows which belong to the same video to a same gpu. 
55 | """ 56 | if num_replicas is None: 57 | num_replicas = dist.get_world_size() 58 | if rank is None: 59 | rank = dist.get_rank() 60 | self.dataset = dataset 61 | self.num_replicas = num_replicas 62 | self.rank = rank 63 | self.epoch = 0 64 | assert shuffle is False 65 | self.shuffle = shuffle 66 | vid_name_dict = {} 67 | self.vid_name_list = [] 68 | self.vid_num_list = [] 69 | for s in dataset._samples: 70 | if s[0] not in vid_name_dict: 71 | vid_name_dict[s[0]] = 0 72 | self.vid_name_list += [s[0]] 73 | self.vid_num_list += [0] 74 | self.vid_num_list[-1] += 1 75 | self.num_samples = int(math.ceil(len(self.vid_name_list) * 1.0 / self.num_replicas)) 76 | self.total_size = self.num_samples * self.num_replicas 77 | self.__init_dist__() 78 | 79 | def __init_dist__(self): 80 | indices = list(range(len(self.vid_name_list))) 81 | # add extra samples to make it evenly divisible 82 | indices += indices[:(self.total_size - len(indices))] 83 | assert len(indices) == self.total_size 84 | 85 | # subsample 86 | indices = indices[self.rank:self.total_size:self.num_replicas] 87 | assert len(indices) == self.num_samples 88 | self.true_indices = [] 89 | for ind in indices: 90 | if ind == 0: 91 | exist_num = 0 92 | else: 93 | exist_num = sum(self.vid_num_list[:ind]) 94 | self.true_indices.extend(list(range(exist_num, exist_num+self.vid_num_list[ind]))) 95 | 96 | def __iter__(self): 97 | return iter(self.true_indices) 98 | 99 | def __len__(self): 100 | return len(self.true_indices) 101 | 102 | def set_epoch(self, epoch): 103 | self.epoch = epoch 104 | --------------------------------------------------------------------------------