├── FEATURE_ZOO.md ├── GUIDELINES.md ├── LICENSE ├── MODEL_ZOO.md ├── README.md ├── configs ├── pool │ ├── backbone │ │ ├── csn.yaml │ │ ├── localization-conv.yaml │ │ ├── r2d3ds.yaml │ │ ├── r2p1d.yaml │ │ ├── s3dg.yaml │ │ ├── slowfast_4x16.yaml │ │ ├── slowfast_8x8.yaml │ │ ├── tada2d.yaml │ │ ├── tadaconvnextv2_base.yaml │ │ ├── tadaconvnextv2_small.yaml │ │ ├── tadaconvnextv2_tiny.yaml │ │ ├── tadaformer_b16.yaml │ │ ├── tadaformer_l14.yaml │ │ ├── timesformer.yaml │ │ ├── vivit.yaml │ │ └── vivit_fac_enc.yaml │ ├── base.yaml │ └── run │ │ └── training │ │ ├── finetune.yaml │ │ ├── from_scratch.yaml │ │ ├── from_scratch_large.yaml │ │ ├── localization.yaml │ │ └── mosi.yaml └── projects │ ├── epic-kitchen-ar │ ├── csn_ek100.yaml │ ├── csn_ek100_submission.yaml │ ├── ek100 │ │ ├── csn.yaml │ │ ├── csn_submit.yaml │ │ ├── csn_test.yaml │ │ ├── vivit_fac_enc.yaml │ │ ├── vivit_fac_enc_submit.yaml │ │ └── vivit_fac_enc_test.yaml │ ├── k400 │ │ ├── vivit_fac_enc_b16x2.yaml │ │ └── vivit_fac_enc_b16x2_test.yaml │ ├── vivit_fac_enc_ek100.yaml │ ├── vivit_fac_enc_ek100_submission.yaml │ └── vivit_fac_enc_k400.yaml │ ├── epic-kitchen-tal │ ├── bmn-epic │ │ └── vivit-os-local.yaml │ └── bmn_epic.yaml │ ├── mosi │ ├── baselines │ │ ├── r2d3ds_hmdb.yaml │ │ ├── r2d3ds_ucf.yaml │ │ ├── r2p1d_hmdb.yaml │ │ └── r2p1d_ucf.yaml │ ├── ft-hmdb │ │ ├── r2d3ds.yaml │ │ ├── r2d3ds_test.yaml │ │ ├── r2p1d.yaml │ │ └── r2p1d_test.yaml │ ├── ft-ucf │ │ ├── r2d3ds.yaml │ │ ├── r2d3ds_test.yaml │ │ ├── r2p1d.yaml │ │ └── r2p1d_test.yaml │ ├── ft_r2d3ds_hmdb.yaml │ ├── ft_r2d3ds_ucf.yaml │ ├── ft_r2p1d_hmdb.yaml │ ├── ft_r2p1d_ucf.yaml │ ├── mosi_r2d3ds_hmdb.yaml │ ├── mosi_r2d3ds_imagenet.yaml │ ├── mosi_r2d3ds_ucf.yaml │ ├── mosi_r2p1d_hmdb.yaml │ ├── mosi_r2p1d_ucf.yaml │ ├── pt-hmdb │ │ ├── r2d3ds.yaml │ │ └── r2p1d.yaml │ ├── pt-imagenet │ │ └── r2d3ds.yaml │ └── pt-ucf │ │ ├── r2d3ds.yaml │ │ └── r2p1d.yaml │ ├── tada │ ├── k400 │ │ ├── tada2d_16x5.yaml │ │ └── tada2d_8x8.yaml │ ├── ssv2 │ │ ├── tada2d_16f.yaml │ │ └── tada2d_8f.yaml │ ├── tada2d_k400.yaml │ └── tada2d_ssv2.yaml │ ├── tadaconvnextv2 │ ├── tadaconvnextv2_base_k400_16f.yaml │ ├── tadaconvnextv2_base_ssv2_16f.yaml │ ├── tadaconvnextv2_small_k400_16f.yaml │ ├── tadaconvnextv2_small_ssv2_16f.yaml │ ├── tadaconvnextv2_tiny_k400_16f.yaml │ └── tadaconvnextv2_tiny_ssv2_16f.yaml │ └── tadaformer │ ├── tadaformer_b16_k400_16f.yaml │ ├── tadaformer_b16_ssv2_16f.yaml │ ├── tadaformer_l14_k400_16f.yaml │ └── tadaformer_l14_ssv2_16f.yaml ├── projects ├── epic-kitchen-ar │ └── README.md ├── epic-kitchen-tal │ └── README.md ├── mosi │ ├── MoSI.png │ └── README.md ├── tada │ ├── README.md │ └── TAda2D.png └── tadaconvv2 │ ├── README.md │ └── TAdaConvV2.png ├── runs ├── run.py ├── submission_test.py ├── test.py ├── test_epic_localization.py └── train.py └── tadaconv ├── datasets ├── __init__.py ├── base │ ├── __init__.py │ ├── base_dataset.py │ ├── builder.py │ ├── epickitchen100.py │ ├── epickitchen100_feature.py │ ├── hmdb51.py │ ├── imagenet.py │ ├── kinetics400.py │ ├── ssv2.py │ └── ucf101.py └── utils │ ├── __init__.py │ ├── auto_augment.py │ ├── collate_functions.py │ ├── mixup.py │ ├── preprocess_ssv2.py │ ├── random_erasing.py │ └── transformations.py ├── models ├── __init__.py ├── base │ ├── __init__.py │ ├── backbone.py │ ├── base_blocks.py │ ├── builder.py │ ├── models.py │ ├── slowfast.py │ └── transformer.py ├── module_zoo │ ├── __init__.py │ ├── branches │ │ ├── __init__.py │ │ ├── csn_branch.py │ │ ├── non_local.py │ │ ├── r2d3d_branch.py │ 
│ ├── r2plus1d_branch.py │ │ ├── s3dg_branch.py │ │ ├── slowfast_branch.py │ │ ├── tada_branch.py │ │ ├── tadaconvnextv2.py │ │ └── tadaformer.py │ ├── heads │ │ ├── __init__.py │ │ ├── bmn_head.py │ │ ├── mosi_head.py │ │ ├── slowfast_head.py │ │ └── transformer_head.py │ ├── ops │ │ ├── __init__.py │ │ ├── misc.py │ │ ├── tadaconv.py │ │ └── tadaconv_v2.py │ └── stems │ │ ├── __init__.py │ │ ├── downsample_stem.py │ │ ├── embedding_stem.py │ │ └── r2plus1d_stem.py └── utils │ ├── init_helper.py │ ├── lars.py │ ├── localization_losses.py │ ├── losses.py │ ├── lr_policy.py │ ├── model_ema.py │ ├── optimizer.py │ └── params.py ├── sslgenerators ├── __init__.py ├── builder.py └── mosi │ └── mosi_generator.py └── utils ├── __init__.py ├── bboxes_1d.py ├── bucket.py ├── checkpoint.py ├── config.py ├── distributed.py ├── eval_tal ├── eval_epic_detection.py └── eval_tal.py ├── launcher.py ├── logging.py ├── meters.py ├── metrics.py ├── misc.py ├── registry.py ├── sampler.py ├── tal_tools.py ├── tensor.py ├── timer.py └── val_dist_sampler.py /FEATURE_ZOO.md: -------------------------------------------------------------------------------- 1 | # FEATURE ZOO 2 | 3 | Here, we provide strong features for temporal action localization on HACS and Epic-Kitchens-100. 4 | 5 | | dataset | model | resolution | features | classification | average mAP | 6 | | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | 7 | | EK100 | TAda2D | 8 x 8 | [features:code dc05](https://pan.baidu.com/s/1YS9yj_O21HedIxyh2PMrqw) | [classification:code 2j51](https://pan.baidu.com/s/1z7h7OAFR2UO_Q7t8dA6YbQ) | 13.18 (A) | 8 | | HACS | TAda2D | 8 x 8 | [features:code 23kv](https://pan.baidu.com/s/1FHkRFvJldtEmD8kzYw_yMQ) | - | 32.3 | 9 | | EK100 | ViViT Fact. Enc.-B16x2 | 32 x 2 | coming soon | coming soon | 18.30 (A) | 10 | 11 | Annotations used for temporal action localization with our codebase can be found [here:code r30w](https://pan.baidu.com/s/16CtY0zTIzgDpm7sjhCAA6w). 12 | 13 | Pre-trained localization models using these features can be found in [MODEL_ZOO.md](MODEL_ZOO.md). 14 | 15 | ## Guideline 16 | 17 | ### Feature preparation 18 | After downloading the compressed feature files, first extract the `.pkl` files as follows. For example, for the TAda2D HACS features: 19 | 20 | ```bash 21 | cat features_s16_fps30_val_2G.tar.gz?? | tar zx 22 | cat features_s16_fps30_train_2G.tar.gz?? | tar zx 23 | ``` 24 | 25 | After running the above commands, you should have two folders named `features_s16_fps30_train` and `features_s16_fps30_val`, each containing a number of `.pkl` files. Each `.pkl` file corresponds to one video. 26 | 27 | ### Feature loading 28 | To load the features, please use the `load_feature` function in `datasets/base/epickitchen100_feature.py`: 29 | 30 | ```python 31 | def load_feature(path): 32 | if type(path) is str: # local file path 33 | with open(path, 'rb') as f: 34 | data = torch.load(f) 35 | else: # file-like object (e.g. an in-memory buffer) 36 | data = torch.load(path) 37 | return data 38 | ``` 39 | 40 | ### Feature concatenation 41 | For **Epic-Kitchens-100**, we divide each video into multiple clips, each 5 seconds long. To perform action localization, features are first concatenated using the `_transform_feature_scale` function in `datasets/base/epickitchen100_feature.py`. For example, during training, an action segment `[8.5, 16.1]` requires the features of three clips: `[[5.0, 10.0], [10.0, 15.0], [15.0, 20.0]]`. From these clip features, we obtain the feature for the ground-truth action segment.
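The snippet below is a minimal sketch of this mapping, not the actual `_transform_feature_scale` implementation: the helper names `clips_covering` and `concat_clip_features`, as well as the per-clip `[T, C]` feature layout, are illustrative assumptions.

```python
import math
import torch

CLIP_LEN = 5.0  # each Epic-Kitchens-100 clip covers 5 seconds

def clips_covering(segment_start, segment_end, clip_len=CLIP_LEN):
    """Indices of the fixed-length clips that cover a time segment.

    For the segment [8.5, 16.1] this returns [1, 2, 3], i.e. the clips
    spanning [5.0, 10.0], [10.0, 15.0] and [15.0, 20.0].
    """
    first = int(math.floor(segment_start / clip_len))
    last = int(math.ceil(segment_end / clip_len))
    return list(range(first, last))

def concat_clip_features(clip_features, segment_start, segment_end):
    """Concatenate the covering per-clip features (assumed [T, C] tensors) along time."""
    indices = clips_covering(segment_start, segment_end)
    return torch.cat([clip_features[i] for i in indices], dim=0)
```

The concatenated feature then spans `[5.0, 20.0]`, from which the portion corresponding to the ground-truth segment is taken.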
For more details, please refer to [epickitchen100_feature.py](datasets/base/epickitchen100_feature.py). -------------------------------------------------------------------------------- /GUIDELINES.md: -------------------------------------------------------------------------------- 1 | # Guidelines for pytorch-video-understanding 2 | 3 | ## Installation 4 | 5 | Requirements: 6 | - Python>=3.6 7 | - torch>=1.5 8 | - torchvision (version corresponding with torch) 9 | - simplejson==3.11.1 10 | - decord>=0.6.0 11 | - pyyaml 12 | - einops 13 | - oss2 14 | - psutil 15 | - tqdm 16 | - pandas 17 | 18 | Optional requirements: 19 | - fvcore (for flops calculation) 20 | 21 | ## Data preparation 22 | 23 | For all datasets available in `datasets/base`, the name of each dataset's list file is specified in the `_get_dataset_list_name` function. 24 | Here we provide a table summarizing the names and formats of all the dataset lists. 25 | 26 | | dataset | split | list file name | format | 27 | | ------- | ----- | -------------- | ------ | 28 | | epic-kitchens-100 | train | EPIC_100_train.csv | as downloaded | 29 | | epic-kitchens-100 | val | EPIC_100_validation.csv | as downloaded | 30 | | epic-kitchens-100 | test | EPIC_100_test_timestamps.csv | as downloaded | 31 | | hmdb51 | train/val | hmdb51_train_list.txt/hmdb51_val_list.txt | "video_path, supervised_label" | 32 | | imagenet | train/val | imagenet_train.txt/imagenet_val.txt | "image_path, supervised_label" | 33 | | kinetics 400 | train/val | kinetics400_train_list.txt/kinetics400_val_list.txt | "video_path, supervised_label" | 34 | | ssv2 | train | something-something-v2-train-with-label.json | json file with "label_idx" specifying the class and "id" specifying the name | 35 | | ssv2 | val | something-something-v2-val-with-label.json | json file with "label_idx" specifying the class and "id" specifying the name | 36 | | ucf101 | train/val | ucf101_train_list.txt/ucf101_val_list.txt | "video_path, supervised_label" | 37 | 38 | For the epic-kitchens features, the list file name is specified in the respective configs in `configs/projects/epic-kitchen-tal`. 39 | 40 | ### Preprocessing Something-Something-V2 dataset 41 | 42 | We found that the video decoder we use, [decord](https://github.com/dmlc/decord), has difficulty decoding the original `.webm` files. So we provide a script for converting the `.webm` files in the original something-something-v2 dataset to `.mp4` files. To do this, simply run: 43 | 44 | ```bash 45 | python datasets/utils/preprocess_ssv2_annos.py --anno --anno_path path_to_your_annotation 46 | python datasets/utils/preprocess_ssv2_annos.py --data --data_path path_to_your_ssv2_videos --data_out_path path_to_put_output_videos 47 | ``` 48 | 49 | Remember to make sure the annotation files are organized as follows: 50 | ``` 51 | -- path_to_your_annotation 52 | -- something-something-v2-train.json 53 | -- something-something-v2-validation.json 54 | -- something-something-v2-labels.json 55 | ``` 56 | 57 | ## Running 58 | 59 | The entry file for all runs is `runs/run.py`. 60 | 61 | Before running, some settings need to be configured in the config file. 62 | The codebase is designed to be experiment-friendly for rapid development of new models and representation learning approaches, in that the config files are organized hierarchically. 63 | 64 | Taking TAda2D as an example, each experiment (such as TAda2D_8x8 on Kinetics 400: `configs/projects/tada/k400/tada2d_8x8.yaml`) inherits its config from the following hierarchy.
65 | ``` 66 | --- base config file [configs/pool/base.yaml] 67 | --- base run config [configs/pool/run/training/from_scratch_large.yaml] 68 | --- base backbone config [configs/pool/backbone/tada2d.yaml] 69 | --- base experiment config [configs/projects/tada/tada2d_k400.yaml] 70 | --- current experiment config [configs/projects/tada/k400/tada2d_8x8.yaml] 71 | ``` 72 | Generally, the base config file `configs/pool/base.yaml` contains all the possible keys used in this codebase, and a config lower in the hierarchy overwrites its base configs whenever the same key appears in both files. 73 | A good practice is to put the parameters shared by all experiments in the base experiment config, and the parameters that differ between experiments in the current experiment config. 74 | 75 | For an example run, open `configs/projects/tada/tada2d_k400.yaml`: 76 | A. Set `DATA.DATA_ROOT_DIR` and `DATA.ANNO_DIR` to point to the Kinetics 400 data and annotations. 77 | B. Set `NUM_GPUS` to the number of available GPUs. 78 | Then the codebase can be run by: 79 | ``` 80 | python runs/run.py --cfg configs/projects/tada/k400/tada2d_8x8.yaml 81 | ``` -------------------------------------------------------------------------------- /configs/pool/backbone/csn.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: irCSN 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 152 6 | META_ARCH: ResNet3D 7 | NUM_FILTERS: [64, 256, 512, 1024, 2048] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 2048 10 | KERNEL_SIZE: [ 11 | [3, 7, 7], 12 | [3, 3, 3], 13 | [3, 3, 3], 14 | [3, 3, 3], 15 | [3, 3, 3] 16 | ] 17 | DOWNSAMPLING: [true, false, true, true, true] 18 | DOWNSAMPLING_TEMPORAL: [false, false, true, true, true] 19 | NUM_STREAMS: 1 20 | EXPANSION_RATIO: 4 21 | BRANCH: 22 | NAME: CSNBranch 23 | STEM: 24 | NAME: DownSampleStem 25 | NONLOCAL: 26 | ENABLE: false 27 | STAGES: [5] 28 | MASK_ENABLE: false 29 | HEAD: 30 | NAME: BaseHead 31 | ACTIVATION: softmax 32 | DROPOUT_RATE: 0 33 | NUM_CLASSES: # !!! 34 | -------------------------------------------------------------------------------- /configs/pool/backbone/localization-conv.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: BaseVideoModel 3 | VIDEO: 4 | DIM1D: 256 5 | DIM2D: 128 6 | DIM3D: 512 7 | BACKBONE_LAYER: 2 8 | BACKBONE_GROUPS_NUM: 4 9 | BACKBONE: 10 | META_ARCH: SimpleLocalizationConv -------------------------------------------------------------------------------- /configs/pool/backbone/r2d3ds.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: R2D3D 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 18 6 | META_ARCH: ResNet3D 7 | NUM_FILTERS: [64, 64, 128, 256, 256] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 256 10 | KERNEL_SIZE: [ 11 | [1, 7, 7], 12 | [1, 3, 3], 13 | [1, 3, 3], 14 | [3, 3, 3], 15 | [3, 3, 3] 16 | ] 17 | DOWNSAMPLING: [true, false, true, true, true] 18 | DOWNSAMPLING_TEMPORAL: [false, false, false, true, true] 19 | NUM_STREAMS: 1 20 | EXPANSION_RATIO: 2 21 | BRANCH: 22 | NAME: R2D3DBranch 23 | STEM: 24 | NAME: DownSampleStem 25 | NONLOCAL: 26 | ENABLE: false 27 | STAGES: [5] 28 | MASK_ENABLE: false 29 | HEAD: 30 | NAME: BaseHead 31 | ACTIVATION: softmax 32 | DROPOUT_RATE: 0 33 | NUM_CLASSES: # !!!
34 | -------------------------------------------------------------------------------- /configs/pool/backbone/r2p1d.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: R2Plus1D 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 10 6 | META_ARCH: ResNet3D 7 | NUM_INPUT_CHANNELS: 3 8 | NUM_FILTERS: [64, 64, 128, 256, 512] 9 | NUM_OUT_FEATURES: 512 10 | KERNEL_SIZE: [ 11 | [3, 7, 7], 12 | [3, 3, 3], 13 | [3, 3, 3], 14 | [3, 3, 3], 15 | [3, 3, 3] 16 | ] 17 | DOWNSAMPLING: [true, false, true, true, true] 18 | DOWNSAMPLING_TEMPORAL: [false, false, true, true, true] 19 | NUM_STREAMS: 1 20 | EXPANSION_RATIO: 2 21 | BRANCH: 22 | NAME: R2Plus1DBranch 23 | STEM: 24 | NAME: R2Plus1DStem 25 | NONLOCAL: 26 | ENABLE: false 27 | STAGES: [5] 28 | MASK_ENABLE: false 29 | HEAD: 30 | NAME: BaseHead 31 | ACTIVATION: softmax 32 | DROPOUT_RATE: 0 33 | NUM_CLASSES: # !!! 34 | -------------------------------------------------------------------------------- /configs/pool/backbone/s3dg.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: S3DG 3 | VIDEO: 4 | BACKBONE: 5 | META_ARCH: Inception3D 6 | NUM_OUT_FEATURES: 1024 7 | NUM_STREAMS: 1 8 | BRANCH: 9 | NAME: STConv3d 10 | GATING: true 11 | STEM: 12 | NAME: STConv3d 13 | NONLOCAL: 14 | ENABLE: false 15 | STAGES: [5] 16 | MASK_ENABLE: false 17 | HEAD: 18 | NAME: BaseHead 19 | ACTIVATION: softmax 20 | DROPOUT_RATE: 0 21 | NUM_CLASSES: # !!! -------------------------------------------------------------------------------- /configs/pool/backbone/slowfast_4x16.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: SlowFast_4x16 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 50 6 | META_ARCH: Slowfast 7 | NUM_FILTERS: [64, 256, 512, 1024, 2048] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 2048 10 | KERNEL_SIZE: [ 11 | [ 12 | [1, 7, 7], 13 | [1, 3, 3], 14 | [1, 3, 3], 15 | [1, 3, 3], 16 | [1, 3, 3], 17 | ], 18 | [ 19 | [5, 7, 7], 20 | [1, 3, 3], 21 | [1, 3, 3], 22 | [1, 3, 3], 23 | [1, 3, 3], 24 | ], 25 | ] 26 | DOWNSAMPLING: [true, false, true, true, true] 27 | DOWNSAMPLING_TEMPORAL: [false, false, false, false, false] 28 | TEMPORAL_CONV_BOTTLENECK: 29 | [ 30 | [false, false, false, true, true], # slow branch, 31 | [false, true, true, true, true] # fast branch 32 | ] 33 | NUM_STREAMS: 1 34 | EXPANSION_RATIO: 4 35 | BRANCH: 36 | NAME: SlowfastBranch 37 | STEM: 38 | NAME: DownSampleStem 39 | SLOWFAST: 40 | MODE: slowfast 41 | ALPHA: 8 42 | BETA: 8 # slow fast channel ratio 43 | CONV_CHANNEL_RATIO: 2 44 | KERNEL_SIZE: 5 45 | FUSION_CONV_BIAS: false 46 | FUSION_BN: true 47 | FUSION_RELU: true 48 | NONLOCAL: 49 | ENABLE: false 50 | STAGES: [5] 51 | MASK_ENABLE: false 52 | HEAD: 53 | NAME: SlowFastHead 54 | ACTIVATION: softmax 55 | DROPOUT_RATE: 0 56 | NUM_CLASSES: # !!! 
57 | DATA: 58 | NUM_INPUT_FRAMES: 32 59 | SAMPLING_RATE: 2 60 | -------------------------------------------------------------------------------- /configs/pool/backbone/slowfast_8x8.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: SlowFast_8x8 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 50 6 | META_ARCH: Slowfast 7 | NUM_FILTERS: [64, 256, 512, 1024, 2048] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 2048 10 | KERNEL_SIZE: [ 11 | [ 12 | [1, 7, 7], 13 | [1, 3, 3], 14 | [1, 3, 3], 15 | [1, 3, 3], 16 | [1, 3, 3], 17 | ], 18 | [ 19 | [5, 7, 7], 20 | [1, 3, 3], 21 | [1, 3, 3], 22 | [1, 3, 3], 23 | [1, 3, 3], 24 | ], 25 | ] 26 | DOWNSAMPLING: [true, false, true, true, true] 27 | DOWNSAMPLING_TEMPORAL: [false, false, false, false, false] 28 | TEMPORAL_CONV_BOTTLENECK: 29 | [ 30 | [false, false, false, true, true], # slow branch, 31 | [false, true, true, true, true] # fast branch 32 | ] 33 | NUM_STREAMS: 1 34 | EXPANSION_RATIO: 4 35 | BRANCH: 36 | NAME: SlowfastBranch 37 | STEM: 38 | NAME: DownSampleStem 39 | SLOWFAST: 40 | MODE: slowfast 41 | ALPHA: 4 42 | BETA: 8 # slow fast channel ratio 43 | CONV_CHANNEL_RATIO: 2 44 | KERNEL_SIZE: 7 45 | FUSION_CONV_BIAS: false 46 | FUSION_BN: true 47 | FUSION_RELU: true 48 | NONLOCAL: 49 | ENABLE: false 50 | STAGES: [5] 51 | MASK_ENABLE: false 52 | HEAD: 53 | NAME: SlowFastHead 54 | ACTIVATION: softmax 55 | DROPOUT_RATE: 0 56 | NUM_CLASSES: # !!! 57 | DATA: 58 | NUM_INPUT_FRAMES: 32 59 | SAMPLING_RATE: 2 -------------------------------------------------------------------------------- /configs/pool/backbone/tada2d.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: TAda2D 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 50 6 | META_ARCH: ResNet3D 7 | NUM_FILTERS: [64, 256, 512, 1024, 2048] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 2048 10 | KERNEL_SIZE: [ 11 | [1, 7, 7], 12 | [1, 3, 3], 13 | [1, 3, 3], 14 | [1, 3, 3], 15 | [1, 3, 3] 16 | ] 17 | DOWNSAMPLING: [true, true, true, true, true] 18 | DOWNSAMPLING_TEMPORAL: [false, false, false, false, false] 19 | NUM_STREAMS: 1 20 | EXPANSION_RATIO: 4 21 | INITIALIZATION: kaiming 22 | STEM: 23 | NAME: Base2DStem 24 | BRANCH: 25 | NAME: TAda2DBlock 26 | ROUTE_FUNC_K: [3, 3] 27 | ROUTE_FUNC_R: 4 28 | POOL_K: [3, 1, 1] 29 | NONLOCAL: 30 | ENABLE: false 31 | STAGES: [5] 32 | MASK_ENABLE: false 33 | HEAD: 34 | NAME: BaseHead 35 | ACTIVATION: softmax 36 | DROPOUT_RATE: 0 37 | NUM_CLASSES: # !!! 38 | -------------------------------------------------------------------------------- /configs/pool/backbone/tadaconvnextv2_base.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: TAdaConvNeXtV2-Base 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: [3, 3, 27, 3] 6 | META_ARCH: ConvNeXt 7 | NUM_FILTERS: [128, 256, 512, 1024] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 1024 10 | DROP_PATH: 0.6 11 | LARGE_SCALE_INIT_VALUE: 1e-6 12 | STEM: 13 | T_KERNEL_SIZE: 3 14 | T_STRIDE: 2 15 | BRANCH: 16 | NAME: TAdaConvNeXtV2Block 17 | ROUTE_FUNC_K: [3, 3] 18 | ROUTE_FUNC_R: 2 19 | HEAD_DIM: 64 20 | HEAD: 21 | NAME: BaseHead 22 | ACTIVATION: softmax 23 | DROPOUT_RATE: 0 24 | NUM_CLASSES: # !!! 
25 | 26 | -------------------------------------------------------------------------------- /configs/pool/backbone/tadaconvnextv2_small.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: TAdaConvNeXtV2-Small 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: [3, 3, 27, 3] 6 | META_ARCH: ConvNeXt 7 | NUM_FILTERS: [96, 192, 384, 768] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 768 10 | DROP_PATH: 0.4 11 | LARGE_SCALE_INIT_VALUE: 1e-6 12 | STEM: 13 | T_KERNEL_SIZE: 3 14 | T_STRIDE: 2 15 | BRANCH: 16 | NAME: TAdaConvNeXtV2Block 17 | ROUTE_FUNC_K: [3, 3] 18 | ROUTE_FUNC_R: 2 19 | HEAD_DIM: 48 20 | HEAD: 21 | NAME: BaseHead 22 | ACTIVATION: softmax 23 | DROPOUT_RATE: 0 24 | NUM_CLASSES: # !!! 25 | 26 | -------------------------------------------------------------------------------- /configs/pool/backbone/tadaconvnextv2_tiny.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: TAdaConvNeXtV2-Tiny 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: [3, 3, 9, 3] 6 | META_ARCH: ConvNeXt 7 | NUM_FILTERS: [96, 192, 384, 768] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 768 10 | DROP_PATH: 0.2 11 | LARGE_SCALE_INIT_VALUE: 1e-6 12 | STEM: 13 | T_KERNEL_SIZE: 3 14 | T_STRIDE: 2 15 | BRANCH: 16 | NAME: TAdaConvNeXtV2Block 17 | ROUTE_FUNC_K: [3, 3] 18 | ROUTE_FUNC_R: 2 19 | HEAD_DIM: 48 20 | HEAD: 21 | NAME: BaseHead 22 | ACTIVATION: softmax 23 | DROPOUT_RATE: 0 24 | NUM_CLASSES: # !!! 25 | 26 | -------------------------------------------------------------------------------- /configs/pool/backbone/tadaformer_b16.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: TAdaFormer_B16 3 | 4 | VIDEO: 5 | BACKBONE: 6 | META_ARCH: VisionTransformer 7 | INPUT_RES: 224 8 | PATCH_SIZE: 16 9 | TUBLET_SIZE: 3 10 | TUBLET_STRIDE: 2 11 | NUM_FEATURES: 768 12 | NUM_OUT_FEATURES: 768 13 | DEPTH: 12 14 | NUM_HEADS: 12 15 | DROP_PATH: 0.0 16 | ATTN_DROPOUT: 0.0 17 | REQUIRE_PROJ: false 18 | ATTN_MASK_ENABLE: false 19 | DOUBLE_TADA: false 20 | FREEZE: false 21 | REDUCTION: 2 22 | BRANCH: 23 | NAME: TAdaFormerBlock 24 | ROUTE_FUNC_K: [3, 3] 25 | ROUTE_FUNC_R: 2 26 | TEMP_ENHANCE: false 27 | HEAD: 28 | NAME: BaseHead 29 | OUTPUT_DIM: 512 -------------------------------------------------------------------------------- /configs/pool/backbone/tadaformer_l14.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: TAdaFormer_L14 3 | 4 | VIDEO: 5 | BACKBONE: 6 | META_ARCH: VisionTransformer 7 | INPUT_RES: 224 8 | PATCH_SIZE: 14 9 | TUBLET_SIZE: 3 10 | TUBLET_STRIDE: 2 11 | NUM_FEATURES: 1024 12 | NUM_OUT_FEATURES: 1024 13 | DEPTH: 24 14 | NUM_HEADS: 16 15 | DROP_PATH: 0.0 16 | ATTN_DROPOUT: 0.0 17 | REQUIRE_PROJ: false 18 | ATTN_MASK_ENABLE: false 19 | DOUBLE_TADA: false 20 | FREEZE: false 21 | REDUCTION: 2 22 | BRANCH: 23 | NAME: TAdaFormerBlock 24 | ROUTE_FUNC_K: [3, 3] 25 | ROUTE_FUNC_R: 2 26 | TEMP_ENHANCE: false 27 | HEAD: 28 | NAME: BaseHead 29 | OUTPUT_DIM: 512 -------------------------------------------------------------------------------- /configs/pool/backbone/timesformer.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: timesformer 3 | VIDEO: 4 | BACKBONE: 5 | META_ARCH: Transformer 6 | NUM_FEATURES: 768 7 | NUM_OUT_FEATURES: 768 8 | PATCH_SIZE: 16 9 | DEPTH: 12 10 | NUM_HEADS: 12 11 | DIM_HEAD: 64 12 | ATTN_DROPOUT: 0.1 13 | FF_DROPOUT: 0.1 14 | DROP_PATH: 0.0 15 | 
PRE_LOGITS: false 16 | STEM: 17 | NAME: PatchEmbedStem 18 | BRANCH: 19 | NAME: TimesformerLayer 20 | NONLOCAL: 21 | ENABLE: false 22 | STAGES: [5] 23 | MASK_ENABLE: false 24 | HEAD: 25 | NAME: TransformerHead 26 | ACTIVATION: softmax 27 | DROPOUT_RATE: 0 28 | NUM_CLASSES: # !!! -------------------------------------------------------------------------------- /configs/pool/backbone/vivit.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: vivit 3 | VIDEO: 4 | BACKBONE: 5 | META_ARCH: Transformer 6 | NUM_FEATURES: 768 7 | NUM_OUT_FEATURES: 768 8 | PATCH_SIZE: 16 9 | TUBELET_SIZE: 2 10 | DEPTH: 12 11 | NUM_HEADS: 12 12 | DIM_HEAD: 64 13 | ATTN_DROPOUT: 0.0 14 | FF_DROPOUT: 0.0 15 | DROP_PATH: 0.1 16 | MLP_MULT: 4 17 | STEM: 18 | NAME: TubeletEmbeddingStem 19 | BRANCH: 20 | NAME: BaseTransformerLayer 21 | HEAD: 22 | NAME: TransformerHead 23 | ACTIVATION: softmax 24 | DROPOUT_RATE: 0 25 | NUM_CLASSES: # !!! 26 | PRE_LOGITS: false 27 | TRAIN: 28 | CHECKPOINT_PRE_PROCESS: 29 | ENABLE: true 30 | POP_HEAD: true 31 | POS_EMBED: repeat 32 | PATCH_EMBD: central_frame -------------------------------------------------------------------------------- /configs/pool/backbone/vivit_fac_enc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: vivit 3 | VIDEO: 4 | BACKBONE: 5 | META_ARCH: FactorizedTransformer 6 | NUM_FEATURES: 768 7 | NUM_OUT_FEATURES: 768 8 | PATCH_SIZE: 16 9 | TUBELET_SIZE: 2 10 | DEPTH: 12 11 | DEPTH_TEMP: 4 12 | NUM_HEADS: 12 13 | DIM_HEAD: 64 14 | ATTN_DROPOUT: 0.0 15 | FF_DROPOUT: 0.0 16 | DROP_PATH: 0.1 17 | MLP_MULT: 4 18 | STEM: 19 | NAME: TubeletEmbeddingStem 20 | BRANCH: 21 | NAME: BaseTransformerLayer 22 | HEAD: 23 | NAME: TransformerHead 24 | ACTIVATION: softmax 25 | DROPOUT_RATE: 0 26 | NUM_CLASSES: # !!! 
27 | PRE_LOGITS: false 28 | TRAIN: 29 | CHECKPOINT_PRE_PROCESS: 30 | ENABLE: true 31 | POP_HEAD: true 32 | POS_EMBED: 33 | PATCH_EMBD: central_frame -------------------------------------------------------------------------------- /configs/pool/base.yaml: -------------------------------------------------------------------------------- 1 | TASK_TYPE: classification 2 | PRETRAIN: 3 | ENABLE: false 4 | LOCALIZATION: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: false 8 | DATASET: 9 | BATCH_SIZE: 128 10 | LOG_FILE: training_log.log 11 | EVAL_PERIOD: 10 12 | NUM_FOLDS: 1 13 | AUTO_RESUME: true 14 | CHECKPOINT_PERIOD: 10 15 | INIT: "" 16 | CHECKPOINT_FILE_PATH: "" 17 | CHECKPOINT_TYPE: pytorch 18 | CHECKPOINT_INFLATE: false 19 | CHECKPOINT_PRE_PROCESS: 20 | ENABLE: false 21 | FINE_TUNE: false 22 | ONLY_LINEAR: false 23 | LR_REDUCE: false 24 | TRAIN_VAL_COMBINE: false 25 | TEST: 26 | ENABLE: false 27 | DATASET: 28 | BATCH_SIZE: 100 29 | NUM_SPATIAL_CROPS: 1 30 | SPATIAL_CROPS: cc 31 | NUM_ENSEMBLE_VIEWS: 1 32 | LOG_FILE: val.log 33 | CHECKPOINT_FILE_PATH: "" 34 | CHECKPOINT_TYPE: pytorch 35 | AUTOMATIC_MULTI_SCALE_TEST: true 36 | VISUALIZATION: 37 | ENABLE: false 38 | NAME: "" 39 | FEATURE_MAPS: 40 | ENABLE: false 41 | BASE_OUTPUT_DIR: "" 42 | SUBMISSION: 43 | ENABLE: false 44 | SAVE_RESULTS_PATH: "test.json" 45 | DATA: 46 | DATA_ROOT_DIR: /data_root/ 47 | ANNO_DIR: /anno_dir/ 48 | NUM_INPUT_FRAMES: 16 49 | NUM_INPUT_CHANNELS: 3 50 | SAMPLING_MODE: interval_based 51 | SAMPLING_RATE: 4 52 | TRAIN_JITTER_SCALES: [168, 224] 53 | TRAIN_CROP_SIZE: 112 54 | TEST_SCALE: 224 55 | TEST_CROP_SIZE: 112 56 | MEAN: [0.45, 0.45, 0.45] 57 | STD: [0.225, 0.225, 0.225] 58 | MULTI_LABEL: false 59 | ENSEMBLE_METHOD: sum 60 | TARGET_FPS: 30 61 | MINUS_INTERVAL: false 62 | MODEL: 63 | NAME: 64 | EMA: 65 | ENABLE: false 66 | DECAY: 0.99996 67 | VIDEO: 68 | BACKBONE: 69 | DEPTH: 70 | META_ARCH: 71 | NUM_FILTERS: 72 | NUM_INPUT_CHANNELS: 3 73 | NUM_OUT_FEATURES: 74 | KERNEL_SIZE: 75 | DOWNSAMPLING: 76 | DOWNSAMPLING_TEMPORAL: 77 | NUM_STREAMS: 1 78 | EXPANSION_RATIO: 2 79 | BRANCH: 80 | NAME: 81 | STEM: 82 | NAME: 83 | NONLOCAL: 84 | ENABLE: false 85 | STAGES: [5] 86 | MASK_ENABLE: false 87 | INITIALIZATION: 88 | HEAD: 89 | NAME: BaseHead 90 | ACTIVATION: softmax 91 | DROPOUT_RATE: 0 92 | NUM_CLASSES: 93 | OPTIMIZER: 94 | ADJUST_LR: false 95 | BASE_LR: 0.002 96 | LR_POLICY: cosine 97 | MAX_EPOCH: 300 98 | MOMENTUM: 0.9 99 | WEIGHT_DECAY: 1e-3 100 | WARMUP_EPOCHS: 10 101 | WARMUP_START_LR: 0.0002 102 | OPTIM_METHOD: adam 103 | DAMPENING: 0.0 104 | NESTEROV: true 105 | BIAS_DOUBLE: false 106 | NEW_PARAMS: [] 107 | NEW_PARAMS_MULT: 10 108 | NEW_PARAMS_WD_MULT: 1 109 | LAYER_WISE_LR_DECAY: 1.0 110 | COSINE_AFTER_WARMUP: false 111 | COSINE_END_LR: 1e-6 112 | BN: 113 | WB_LOCK: false 114 | FREEZE: false 115 | WEIGHT_DECAY: 0.0 116 | MOMENTUM: 0.1 117 | EPS: 1e-5 118 | SYNC: false 119 | DATA_LOADER: 120 | NUM_WORKERS: 4 121 | PIN_MEMORY: false 122 | ENABLE_MULTI_THREAD_DECODE: true 123 | COLLATE_FN: 124 | NUM_GPUS: 8 125 | SHARD_ID: 0 126 | NUM_SHARDS: 1 127 | RANDOM_SEED: 0 128 | OUTPUT_DIR: output/ 129 | OUTPUT_CFG_FILE: configuration.log 130 | LOG_PERIOD: 10 131 | DIST_BACKEND: nccl 132 | LOG_MODEL_INFO: true 133 | LOG_CONFIG_INFO: true 134 | OSS: 135 | ENABLE: false 136 | KEY: 137 | SECRET: 138 | ENDPOINT: 139 | CHECKPOINT_OUTPUT_PATH: # !!@7 140 | SECONDARY_DATA_OSS: 141 | ENABLE: false 142 | KEY: 143 | SECRET: 144 | ENDPOINT: 145 | BUCKETS: [""] 146 | AUGMENTATION: 147 | COLOR_AUG: false 148 | BRIGHTNESS: 0.5 149 | CONTRAST: 
0.5 150 | SATURATION: 0.5 151 | HUE: 0.25 152 | GRAYSCALE: 0.3 153 | CONSISTENT: true 154 | SHUFFLE: true 155 | GRAY_FIRST: true 156 | RATIO: [0.857142857142857, 1.1666666666666667] 157 | USE_GPU: false 158 | MIXUP: 159 | ENABLE: false 160 | ALPHA: 0.0 161 | PROB: 1.0 162 | MODE: batch 163 | SWITCH_PROB: 0.5 164 | CUTMIX: 165 | ENABLE: false 166 | ALPHA: 0.0 167 | MINMAX: 168 | RANDOM_ERASING: 169 | ENABLE: false 170 | PROB: 0.25 171 | MODE: const 172 | COUNT: [1, 1] 173 | NUM_SPLITS: 0 174 | AREA_RANGE: [0.02, 0.33] 175 | MIN_ASPECT: 0.3 176 | LABEL_SMOOTHING: 0.0 177 | SSV2_FLIP: false 178 | PAI: false 179 | USE_MULTISEG_VAL_DIST: false -------------------------------------------------------------------------------- /configs/pool/run/training/finetune.yaml: -------------------------------------------------------------------------------- 1 | PRETRAIN: 2 | ENABLE: false 3 | TRAIN: 4 | ENABLE: true 5 | DATASET: # !!@1 6 | BATCH_SIZE: 1024 7 | LOG_FILE: training_log.log 8 | LOSS_FUNC: cross_entropy 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 30 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: true 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: true # !!@3 20 | DATASET: # !!@3 21 | BATCH_SIZE: 1024 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: true 29 | DATA: 30 | DATA_ROOT_DIR: 31 | ANNO_DIR: 32 | NUM_INPUT_FRAMES: 16 33 | NUM_INPUT_CHANNELS: 3 34 | SAMPLING_MODE: interval_based 35 | SAMPLING_RATE: 4 36 | TRAIN_JITTER_SCALES: [168, 224] 37 | TRAIN_CROP_SIZE: 112 38 | TEST_SCALE: 224 39 | TEST_CROP_SIZE: 112 40 | MEAN: [0.45, 0.45, 0.45] 41 | STD: [0.225, 0.225, 0.225] 42 | MULTI_LABEL: false 43 | ENSEMBLE_METHOD: sum 44 | FPS: 30 45 | TARGET_FPS: 30 46 | OPTIMIZER: 47 | BASE_LR: 0.002 48 | LR_POLICY: cosine 49 | MAX_EPOCH: 300 50 | MOMENTUM: 0.9 51 | WEIGHT_DECAY: 1e-3 52 | WARMUP_EPOCHS: 10 53 | WARMUP_START_LR: 0.0002 54 | OPTIM_METHOD: adam 55 | DAMPENING: 0.0 56 | NESTEROV: true 57 | BN: 58 | WEIGHT_DECAY: 0.0 59 | EPS: 1e-3 60 | DATA_LOADER: 61 | NUM_WORKERS: 4 62 | PIN_MEMORY: false 63 | ENABLE_MULTI_THREAD_DECODE: true 64 | NUM_GPUS: 8 65 | SHARD_ID: 0 66 | NUM_SHARDS: 1 67 | RANDOM_SEED: 0 68 | OUTPUT_DIR: 69 | OUTPUT_CFG_FILE: configuration.log 70 | LOG_PERIOD: 10 71 | DIST_BACKEND: nccl 72 | LOG_MODEL_INFO: true 73 | LOG_CONFIG_INFO: true 74 | AUGMENTATION: 75 | COLOR_AUG: true 76 | BRIGHTNESS: 0.5 77 | CONTRAST: 0.5 78 | SATURATION: 0.5 79 | HUE: 0.25 80 | GRAYSCALE: 0.3 81 | CONSISTENT: true 82 | SHUFFLE: true 83 | GRAY_FIRST: true 84 | RATIO: [0.857142857142857, 1.1666666666666667] 85 | USE_GPU: true 86 | PAI: false 87 | 88 | -------------------------------------------------------------------------------- /configs/pool/run/training/from_scratch.yaml: -------------------------------------------------------------------------------- 1 | PRETRAIN: 2 | ENABLE: false 3 | TRAIN: 4 | ENABLE: true 5 | DATASET: # !!@1 6 | BATCH_SIZE: 1024 7 | LOG_FILE: training_log.log 8 | LOSS_FUNC: cross_entropy 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 30 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: false # !!@3 20 | DATASET: # !!@3 21 | BATCH_SIZE: 1024 22 | NUM_SPATIAL_CROPS: 1 23 | 
SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: true 29 | DATA: 30 | DATA_ROOT_DIR: 31 | ANNO_DIR: 32 | NUM_INPUT_FRAMES: 16 33 | NUM_INPUT_CHANNELS: 3 34 | SAMPLING_MODE: interval_based 35 | SAMPLING_RATE: 4 36 | TRAIN_JITTER_SCALES: [168, 224] 37 | TRAIN_CROP_SIZE: 112 38 | TEST_SCALE: 224 39 | TEST_CROP_SIZE: 112 40 | MEAN: [0.45, 0.45, 0.45] 41 | STD: [0.225, 0.225, 0.225] 42 | MULTI_LABEL: false 43 | ENSEMBLE_METHOD: sum 44 | FPS: 30 45 | TARGET_FPS: 30 46 | OPTIMIZER: 47 | BASE_LR: 0.002 48 | LR_POLICY: cosine 49 | MAX_EPOCH: 300 50 | MOMENTUM: 0.9 51 | WEIGHT_DECAY: 1e-3 52 | WARMUP_EPOCHS: 10 53 | WARMUP_START_LR: 0.0002 54 | OPTIM_METHOD: adam 55 | DAMPENING: 0.0 56 | NESTEROV: true 57 | BN: 58 | WEIGHT_DECAY: 0.0 59 | EPS: 1e-3 60 | DATA_LOADER: 61 | NUM_WORKERS: 4 62 | PIN_MEMORY: false 63 | ENABLE_MULTI_THREAD_DECODE: true 64 | NUM_GPUS: 8 65 | SHARD_ID: 0 66 | NUM_SHARDS: 1 67 | RANDOM_SEED: 0 68 | OUTPUT_DIR: 69 | OUTPUT_CFG_FILE: configuration.log 70 | LOG_PERIOD: 10 71 | DIST_BACKEND: nccl 72 | LOG_MODEL_INFO: true 73 | LOG_CONFIG_INFO: true 74 | AUGMENTATION: 75 | COLOR_AUG: true 76 | BRIGHTNESS: 0.5 77 | CONTRAST: 0.5 78 | SATURATION: 0.5 79 | HUE: 0.25 80 | GRAYSCALE: 0.3 81 | CONSISTENT: true 82 | SHUFFLE: true 83 | GRAY_FIRST: true 84 | RATIO: [0.857142857142857, 1.1666666666666667] 85 | USE_GPU: true 86 | PAI: false -------------------------------------------------------------------------------- /configs/pool/run/training/from_scratch_large.yaml: -------------------------------------------------------------------------------- 1 | PRETRAIN: 2 | ENABLE: false 3 | TRAIN: 4 | ENABLE: true 5 | DATASET: # !!@1 6 | BATCH_SIZE: 256 # 256 for 32 gpus 7 | LOG_FILE: training_log.log 8 | LOSS_FUNC: cross_entropy 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 1 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 5 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: true # !!@3 20 | DATASET: # !!@3 21 | BATCH_SIZE: 256 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: true 29 | AUTOMATIC_MULTI_SCALE_TEST_SPATIAL: true 30 | DATA: 31 | DATA_ROOT_DIR: 32 | ANNO_DIR: 33 | NUM_INPUT_FRAMES: 16 34 | NUM_INPUT_CHANNELS: 3 35 | SAMPLING_MODE: interval_based 36 | SAMPLING_RATE: 4 37 | TRAIN_JITTER_SCALES: [256, 320] 38 | TRAIN_CROP_SIZE: 224 39 | TEST_SCALE: 224 40 | TEST_CROP_SIZE: 224 41 | MEAN: [0.45, 0.45, 0.45] 42 | STD: [0.225, 0.225, 0.225] 43 | MULTI_LABEL: false 44 | ENSEMBLE_METHOD: sum 45 | FPS: 30 46 | TARGET_FPS: 30 47 | OPTIMIZER: 48 | BASE_LR: 0.001 49 | ADJUST_LR: false 50 | LR_POLICY: cosine 51 | MAX_EPOCH: 100 52 | MOMENTUM: 0.9 53 | WEIGHT_DECAY: 1e-4 54 | WARMUP_EPOCHS: 10 55 | WARMUP_START_LR: 0.0001 56 | OPTIM_METHOD: adam 57 | DAMPENING: 0.0 58 | NESTEROV: true 59 | BN: 60 | WEIGHT_DECAY: 0.0 61 | DATA_LOADER: 62 | NUM_WORKERS: 8 63 | PIN_MEMORY: false 64 | ENABLE_MULTI_THREAD_DECODE: true 65 | NUM_GPUS: 32 66 | SHARD_ID: 0 67 | NUM_SHARDS: 1 68 | RANDOM_SEED: 0 69 | OUTPUT_DIR: 70 | OUTPUT_CFG_FILE: configuration.log 71 | LOG_PERIOD: 10 72 | DIST_BACKEND: nccl 73 | LOG_MODEL_INFO: true 74 | LOG_CONFIG_INFO: true 75 | AUGMENTATION: 76 | COLOR_AUG: false 77 | BRIGHTNESS: 0.5 78 | CONTRAST: 0.5 79 | SATURATION: 0.5 80 | 
HUE: 0.25 81 | GRAYSCALE: 0.3 82 | CONSISTENT: true 83 | SHUFFLE: true 84 | GRAY_FIRST: true 85 | RATIO: [0.857142857142857, 1.1666666666666667] 86 | USE_GPU: false 87 | PAI: false -------------------------------------------------------------------------------- /configs/pool/run/training/localization.yaml: -------------------------------------------------------------------------------- 1 | TASK_TYPE: localization 2 | LOCALIZATION: 3 | ENABLE: true 4 | LOSS: Tem+PemReg+PemCls 5 | LOSS_WEIGHTS: [1,10,1] 6 | POS_CLS_THRES: 0.9 7 | POS_REG_THRES: 0.7 8 | NEG_REG_THRES: 0.3 9 | 10 | TEST_OUTPUT_DIR: ./output/ 11 | PROPS_DIR: prop_results 12 | PROPS_REGRESSION_LOSS: smoothl1 13 | RESULT_FILE: localization_detection_res 14 | CLASSIFIER_FILE: "" 15 | POST_PROCESS: 16 | THREAD: 32 17 | SOFT_NMS_ALPHA: 0.4 18 | SOFT_NMS_LOW_THRES: 0.0 19 | SOFT_NMS_HIGH_THRES: 0.0 20 | PROP_NUM: 100 21 | SELECT_SCORE: 0.0001 22 | SCORE_TYPE: 'cr' 23 | CLR_POWER: 1.2 24 | REG_POWER: 1.2 25 | IOU_POWER: 2.0 26 | TCA_POWER: 1.0 27 | ACTION_SCORE_POWER: 1.0 28 | VIDEO_SCORES_WEIGHT: 1.0 29 | 30 | TRAIN: 31 | ENABLE: true 32 | DATASET: Epickitchen100Localization # !!@1 33 | BATCH_SIZE: 64 34 | LOG_FILE: training_log.log 35 | EVAL_PERIOD: 1 36 | NUM_FOLDS: 1 37 | AUTO_RESUME: true 38 | CHECKPOINT_PERIOD: 1 39 | CHECKPOINT_FILE_PATH: "" # !!@2 40 | CHECKPOINT_TYPE: pytorch 41 | CHECKPOINT_INFLATE: false 42 | FINE_TUNE: false 43 | LR_REDUCE: false 44 | TEST: 45 | ENABLE: false # !!@3 46 | OUTPUT_TEST: false 47 | FORCE_FORWARD: false 48 | DATASET: Epickitchen100Localization # !!@3 49 | BATCH_SIZE: 128 50 | LOG_FILE: val.log 51 | TEST_SET: val 52 | CHECKPOINT_FILE_PATH: "" 53 | SAVE_RESULTS_PATH: "preds.log" 54 | CHECKPOINT_TYPE: pytorch 55 | AUTOMATIC_MULTI_SCALE_TEST: false 56 | TEST_CHECKPOINT: [7,8,9,10] 57 | 58 | DATA: 59 | DATA_ROOT_DIR: 60 | ANNO_DIR: 61 | TEMPORAL_SCALE: 200 62 | DURATION_SCALE: -1 63 | TEMPORAL_MODE: resize 64 | NUM_INPUT_CHANNELS: 2304 65 | TEMPORAL_INTERVAL: 0.53333333 66 | NORM_FEATURE: true 67 | ANNO_NAME: "" 68 | LABELS_TYPE: bmn 69 | 70 | SOLVER: 71 | BASE_LR: 0.001 72 | ADJUST_LR: true 73 | LR_POLICY: cosine 74 | MAX_EPOCH: 10 75 | MOMENTUM: 0.9 76 | WEIGHT_DECAY: 1e-4 77 | WARMUP_EPOCHS: 1 78 | WARMUP_START_LR: 0.0001 79 | OPTIM_METHOD: adam 80 | DAMPENING: 0.0 81 | NESTEROV: true 82 | BN: 83 | USE_BN: false 84 | WEIGHT_DECAY: 0.0 85 | DATA_LOADER: 86 | NUM_WORKERS: 8 87 | PIN_MEMORY: true 88 | 89 | NUM_GPUS: 8 90 | SHARD_ID: 0 91 | NUM_SHARDS: 1 92 | RANDOM_SEED: 0 93 | OUTPUT_DIR: output/test 94 | OUTPUT_CFG_FILE: configuration.log 95 | LOG_PERIOD: 10 96 | DIST_BACKEND: nccl 97 | DEBUG_MODE: false 98 | LOG_MODEL_INFO: true 99 | LOG_CONFIG_INFO: true 100 | OSS: 101 | ENABLE: false 102 | PAI: true 103 | -------------------------------------------------------------------------------- /configs/pool/run/training/mosi.yaml: -------------------------------------------------------------------------------- 1 | PRETRAIN: 2 | ENABLE: true 3 | GENERATOR: MoSIGenerator 4 | LOSS: MoSIJoint 5 | LOSS_WEIGHTS: [1] 6 | DISTANCE_JITTER: [1, 1] 7 | SCALE_JITTER: false 8 | NUM_FRAMES: 16 9 | DATA_MODE: xy 10 | DECOUPLE: true 11 | FRAME_SIZE_STANDARDIZE_ENABLE: true 12 | STANDARD_SIZE: 320 13 | LABEL_MODE: joint # seperate / joint 14 | ZERO_OUT: false 15 | STATIC_MASK: true 16 | ASPECT_RATIO: [1, 1] 17 | MASK_SIZE_RATIO: [0.3, 0.5] 18 | NUM_CLIPS_PER_VIDEO: 1 19 | TRAIN: 20 | ENABLE: true 21 | DATASET: # !!@1 22 | BATCH_SIZE: 80 # 80 for 8 gpus 23 | LOG_FILE: training_log.log 24 | EVAL_PERIOD: 5 25 | NUM_FOLDS: 
1 26 | AUTO_RESUME: true 27 | CHECKPOINT_PERIOD: 10 28 | CHECKPOINT_FILE_PATH: "" # !!@2 29 | CHECKPOINT_TYPE: pytorch 30 | CHECKPOINT_INFLATE: false 31 | FINE_TUNE: false 32 | ONLY_LINEAR: false 33 | TEST: 34 | ENABLE: false # !!@3 35 | DATASET: # !!@3 36 | BATCH_SIZE: 80 # 80 for 8 gpus 37 | NUM_SPATIAL_CROPS: 1 38 | SPATIAL_CROPS: cc 39 | NUM_ENSEMBLE_VIEWS: 1 40 | LOG_FILE: val.log 41 | CHECKPOINT_FILE_PATH: "" 42 | CHECKPOINT_TYPE: pytorch 43 | AUTOMATIC_MULTI_SCALE_TEST: false 44 | DATA: 45 | DATA_ROOT_DIR: 46 | ANNO_DIR: 47 | NUM_INPUT_FRAMES: 1 48 | NUM_INPUT_CHANNELS: 3 49 | SAMPLING_MODE: interval_based 50 | SAMPLING_RATE: 4 51 | TRAIN_JITTER_SCALES: [168, 224] 52 | TRAIN_CROP_SIZE: 112 53 | TEST_SCALE: 224 54 | TEST_CROP_SIZE: 112 55 | MEAN: [0.45, 0.45, 0.45] 56 | STD: [0.225, 0.225, 0.225] 57 | MULTI_LABEL: false 58 | ENSEMBLE_METHOD: sum 59 | FPS: 30 60 | TARGET_FPS: 30 61 | OPTIMIZER: 62 | BASE_LR: 0.001 63 | LR_POLICY: cosine 64 | MAX_EPOCH: 100 65 | MOMENTUM: 0.9 66 | WEIGHT_DECAY: 1e-4 67 | WARMUP_EPOCHS: 10 68 | WARMUP_START_LR: 0.0001 69 | OPTIM_METHOD: adam 70 | DAMPENING: 0.0 71 | NESTEROV: true 72 | BN: 73 | WEIGHT_DECAY: 0.0 74 | EPS: 1e-3 75 | DATA_LOADER: 76 | NUM_WORKERS: 4 77 | PIN_MEMORY: false 78 | ENABLE_MULTI_THREAD_DECODE: true 79 | NUM_GPUS: 8 80 | SHARD_ID: 0 81 | NUM_SHARDS: 1 82 | RANDOM_SEED: 0 83 | OUTPUT_DIR: 84 | OUTPUT_CFG_FILE: configuration.log 85 | LOG_PERIOD: 10 86 | DIST_BACKEND: nccl 87 | LOG_MODEL_INFO: true 88 | LOG_CONFIG_INFO: true 89 | AUGMENTATION: 90 | COLOR_AUG: true 91 | BRIGHTNESS: 0.5 92 | CONTRAST: 0.5 93 | SATURATION: 0.5 94 | HUE: 0.25 95 | GRAYSCALE: 0.3 96 | CONSISTENT: false 97 | SHUFFLE: true 98 | GRAY_FIRST: true 99 | RATIO: [0.857142857142857, 1.1666666666666667] 100 | USE_GPU: true 101 | PAI: false 102 | 103 | MODEL: 104 | NAME: MoSINet 105 | VIDEO: 106 | HEAD: 107 | NAME: MoSIHeadJoint 108 | NUM_CLASSES: 5 109 | DROPOUT_RATE: 0.5 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/csn_ek100.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/csn.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: epickitchen100 9 | BATCH_SIZE: 256 10 | CHECKPOINT_FILE_PATH: "" 11 | TEST: 12 | ENABLE: true 13 | DATASET: epickitchen100 14 | BATCH_SIZE: 256 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/clips_512/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/annos/epic-kitchens-100-annotations-master/ 18 | NUM_INPUT_FRAMES: 32 19 | SAMPLING_RATE: 2 20 | TEST_SCALE: 256 21 | TEST_CROP_SIZE: 256 22 | MULTI_LABEL: true 23 | TARGET_FPS: 60 24 | VIDEO: 25 | HEAD: 26 | NAME: BaseHeadx2 27 | NUM_CLASSES: [97, 300] 28 | DROPOUT_RATE: 0.5 29 | DATA_LOADER: 30 | NUM_WORKERS: 4 31 | OPTIMIZER: 32 | BASE_LR: 0.0001 33 | ADJUST_LR: false 34 | LR_POLICY: cosine 35 | MAX_EPOCH: 50 36 | MOMENTUM: 0.9 37 | WEIGHT_DECAY: 0.05 38 | WARMUP_EPOCHS: 5 39 | WARMUP_START_LR: 0.000001 40 | OPTIM_METHOD: adamw 41 | DAMPENING: 0.0 42 | NESTEROV: true 43 | NUM_GPUS: 32 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/csn_ek100_submission.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/csn.yaml 3 | 4 | PRETRAIN: 5 | 
ENABLE: false 6 | TRAIN: 7 | ENABLE: false 8 | DATASET: epickitchen100 9 | BATCH_SIZE: 256 10 | CHECKPOINT_FILE_PATH: "" 11 | TEST: 12 | ENABLE: false 13 | DATASET: epickitchen100 14 | BATCH_SIZE: 256 15 | SUBMISSION: 16 | ENABLE: true 17 | ACTION_CLASS_ENSUMBLE_METHOD: "sum" # sum or calculate 18 | TASK_TYPE: submission 19 | DATA: 20 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/clips_512/ 21 | ANNO_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/annos/epic-kitchens-100-annotations-master/ 22 | NUM_INPUT_FRAMES: 32 23 | SAMPLING_RATE: 2 24 | TEST_SCALE: 256 25 | TEST_CROP_SIZE: 256 26 | MULTI_LABEL: true 27 | TARGET_FPS: 60 28 | VIDEO: 29 | HEAD: 30 | NAME: BaseHeadx2 31 | NUM_CLASSES: [97, 300] 32 | DROPOUT_RATE: 0.5 33 | DATA_LOADER: 34 | NUM_WORKERS: 4 35 | NUM_GPUS: 32 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/csn.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../csn_ek100.yaml 2 | TRAIN: 3 | CHECKPOINT_PERIOD: 1 4 | CHECKPOINT_FILE_PATH: "" # pretrained weights from K400/K700/IG65M... 5 | FINE_TUNE: true 6 | CHECKPOINT_PRE_PROCESS: 7 | ENABLE: true 8 | POP_HEAD: true 9 | POS_EMBED: 10 | PATCH_EMBD: 11 | AUGMENTATION: 12 | COLOR_AUG: true 13 | BRIGHTNESS: 0.5 14 | CONTRAST: 0.5 15 | SATURATION: 0.5 16 | HUE: 0.25 17 | GRAYSCALE: 0.0 18 | CONSISTENT: true 19 | SHUFFLE: false 20 | GRAY_FIRST: false 21 | USE_GPU: false 22 | MIXUP: 23 | ENABLE: true 24 | ALPHA: 0.2 25 | PROB: 1.0 26 | MODE: batch 27 | SWITCH_PROB: 0.5 28 | CUTMIX: 29 | ENABLE: true 30 | ALPHA: 1.0 31 | MINMAX: 32 | RANDOM_ERASING: 33 | ENABLE: true 34 | PROB: 0.25 35 | MODE: pixel 36 | COUNT: [1, 1] 37 | NUM_SPLITS: 0 38 | AREA_RANGE: [0.02, 0.33] 39 | MIN_ASPECT: 0.3 40 | LABEL_SMOOTHING: 0.2 41 | BN: 42 | WB_LOCK: false 43 | FREEZE: true 44 | OUTPUT_DIR: output/csn_ek100 45 | -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/csn_submit.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../csn_ek100_submission.yaml 2 | TRAIN: 3 | CHECKPOINT_FILE_PATH: ./checkpoints/csn152_pt_k700_ft_ek100_32x224x224_4452_public.pyth 4 | BATCH_SIZE: 256 5 | TEST: 6 | BATCH_SIZE: 256 7 | OUTPUT_DIR: output/csn_ek100_submit -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/csn_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../csn_ek100.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/csn152_pt_k700_ft_ek100_32x224x224_4452_public.pyth 5 | BN: 6 | WB_LOCK: false 7 | FREEZE: true 8 | OUTPUT_DIR: output/csn_ek100_test 9 | -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/vivit_fac_enc.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../vivit_fac_enc_ek100.yaml 2 | TRAIN: 3 | CHECKPOINT_PERIOD: 1 4 | EVAL_PERIOD: 1 5 | CHECKPOINT_FILE_PATH: "" # directory of pretrained models 6 | FINE_TUNE: true 7 | BATCH_SIZE: 128 8 | CHECKPOINT_PRE_PROCESS: 9 | ENABLE: true 10 | POP_HEAD: true 11 | POS_EMBED: super-resolution 12 | PATCH_EMBD: 13 | 14 | DATA: 15 | TRAIN_JITTER_SCALES: [336, 448] 16 | TRAIN_CROP_SIZE: 320 17 | TEST_SCALE: 320 18 | TEST_CROP_SIZE: 320 19 | 20 | AUGMENTATION: 21 | COLOR_AUG: true 22 | BRIGHTNESS: 0.5 23 | CONTRAST: 
0.5 24 | SATURATION: 0.5 25 | HUE: 0.25 26 | GRAYSCALE: 0.0 27 | CONSISTENT: true 28 | SHUFFLE: false 29 | GRAY_FIRST: false 30 | USE_GPU: false 31 | MIXUP: 32 | ENABLE: true 33 | ALPHA: 0.2 34 | PROB: 1.0 35 | MODE: batch 36 | SWITCH_PROB: 0.5 37 | CUTMIX: 38 | ENABLE: true 39 | ALPHA: 1.0 40 | MINMAX: 41 | RANDOM_ERASING: 42 | ENABLE: true 43 | PROB: 0.25 44 | MODE: pixel 45 | COUNT: [1, 1] 46 | NUM_SPLITS: 0 47 | AREA_RANGE: [0.02, 0.33] 48 | MIN_ASPECT: 0.3 49 | LABEL_SMOOTHING: 0.2 50 | 51 | VIDEO: 52 | BACKBONE: 53 | DROP_PATH: 0.2 54 | HEAD: 55 | DROPOUT_RATE: 0.0 56 | 57 | DATA_LOADER: 58 | NUM_WORKERS: 8 59 | 60 | OUTPUT_DIR: output/vivit_fac_enc_ek100 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/vivit_fac_enc_submit.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../vivit_fac_enc_ek100_submission.yaml 2 | TRAIN: 3 | CHECKPOINT_PERIOD: 1 4 | EVAL_PERIOD: 1 5 | CHECKPOINT_FILE_PATH: ./checkpoints/vivit_fac_enc_b16x2_pt_k700_ft_ek100_32x224x224_4630_public.pyth 6 | FINE_TUNE: true 7 | BATCH_SIZE: 256 8 | 9 | DATA: 10 | TRAIN_JITTER_SCALES: [336, 448] 11 | TRAIN_CROP_SIZE: 320 12 | TEST_SCALE: 320 13 | TEST_CROP_SIZE: 320 14 | 15 | DATA_LOADER: 16 | NUM_WORKERS: 8 17 | 18 | OUTPUT_DIR: output/vivit_fac_enc_ek100_submit -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/vivit_fac_enc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../vivit_fac_enc_ek100.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/vivit_fac_enc_b16x2_pt_k700_ft_ek100_32x224x224_4630_public.pyth 5 | CHECKPOINT_PRE_PROCESS: 6 | ENABLE: true 7 | POP_HEAD: true 8 | POS_EMBED: super-resolution 9 | PATCH_EMBD: 10 | 11 | DATA: 12 | TRAIN_JITTER_SCALES: [336, 448] 13 | TRAIN_CROP_SIZE: 320 14 | TEST_SCALE: 320 15 | TEST_CROP_SIZE: 320 16 | 17 | DATA_LOADER: 18 | NUM_WORKERS: 8 19 | 20 | OUTPUT_DIR: output/vivit_fac_enc_ek100_test -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/k400/vivit_fac_enc_b16x2.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../vivit_fac_enc_k400.yaml 2 | TRAIN: 3 | CHECKPOINT_PERIOD: 1 4 | EVAL_PERIOD: 1 5 | CHECKPOINT_FILE_PATH: "" # directory to the pretrained imagenet vit b16 224 model 6 | FINE_TUNE: true 7 | OPTIMIZER: 8 | BASE_LR: 0.0001 9 | ADJUST_LR: false 10 | LR_POLICY: cosine 11 | MAX_EPOCH: 30 12 | MOMENTUM: 0.9 13 | WEIGHT_DECAY: 0.1 14 | WARMUP_EPOCHS: 2.5 15 | WARMUP_START_LR: 0.000001 16 | OPTIM_METHOD: adamw 17 | DAMPENING: 0.0 18 | NESTEROV: true 19 | MODEL: 20 | EMA: 21 | ENABLE: true 22 | DECAY: 0.999 23 | 24 | AUGMENTATION: 25 | COLOR_AUG: true 26 | BRIGHTNESS: 0.5 27 | CONTRAST: 0.5 28 | SATURATION: 0.5 29 | HUE: 0.25 30 | GRAYSCALE: 0.3 31 | CONSISTENT: true 32 | SHUFFLE: true 33 | GRAY_FIRST: true 34 | USE_GPU: false 35 | MIXUP: 36 | ENABLE: true 37 | ALPHA: 0.2 38 | PROB: 1.0 39 | MODE: batch 40 | SWITCH_PROB: 0.5 41 | LABEL_SMOOTHING: 0.1 42 | 43 | VIDEO: 44 | HEAD: 45 | DROPOUT_RATE: 0.0 46 | 47 | OUTPUT_DIR: output/vivit_fac_enc_k400 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/k400/vivit_fac_enc_b16x2_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: 
../vivit_fac_enc_k400.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: "./checkpoints/vivit_fac_enc_b16x2_k400_32x224x224_7935_public.pyth" 5 | 6 | OUTPUT_DIR: output/vivit_fac_enc_k400_test -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/vivit_fac_enc_ek100.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/vivit_fac_enc.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: epickitchen100 9 | BATCH_SIZE: 256 10 | CHECKPOINT_FILE_PATH: "" 11 | TEST: 12 | ENABLE: true 13 | DATASET: epickitchen100 14 | BATCH_SIZE: 256 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/clips_512/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/annos/epic-kitchens-100-annotations-master/ 18 | NUM_INPUT_FRAMES: 32 19 | SAMPLING_RATE: 2 20 | MULTI_LABEL: true 21 | TARGET_FPS: 60 22 | VIDEO: 23 | HEAD: 24 | NAME: TransformerHeadx2 25 | NUM_CLASSES: [97, 300] 26 | DROPOUT_RATE: 0.5 27 | 28 | DATA_LOADER: 29 | NUM_WORKERS: 4 30 | 31 | OPTIMIZER: 32 | BASE_LR: 0.0001 33 | ADJUST_LR: false 34 | LR_POLICY: cosine 35 | MAX_EPOCH: 50 36 | MOMENTUM: 0.9 37 | WEIGHT_DECAY: 0.05 38 | WARMUP_EPOCHS: 5 39 | WARMUP_START_LR: 0.000001 40 | OPTIM_METHOD: adamw 41 | DAMPENING: 0.0 42 | NESTEROV: true 43 | NUM_GPUS: 32 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/vivit_fac_enc_ek100_submission.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/vivit_fac_enc.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: false 8 | DATASET: epickitchen100 9 | BATCH_SIZE: 256 10 | CHECKPOINT_FILE_PATH: "" 11 | TEST: 12 | ENABLE: false 13 | DATASET: epickitchen100 14 | BATCH_SIZE: 256 15 | SUBMISSION: 16 | ENABLE: true 17 | ACTION_CLASS_ENSUMBLE_METHOD: "sum" # sum or calculate 18 | TASK_TYPE: submission 19 | DATA: 20 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/clips_512/ 21 | ANNO_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/annos/epic-kitchens-100-annotations-master/ 22 | NUM_INPUT_FRAMES: 32 23 | SAMPLING_RATE: 2 24 | MULTI_LABEL: true 25 | TARGET_FPS: 60 26 | VIDEO: 27 | HEAD: 28 | NAME: TransformerHeadx2 29 | NUM_CLASSES: [97, 300] 30 | DROPOUT_RATE: 0.5 31 | 32 | DATA_LOADER: 33 | NUM_WORKERS: 10 34 | NUM_GPUS: 32 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/vivit_fac_enc_k400.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/vivit_fac_enc.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: kinetics400 9 | BATCH_SIZE: 256 10 | CHECKPOINT_FILE_PATH: "" 11 | TEST: 12 | ENABLE: true 13 | DATASET: kinetics400 14 | BATCH_SIZE: 256 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 18 | SAMPLING_RATE: 2 19 | NUM_INPUT_FRAMES: 32 20 | VIDEO: 21 | HEAD: 22 | NUM_CLASSES: 400 23 | DROPOUT_RATE: 0.5 24 | 25 | DATA_LOADER: 26 | NUM_WORKERS: 4 27 | NUM_GPUS: 32 -------------------------------------------------------------------------------- 
/configs/projects/epic-kitchen-tal/bmn-epic/vivit-os-local.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../bmn_epic.yaml 2 | TRAIN: 3 | ENABLE: true 4 | BATCH_SIZE: 4 5 | CHECKPOINT_FILE_PATH: "" 6 | TEST: 7 | ENABLE: true 8 | BATCH_SIZE: 4 9 | TEST_CHECKPOINT: [9] 10 | CHECKPOINT_FILE_PATH: "" 11 | OUTPUT_DIR: /mnt/data-nas/qingzhiwu/results/checkpoints/epic_tal/vvt-os/ 12 | 13 | -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-tal/bmn_epic.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/localization.yaml 2 | _BASE_MODEL: ../../pool/backbone/localization-conv.yaml 3 | 4 | TRAIN: 5 | ENABLE: true 6 | BATCH_SIZE: 16 7 | DATASET: Epickitchen100Localization 8 | CHECKPOINT_FILE_PATH: # !!@2 9 | TEST: 10 | ENABLE: true 11 | BATCH_SIZE: 16 12 | DATASET: Epickitchen100Localization 13 | 14 | LOCALIZATION: 15 | ENABLE: true 16 | LOSS: Tem+PemReg+PemCls 17 | LOSS_WEIGHTS: [1,10,1,1] 18 | TEST_OUTPUT_DIR: ./output/ 19 | PROPS_DIR: prop_results 20 | RESULT_FILE: tal_detection_res 21 | CLASSIFIER_FILE: 22 | POST_PROCESS: 23 | PROP_NUM_RATIO: 2 24 | THREAD: 32 25 | SOFT_NMS_ALPHA: 0.4 26 | SOFT_NMS_LOW_THRES: 0.25 27 | SOFT_NMS_HIGH_THRES: 0.9 28 | PROP_NUM_RATIO: 1.0 29 | SELECT_SCORE: 0.0 30 | SCORE_TYPE: 'cr' 31 | CLR_POWER: 1.2 32 | REG_POWER: 1.0 33 | IOU_POWER: 2.0 34 | ACTION_SCORE_POWER: 1.0 35 | VIDEO_SCORES_WEIGHT: 1.0 36 | 37 | DATA: 38 | DATA_ROOT_DIR: [/mnt/data-nas/qingzhiwu/dataset/epic-tal/features/features_s8_fps60_320_-1_train/] 39 | ANNO_DIR: /mnt/data-nas/qingzhiwu/dataset/epic-tal/annotations/ 40 | VIDEO_LENGTH_FILE: epic_videos_len.txt 41 | ANNO_NAME: "EPIC_100_validation.json" 42 | TEMPORAL_SCALE: 200 43 | DURATION_SCALE: 100 44 | NUM_INPUT_CHANNELS: 6912 45 | NORM_FEATURE: false 46 | LABELS_TYPE: bmn 47 | LOAD_TYPE: torch 48 | CLIPS_LIST_FILE: 5s_clips.txt 49 | TARGET_FPS: 60 50 | NUM_INPUT_FRAMES: 32 51 | SAMPLING_RATE: 2 52 | CLIP_INTERVAL: 8 53 | MULTI_LABEL: true 54 | CLASSIFIER_ROOT_DIR: /mnt/data-nas/qingzhiwu/dataset/epic-tal/features/cls_res_s8_fps60_320_-1_train/ 55 | LOAD_CLASSIFIER_RES: true 56 | 57 | OPTIMIZER: 58 | BASE_LR: 0.002 59 | ADJUST_LR: true 60 | LR_POLICY: cosine 61 | MAX_EPOCH: 10 62 | MOMENTUM: 0.9 63 | WEIGHT_DECAY: 1e-4 64 | WARMUP_EPOCHS: 1 65 | WARMUP_START_LR: 0.00001 66 | OPTIM_METHOD: adamw 67 | DAMPENING: 0.0 68 | NESTEROV: true 69 | 70 | VIDEO: 71 | HEAD: 72 | NAME: BaseBMN 73 | ACTIVATION: sigmoid 74 | DROPOUT_RATE: 0 75 | NUM_SAMPLE: 32 76 | NUM_SAMPLE_PERBIN: 3 77 | BOUNDARY_RATIO: 0.5 78 | USE_BMN_REGRESSION: false 79 | 80 | LOG_PERIOD: 50 81 | USE_MULTISEG_VAL_DIST: true -------------------------------------------------------------------------------- /configs/projects/mosi/baselines/r2d3ds_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../../pool/run/training/from_scratch.yaml 2 | _BASE_MODEL: ../../../pool/backbone/r2d3ds.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: HMDB51 9 | CHECKPOINT_FILE_PATH: "" 10 | BATCH_SIZE: 1024 11 | TEST: 12 | ENABLE: true 13 | DATASET: HMDB51 14 | BATCH_SIZE: 1024 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 18 | VIDEO: 19 | HEAD: 20 | NUM_CLASSES: 51 21 | DROPOUT_RATE: 0.5 22 | OUTPUT_DIR: output/r2d3ds_hmdb_from_scratch 23 | NUM_GPUS: 8 
-------------------------------------------------------------------------------- /configs/projects/mosi/baselines/r2d3ds_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../../pool/run/training/from_scratch.yaml 2 | _BASE_MODEL: ../../../pool/backbone/r2d3ds.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: UCF101 9 | CHECKPOINT_FILE_PATH: "" 10 | BATCH_SIZE: 1024 11 | TEST: 12 | ENABLE: true 13 | DATASET: UCF101 14 | BATCH_SIZE: 1024 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 18 | VIDEO: 19 | HEAD: 20 | NUM_CLASSES: 101 21 | DROPOUT_RATE: 0.5 22 | OUTPUT_DIR: output/r2d3ds_ucf_from_scratch 23 | NUM_GPUS: 8 24 | -------------------------------------------------------------------------------- /configs/projects/mosi/baselines/r2p1d_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../../pool/run/training/from_scratch.yaml 2 | _BASE_MODEL: ../../../pool/backbone/r2p1d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: HMDB51 9 | CHECKPOINT_FILE_PATH: "" 10 | BATCH_SIZE: 384 11 | TEST: 12 | ENABLE: true 13 | DATASET: HMDB51 14 | BATCH_SIZE: 384 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 18 | VIDEO: 19 | HEAD: 20 | NUM_CLASSES: 51 21 | DROPOUT_RATE: 0.5 22 | OUTPUT_DIR: output/r2p1d_hmdb_from_scratch 23 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /configs/projects/mosi/baselines/r2p1d_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../../pool/run/training/from_scratch.yaml 2 | _BASE_MODEL: ../../../pool/backbone/r2p1d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: UCF101 9 | CHECKPOINT_FILE_PATH: "" 10 | BATCH_SIZE: 384 11 | TEST: 12 | ENABLE: true 13 | DATASET: UCF101 14 | BATCH_SIZE: 384 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 18 | VIDEO: 19 | HEAD: 20 | NUM_CLASSES: 101 21 | DROPOUT_RATE: 0.5 22 | OUTPUT_DIR: output/r2p1d_ucf_from_scratch 23 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /configs/projects/mosi/ft-hmdb/r2d3ds.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2d3ds_hmdb.yaml 2 | TRAIN: 3 | CHECKPOINT_FILE_PATH: ./checkpoints/r2d3ds_pt_hmdb_mosi_public.pyth 4 | OUTPUT_DIR: output/r2d3ds_mosi_ft_hmdb -------------------------------------------------------------------------------- /configs/projects/mosi/ft-hmdb/r2d3ds_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2d3ds_hmdb.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/r2d3ds_pt_hmdb_ft_hmdb_4693_public.pyth 5 | OUTPUT_DIR: output/r2d3ds_mosi_ft_hmdb_test -------------------------------------------------------------------------------- /configs/projects/mosi/ft-hmdb/r2p1d.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2p1d_hmdb.yaml 2 | TRAIN: 3 | CHECKPOINT_FILE_PATH: ./checkpoints/r2p1d_pt_hmdb_mosi_public.pyth 4 | OUTPUT_DIR: output/r2p1d_mosi_ft_hmdb -------------------------------------------------------------------------------- 
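The MoSI fine-tuning configs above point `TRAIN.CHECKPOINT_FILE_PATH` at pre-trained `.pyth` files and declare `CHECKPOINT_TYPE: pytorch`, so the released weights are ordinary PyTorch archives. A plain `torch.load` is enough to sanity-check a downloaded file before launching a run; the exact dictionary layout depends on how the repository saves its checkpoints, so treat the key listing below as exploratory rather than a documented format.

```python
import torch

# Path taken from the ft-hmdb config above; adjust it to wherever the file was downloaded.
ckpt_path = "./checkpoints/r2d3ds_pt_hmdb_mosi_public.pyth"

# .pyth files are standard torch checkpoints, so torch.load can open them directly.
checkpoint = torch.load(ckpt_path, map_location="cpu")
if isinstance(checkpoint, dict):
    print("top-level keys:", list(checkpoint.keys()))
else:
    print("checkpoint object of type:", type(checkpoint))
```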
/configs/projects/mosi/ft-hmdb/r2p1d_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2p1d_hmdb.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/r2p1d_pt_hmdb_ft_hmdb_5183_public.pyth 5 | OUTPUT_DIR: output/r2p1d_mosi_ft_hmdb_test -------------------------------------------------------------------------------- /configs/projects/mosi/ft-ucf/r2d3ds.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2d3ds_ucf.yaml 2 | TRAIN: 3 | CHECKPOINT_FILE_PATH: ./checkpoints/r2d3ds_pt_ucf_mosi_public.pyth 4 | OUTPUT_DIR: output/r2d3ds_mosi_ft_ucf -------------------------------------------------------------------------------- /configs/projects/mosi/ft-ucf/r2d3ds_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2d3ds_ucf.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/r2d3ds_pt_ucf_ft_ucf_7175_public.pyth 5 | OUTPUT_DIR: output/r2d3ds_mosi_ft_ucf_test -------------------------------------------------------------------------------- /configs/projects/mosi/ft-ucf/r2p1d.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2p1d_ucf.yaml 2 | TRAIN: 3 | CHECKPOINT_FILE_PATH: ./checkpoints/r2p1d_pt_ucf_mosi_public.pyth 4 | OUTPUT_DIR: output/r2p1d_mosi_ft_ucf 5 | -------------------------------------------------------------------------------- /configs/projects/mosi/ft-ucf/r2p1d_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2p1d_ucf.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/r2p1d_pt_ucf_ft_ucf_8279_public.pyth 5 | OUTPUT_DIR: output/r2p1d_mosi_ft_ucf_test 6 | -------------------------------------------------------------------------------- /configs/projects/mosi/ft_r2d3ds_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/finetune.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2d3ds.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: HMDB51 9 | CHECKPOINT_FILE_PATH: "" # !!@2 10 | BATCH_SIZE: 1024 11 | TEST: 12 | ENABLE: true 13 | DATASET: HMDB51 14 | BATCH_SIZE: 1024 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 18 | MINUS_INTERVAL: false 19 | VIDEO: 20 | HEAD: 21 | NUM_CLASSES: 51 22 | DROPOUT_RATE: 0.5 23 | OPTIMIZER: 24 | BASE_LR: 0.002 25 | WARMUP_START_LR: 0.0002 26 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /configs/projects/mosi/ft_r2d3ds_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/finetune.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2d3ds.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: UCF101 9 | CHECKPOINT_FILE_PATH: "" # !!@2 10 | BATCH_SIZE: 1024 11 | TEST: 12 | ENABLE: true 13 | DATASET: UCF101 14 | BATCH_SIZE: 1024 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 18 | MINUS_INTERVAL: false 19 | VIDEO: 20 | HEAD: 21 | NUM_CLASSES: 101 22 | DROPOUT_RATE: 0.5 23 | OPTIMIZER: 24 | BASE_LR: 0.004 25 | WARMUP_START_LR: 0.0004 26 | NUM_GPUS: 8 -------------------------------------------------------------------------------- 
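The fine-tuning configs above describe their learning-rate schedule declaratively: `LR_POLICY: cosine`, a `BASE_LR`, and a warmup phase whose `WARMUP_START_LR` is one tenth of the base rate. The helper below sketches the usual linear-warmup-then-cosine shape those fields imply; the repository's own scheduler (and its `ADJUST_LR` scaling) may differ in detail, and the `max_epoch`/`warmup_epochs` values in the demo call are placeholders, since they come from the `finetune.yaml` base config that is not shown in this excerpt.

```python
import math

def lr_at_epoch(epoch: float, base_lr: float, max_epoch: int,
                warmup_epochs: float, warmup_start_lr: float) -> float:
    """Linear warmup to BASE_LR, then cosine decay towards zero."""
    if epoch < warmup_epochs:
        # Linearly ramp from WARMUP_START_LR up to BASE_LR.
        alpha = epoch / warmup_epochs
        return warmup_start_lr + (base_lr - warmup_start_lr) * alpha
    # Cosine decay from BASE_LR over the remaining epochs.
    progress = (epoch - warmup_epochs) / (max_epoch - warmup_epochs)
    return 0.5 * base_lr * (1.0 + math.cos(math.pi * progress))

# BASE_LR / WARMUP_START_LR from ft_r2d3ds_ucf.yaml above; the epoch counts are placeholders.
for e in [0, 2, 10, 50, 100]:
    print(e, round(lr_at_epoch(e, base_lr=0.004, max_epoch=100,
                               warmup_epochs=10, warmup_start_lr=0.0004), 6))
```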
/configs/projects/mosi/ft_r2p1d_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/finetune.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2p1d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: HMDB51 9 | CHECKPOINT_FILE_PATH: "" # !!@2 10 | BATCH_SIZE: 384 11 | TEST: 12 | ENABLE: true 13 | DATASET: HMDB51 14 | BATCH_SIZE: 384 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 18 | MINUS_INTERVAL: false 19 | VIDEO: 20 | HEAD: 21 | NUM_CLASSES: 51 22 | DROPOUT_RATE: 0.5 23 | OPTIMIZER: 24 | BASE_LR: 0.00075 25 | WARMUP_START_LR: 0.000075 26 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /configs/projects/mosi/ft_r2p1d_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/finetune.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2p1d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: UCF101 9 | CHECKPOINT_FILE_PATH: "" # !!@2 10 | BATCH_SIZE: 384 11 | TEST: 12 | ENABLE: true 13 | DATASET: UCF101 14 | BATCH_SIZE: 384 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 18 | MINUS_INTERVAL: false 19 | VIDEO: 20 | HEAD: 21 | NUM_CLASSES: 101 22 | DROPOUT_RATE: 0.5 23 | OPTIMIZER: 24 | BASE_LR: 0.0015 25 | WARMUP_START_LR: 0.00015 26 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /configs/projects/mosi/mosi_r2d3ds_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/mosi.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2d3ds.yaml 3 | 4 | TRAIN: 5 | ENABLE: true 6 | DATASET: HMDB51 7 | BATCH_SIZE: 10 # 10 per gpu 8 | LOG_FILE: training_log.log 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 20 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: false 20 | DATASET: HMDB51 21 | BATCH_SIZE: 10 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: false 29 | DATA: 30 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 31 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 32 | NUM_GPUS: 16 -------------------------------------------------------------------------------- /configs/projects/mosi/mosi_r2d3ds_imagenet.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/mosi.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2d3ds.yaml 3 | 4 | PRETRAIN: 5 | IMAGENET_DATA_SIZE: 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: imagenet 9 | BATCH_SIZE: 10 # 10 per gpu 10 | LOG_FILE: training_log.log 11 | EVAL_PERIOD: 5 12 | NUM_FOLDS: 20 13 | AUTO_RESUME: true 14 | CHECKPOINT_PERIOD: 10 15 | CHECKPOINT_FILE_PATH: "" # !!@2p 16 | CHECKPOINT_TYPE: pytorch 17 | CHECKPOINT_INFLATE: false 18 | FINE_TUNE: false 19 | ONLY_LINEAR: false 20 | TEST: 21 | ENABLE: false 22 | DATASET: imagenet 23 | BATCH_SIZE: 10 24 | NUM_SPATIAL_CROPS: 1 25 | SPATIAL_CROPS: cc 26 | NUM_ENSEMBLE_VIEWS: 1 27 | LOG_FILE: val.log 28 | CHECKPOINT_FILE_PATH: "" 29 | CHECKPOINT_TYPE: pytorch 30 | 
AUTOMATIC_MULTI_SCALE_TEST: false 31 | DATA: 32 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/imagenet/ 33 | ANNO_DIR: /mnt/ziyuan/ziyuan/imagenet/ 34 | MEAN: [0.485, 0.456, 0.406] 35 | STD: [0.229, 0.224, 0.225] 36 | NUM_GPUS: 16 -------------------------------------------------------------------------------- /configs/projects/mosi/mosi_r2d3ds_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/mosi.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2d3ds.yaml 3 | 4 | TRAIN: 5 | ENABLE: true 6 | DATASET: UCF101 7 | BATCH_SIZE: 10 # 10 per gpu 8 | LOG_FILE: training_log.log 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 20 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: false 20 | DATASET: UCF101 21 | BATCH_SIZE: 10 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: false 29 | DATA: 30 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 31 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 32 | NUM_GPUS: 16 -------------------------------------------------------------------------------- /configs/projects/mosi/mosi_r2p1d_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/mosi.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2p1d.yaml 3 | 4 | TRAIN: 5 | ENABLE: true 6 | DATASET: HMDB51 7 | BATCH_SIZE: 5 # 5 per gpu 8 | LOG_FILE: training_log.log 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 20 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: false 20 | DATASET: HMDB51 21 | BATCH_SIZE: 5 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: false 29 | DATA: 30 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 31 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 32 | NUM_GPUS: 16 -------------------------------------------------------------------------------- /configs/projects/mosi/mosi_r2p1d_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/mosi.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2p1d.yaml 3 | 4 | TRAIN: 5 | ENABLE: true 6 | DATASET: UCF101 7 | BATCH_SIZE: 5 # 5 per gpu 8 | LOG_FILE: training_log.log 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 20 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: false 20 | DATASET: UCF101 21 | BATCH_SIZE: 5 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: false 29 | DATA: 30 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 31 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 32 | NUM_GPUS: 16 -------------------------------------------------------------------------------- /configs/projects/mosi/pt-hmdb/r2d3ds.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE: ../mosi_r2d3ds_hmdb.yaml 2 | TRAIN: 3 | EVAL_PERIOD: 10 4 | OUTPUT_DIR: output/r2d3ds_pt_hmdb -------------------------------------------------------------------------------- /configs/projects/mosi/pt-hmdb/r2p1d.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../mosi_r2p1d_hmdb.yaml 2 | TRAIN: 3 | EVAL_PERIOD: 10 4 | OUTPUT_DIR: output/r2p1d_pt_hmdb 5 | -------------------------------------------------------------------------------- /configs/projects/mosi/pt-imagenet/r2d3ds.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../mosi_r2d3ds_imagenet.yaml 2 | TRAIN: 3 | EVAL_PERIOD: 10 4 | PRETRAIN: 5 | IMAGENET_DATA_SIZE: 5 6 | OUTPUT_DIR: output/r2d3ds_pt_imagenet -------------------------------------------------------------------------------- /configs/projects/mosi/pt-ucf/r2d3ds.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../mosi_r2d3ds_ucf.yaml 2 | TRAIN: 3 | EVAL_PERIOD: 10 4 | OUTPUT_DIR: output/r2d3ds_pt_ucf -------------------------------------------------------------------------------- /configs/projects/mosi/pt-ucf/r2p1d.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../mosi_r2p1d_ucf.yaml 2 | TRAIN: 3 | EVAL_PERIOD: 10 4 | OUTPUT_DIR: output/r2p1d_pt_ucf 5 | -------------------------------------------------------------------------------- /configs/projects/tada/k400/tada2d_16x5.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../tada2d_k400.yaml 2 | TRAIN: 3 | FINE_TUNE: true 4 | BATCH_SIZE: 64 5 | INIT: in1k 6 | CHECKPOINT_FILE_PATH: "" # pretrained imagenet weights 7 | OPTIMIZER: 8 | BASE_LR: 0.12 9 | DATA: 10 | SAMPLING_RATE: 5 11 | NUM_INPUT_FRAMES: 16 12 | OUTPUT_DIR: output/tada2d_16x5_k400 -------------------------------------------------------------------------------- /configs/projects/tada/k400/tada2d_8x8.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../tada2d_k400.yaml 2 | TRAIN: 3 | FINE_TUNE: true 4 | INIT: in1k 5 | CHECKPOINT_FILE_PATH: "" # pretrained imagenet weights 6 | DATA: 7 | SAMPLING_RATE: 8 8 | NUM_INPUT_FRAMES: 8 9 | OUTPUT_DIR: output/tada2d_8x8_k400 -------------------------------------------------------------------------------- /configs/projects/tada/ssv2/tada2d_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../tada2d_ssv2.yaml 2 | TRAIN: 3 | FINE_TUNE: true 4 | BATCH_SIZE: 64 5 | INIT: in1k 6 | CHECKPOINT_FILE_PATH: "" # pretrained imagenet weights 7 | DATA: 8 | NUM_INPUT_FRAMES: 16 9 | OPTIMIZER: 10 | BASE_LR: 0.24 11 | OUTPUT_DIR: output/tada2d_ssv2_16f -------------------------------------------------------------------------------- /configs/projects/tada/ssv2/tada2d_8f.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../tada2d_ssv2.yaml 2 | TRAIN: 3 | FINE_TUNE: true 4 | INIT: in1k 5 | CHECKPOINT_FILE_PATH: "" # pretrained imagenet weights 6 | DATA: 7 | NUM_INPUT_FRAMES: 8 8 | OUTPUT_DIR: output/tada2d_ssv2_8f -------------------------------------------------------------------------------- /configs/projects/tada/tada2d_k400.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: 
../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tada2d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: kinetics400 9 | BATCH_SIZE: 128 10 | FINE_TUNE: true 11 | INIT: in1k 12 | CHECKPOINT_FILE_PATH: "" # !!@2 13 | TEST: 14 | ENABLE: true 15 | DATASET: kinetics400 16 | BATCH_SIZE: 128 17 | DATA: 18 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 19 | ANNO_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 20 | SAMPLING_RATE: 8 21 | NUM_INPUT_FRAMES: 8 22 | TRAIN_JITTER_SCALES: [224, 340] 23 | TRAIN_CROP_SIZE: 224 24 | TEST_SCALE: 256 25 | TEST_CROP_SIZE: 256 26 | VIDEO: 27 | HEAD: 28 | NUM_CLASSES: 400 29 | DROPOUT_RATE: 0.5 30 | DATA_LOADER: 31 | NUM_WORKERS: 8 32 | OPTIMIZER: 33 | BASE_LR: 0.24 34 | ADJUST_LR: false 35 | LR_POLICY: cosine 36 | MAX_EPOCH: 100 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 1e-4 39 | WARMUP_EPOCHS: 8 40 | WARMUP_START_LR: 0.01 41 | OPTIM_METHOD: sgd 42 | DAMPENING: 0.0 43 | NESTEROV: true 44 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /configs/projects/tada/tada2d_ssv2.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tada2d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: ssv2 9 | BATCH_SIZE: 128 10 | FINE_TUNE: true 11 | INIT: in1k 12 | CHECKPOINT_FILE_PATH: "" 13 | TEST: 14 | ENABLE: true 15 | DATASET: ssv2 16 | BATCH_SIZE: 128 17 | DATA: 18 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ssv2/ 19 | ANNO_DIR: /mnt/ziyuan/ziyuan/ssv2/ 20 | NUM_INPUT_FRAMES: 8 21 | SAMPLING_MODE: segment_based 22 | TRAIN_JITTER_SCALES: [224, 340] 23 | TRAIN_CROP_SIZE: 224 24 | TEST_SCALE: 256 25 | TEST_CROP_SIZE: 256 26 | VIDEO: 27 | HEAD: 28 | NUM_CLASSES: 174 29 | DROPOUT_RATE: 0.5 30 | DATA_LOADER: 31 | NUM_WORKERS: 8 32 | OPTIMIZER: 33 | BASE_LR: 0.48 34 | ADJUST_LR: false 35 | LR_POLICY: cosine 36 | MAX_EPOCH: 64 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 1e-4 39 | WARMUP_EPOCHS: 4 40 | WARMUP_START_LR: 0.0001 41 | OPTIM_METHOD: sgd 42 | DAMPENING: 0.0 43 | NESTEROV: true 44 | AUGMENTATION: 45 | SSV2_FLIP: true 46 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /configs/projects/tadaconvnextv2/tadaconvnextv2_base_k400_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaconvnextv2_base.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: kinetics400 9 | BATCH_SIZE: 64 #total batch size: 64x4=256 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: in1k 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: kinetics400 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 21 | SAMPLING_RATE: 5 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 256 26 | TEST_CROP_SIZE: 256 27 | VIDEO: 28 | BACKBONE: 29 | DROP_PATH: 0.6 30 | HEAD: 31 | NUM_CLASSES: 400 32 | DROPOUT_RATE: 0.5 33 | 34 | OUTPUT_DIR: output/tadaconvnextv2_base_k400_16f 35 | 36 | OPTIMIZER: 37 | BASE_LR: 2.5e-4 38 | ADJUST_LR: false 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 100 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 0.02 43 | WARMUP_EPOCHS: 8 44 | WARMUP_START_LR: 1e-6 45 | OPTIM_METHOD: adamw 46 | 
DAMPENING: 0.0 47 | NESTEROV: true 48 | HEAD_LRMULT: 10 49 | NEW_PARAMS: ["dwconv_rf", "norm_avgpool"] 50 | NEW_PARAMS_MULT: 10 51 | AUGMENTATION: 52 | COLOR_AUG: true 53 | GRAYSCALE: 0.2 54 | COLOR_P: 0.0 55 | CONSISTENT: true 56 | SHUFFLE: true 57 | GRAY_FIRST: false 58 | IS_SPLIT: false 59 | USE_GPU: false 60 | SSV2_FLIP: true 61 | RATIO: [0.75, 1.333] 62 | MIXUP: 63 | ENABLE: false 64 | CUTMIX: 65 | ENABLE: false 66 | RANDOM_ERASING: 67 | ENABLE: false 68 | LABEL_SMOOTHING: 0.0 69 | AUTOAUGMENT: 70 | ENABLE: true 71 | BEFORE_CROP: true 72 | TYPE: rand-m9-n4-mstd0.5-inc1 73 | NUM_GPUS: 8 74 | DATA_LOADER: 75 | NUM_WORKERS: 12 76 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaconvnextv2/tadaconvnextv2_base_ssv2_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaconvnextv2_base.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: ssv2 9 | BATCH_SIZE: 64 #total batch size: 64x4=256 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: in1k # by default, the initialization is from kinetics 400 pretrain 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: ssv2 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ssv2/videos_mp4/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/ssv2/labels/ 21 | SAMPLING_MODE: segment_based 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 256 26 | TEST_CROP_SIZE: 256 27 | VIDEO: 28 | BACKBONE: 29 | DROP_PATH: 0.6 30 | HEAD: 31 | NUM_CLASSES: 174 32 | DROPOUT_RATE: 0.5 33 | 34 | OUTPUT_DIR: output/tadaconvnextv2_base_ssv2_16f 35 | 36 | OPTIMIZER: 37 | BASE_LR: 2.5e-4 38 | ADJUST_LR: false 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 64 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 0.02 43 | WARMUP_EPOCHS: 2.5 44 | WARMUP_START_LR: 1e-6 45 | OPTIM_METHOD: adamw 46 | DAMPENING: 0.0 47 | NESTEROV: true 48 | HEAD_LRMULT: 10 49 | NEW_PARAMS: ["dwconv_rf", "norm_avgpool"] 50 | NEW_PARAMS_MULT: 10 51 | AUGMENTATION: 52 | COLOR_AUG: true 53 | GRAYSCALE: 0.2 54 | COLOR_P: 0.0 55 | CONSISTENT: true 56 | SHUFFLE: true 57 | GRAY_FIRST: false 58 | IS_SPLIT: false 59 | USE_GPU: false 60 | SSV2_FLIP: true 61 | RATIO: [0.75, 1.333] 62 | MIXUP: 63 | ENABLE: false 64 | CUTMIX: 65 | ENABLE: false 66 | RANDOM_ERASING: 67 | ENABLE: false 68 | LABEL_SMOOTHING: 0.0 69 | AUTOAUGMENT: 70 | ENABLE: true 71 | BEFORE_CROP: true 72 | TYPE: rand-m9-n4-mstd0.5-inc1 73 | NUM_GPUS: 8 74 | DATA_LOADER: 75 | NUM_WORKERS: 12 76 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaconvnextv2/tadaconvnextv2_small_k400_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaconvnextv2_small.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: kinetics400 9 | BATCH_SIZE: 64 #total batch size: 64x4=256 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: in1k 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: kinetics400 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 21 | SAMPLING_RATE: 5 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | 
TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 256 26 | TEST_CROP_SIZE: 256 27 | VIDEO: 28 | BACKBONE: 29 | DROP_PATH: 0.4 30 | HEAD: 31 | NUM_CLASSES: 400 32 | DROPOUT_RATE: 0.5 33 | 34 | OUTPUT_DIR: output/tadaconvnextv2_small_k400_16f 35 | 36 | OPTIMIZER: 37 | BASE_LR: 2.5e-4 38 | ADJUST_LR: false 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 100 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 0.02 43 | WARMUP_EPOCHS: 8 44 | WARMUP_START_LR: 1e-6 45 | OPTIM_METHOD: adamw 46 | DAMPENING: 0.0 47 | NESTEROV: true 48 | HEAD_LRMULT: 10 49 | NEW_PARAMS: ["dwconv_rf", "norm_avgpool"] 50 | NEW_PARAMS_MULT: 10 51 | AUGMENTATION: 52 | COLOR_AUG: true 53 | GRAYSCALE: 0.2 54 | COLOR_P: 0.0 55 | CONSISTENT: true 56 | SHUFFLE: true 57 | GRAY_FIRST: false 58 | IS_SPLIT: false 59 | USE_GPU: false 60 | SSV2_FLIP: true 61 | RATIO: [0.75, 1.333] 62 | MIXUP: 63 | ENABLE: false 64 | CUTMIX: 65 | ENABLE: false 66 | RANDOM_ERASING: 67 | ENABLE: false 68 | LABEL_SMOOTHING: 0.0 69 | AUTOAUGMENT: 70 | ENABLE: true 71 | BEFORE_CROP: true 72 | TYPE: rand-m9-n4-mstd0.5-inc1 73 | NUM_GPUS: 8 74 | DATA_LOADER: 75 | NUM_WORKERS: 12 76 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaconvnextv2/tadaconvnextv2_small_ssv2_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaconvnextv2_small.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: ssv2 9 | BATCH_SIZE: 64 #total batch size: 64x4=256 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: in1k # by default, the initialization is from kinetics 400 pretrain 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: ssv2 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ssv2/videos_mp4/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/ssv2/labels/ 21 | SAMPLING_MODE: segment_based 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 256 26 | TEST_CROP_SIZE: 256 27 | VIDEO: 28 | BACKBONE: 29 | DROP_PATH: 0.5 30 | HEAD: 31 | NUM_CLASSES: 174 32 | DROPOUT_RATE: 0.5 33 | 34 | OUTPUT_DIR: output/tadaconvnextv2_small_ssv2_16f 35 | 36 | OPTIMIZER: 37 | BASE_LR: 2.5e-4 38 | ADJUST_LR: false 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 64 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 0.02 43 | WARMUP_EPOCHS: 2.5 44 | WARMUP_START_LR: 1e-6 45 | OPTIM_METHOD: adamw 46 | DAMPENING: 0.0 47 | NESTEROV: true 48 | HEAD_LRMULT: 10 49 | NEW_PARAMS: ["dwconv_rf", "norm_avgpool"] 50 | NEW_PARAMS_MULT: 10 51 | AUGMENTATION: 52 | COLOR_AUG: true 53 | GRAYSCALE: 0.2 54 | COLOR_P: 0.0 55 | CONSISTENT: true 56 | SHUFFLE: true 57 | GRAY_FIRST: false 58 | IS_SPLIT: false 59 | USE_GPU: false 60 | SSV2_FLIP: true 61 | RATIO: [0.75, 1.333] 62 | MIXUP: 63 | ENABLE: false 64 | CUTMIX: 65 | ENABLE: false 66 | RANDOM_ERASING: 67 | ENABLE: false 68 | LABEL_SMOOTHING: 0.0 69 | AUTOAUGMENT: 70 | ENABLE: true 71 | BEFORE_CROP: true 72 | TYPE: rand-m9-n4-mstd0.5-inc1 73 | NUM_GPUS: 8 74 | DATA_LOADER: 75 | NUM_WORKERS: 12 76 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaconvnextv2/tadaconvnextv2_tiny_k400_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaconvnextv2_tiny.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | 
TRAIN: 7 | ENABLE: true 8 | DATASET: kinetics400 9 | BATCH_SIZE: 128 #total batch size: 128x4=512 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: in1k 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: kinetics400 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 21 | SAMPLING_RATE: 5 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 256 26 | TEST_CROP_SIZE: 256 27 | VIDEO: 28 | BACKBONE: 29 | DROP_PATH: 0.2 30 | HEAD: 31 | NUM_CLASSES: 400 32 | DROPOUT_RATE: 0.5 33 | 34 | OUTPUT_DIR: output/tadaconvnextv2_tiny_k400_16f 35 | 36 | OPTIMIZER: 37 | BASE_LR: 5e-4 38 | ADJUST_LR: false 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 100 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 0.02 43 | WARMUP_EPOCHS: 8 44 | WARMUP_START_LR: 1e-6 45 | OPTIM_METHOD: adamw 46 | DAMPENING: 0.0 47 | NESTEROV: true 48 | HEAD_LRMULT: 10 49 | NEW_PARAMS: ["dwconv_rf", "norm_avgpool"] 50 | NEW_PARAMS_MULT: 10 51 | AUGMENTATION: 52 | COLOR_AUG: true 53 | GRAYSCALE: 0.2 54 | COLOR_P: 0.0 55 | CONSISTENT: true 56 | SHUFFLE: true 57 | GRAY_FIRST: false 58 | IS_SPLIT: false 59 | USE_GPU: false 60 | SSV2_FLIP: true 61 | RATIO: [0.75, 1.333] 62 | MIXUP: 63 | ENABLE: false 64 | CUTMIX: 65 | ENABLE: false 66 | RANDOM_ERASING: 67 | ENABLE: false 68 | LABEL_SMOOTHING: 0.0 69 | AUTOAUGMENT: 70 | ENABLE: true 71 | BEFORE_CROP: true 72 | TYPE: rand-m9-n4-mstd0.5-inc1 73 | NUM_GPUS: 8 74 | DATA_LOADER: 75 | NUM_WORKERS: 12 76 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaconvnextv2/tadaconvnextv2_tiny_ssv2_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaconvnextv2_tiny.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: ssv2 9 | BATCH_SIZE: 128 #total batch size: 128x4=512 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: in1k # by default, the initialization is from kinetics 400 pretrain 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: ssv2 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ssv2/videos_mp4/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/ssv2/labels/ 21 | SAMPLING_MODE: segment_based 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 256 26 | TEST_CROP_SIZE: 256 27 | VIDEO: 28 | BACKBONE: 29 | DROP_PATH: 0.3 30 | HEAD: 31 | NUM_CLASSES: 174 32 | DROPOUT_RATE: 0.5 33 | 34 | OUTPUT_DIR: output/tadaconvnextv2_tiny_ssv2_16f 35 | 36 | OPTIMIZER: 37 | BASE_LR: 5e-4 38 | ADJUST_LR: false 39 | LR_POLICY: cosine 40 | MAX_EPOCH: 64 41 | MOMENTUM: 0.9 42 | WEIGHT_DECAY: 0.02 43 | WARMUP_EPOCHS: 2.5 44 | WARMUP_START_LR: 1e-6 45 | OPTIM_METHOD: adamw 46 | DAMPENING: 0.0 47 | NESTEROV: true 48 | HEAD_LRMULT: 10 49 | NEW_PARAMS: ["dwconv_rf", "norm_avgpool"] 50 | NEW_PARAMS_MULT: 10 51 | AUGMENTATION: 52 | COLOR_AUG: true 53 | GRAYSCALE: 0.2 54 | COLOR_P: 0.0 55 | CONSISTENT: true 56 | SHUFFLE: true 57 | GRAY_FIRST: false 58 | IS_SPLIT: false 59 | USE_GPU: false 60 | SSV2_FLIP: true 61 | RATIO: [0.75, 1.333] 62 | MIXUP: 63 | ENABLE: false 64 | CUTMIX: 65 | ENABLE: false 66 | RANDOM_ERASING: 67 | ENABLE: false 68 | LABEL_SMOOTHING: 0.0 69 | AUTOAUGMENT: 70 | ENABLE: true 71 | BEFORE_CROP: true 72 | TYPE: rand-m9-n4-mstd0.5-inc1 73 | NUM_GPUS: 8 
74 | DATA_LOADER: 75 | NUM_WORKERS: 12 76 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaformer/tadaformer_b16_k400_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaformer_b16.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: kinetics400 9 | BATCH_SIZE: 256 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: clip 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: kinetics400 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 21 | SAMPLING_MODE: segment_based 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 224 26 | TEST_CROP_SIZE: 224 27 | MEAN: [0.48145466, 0.4578275, 0.40821073] 28 | STD: [0.26862954, 0.26130258, 0.27577711] 29 | VIDEO: 30 | HEAD: 31 | NUM_CLASSES: 400 32 | DROPOUT_RATE: 0.5 33 | 34 | OUTPUT_DIR: output/tadaformer_b16_k400_16f 35 | 36 | OPTIMIZER: 37 | BASE_LR: 5e-5 38 | ADJUST_LR: false 39 | LR_POLICY: cosine_v2 40 | COSINE_END_LR: 1e-6 41 | COSINE_AFTER_WARMUP: true 42 | MAX_EPOCH: 30 43 | MOMENTUM: 0.9 44 | WEIGHT_DECAY: 0.05 45 | WARMUP_EPOCHS: 5 46 | WARMUP_START_LR: 1e-6 47 | OPTIM_METHOD: adamw 48 | DAMPENING: 0.0 49 | NESTEROV: true 50 | HEAD_LRMULT: 10 51 | NEW_PARAMS: ["tada"] 52 | NEW_PARAMS_MULT: 10 53 | LAYER_WISE_LR_DECAY: 0.7 54 | AUGMENTATION: 55 | COLOR_AUG: true 56 | GRAYSCALE: 0.2 57 | COLOR_P: 0.0 58 | CONSISTENT: true 59 | SHUFFLE: true 60 | GRAY_FIRST: false 61 | IS_SPLIT: false 62 | USE_GPU: false 63 | SSV2_FLIP: true 64 | RATIO: [0.75, 1.333] 65 | MIXUP: 66 | ENABLE: false 67 | CUTMIX: 68 | ENABLE: false 69 | RANDOM_ERASING: 70 | ENABLE: false 71 | LABEL_SMOOTHING: 0.1 72 | AUTOAUGMENT: 73 | ENABLE: true 74 | BEFORE_CROP: true 75 | TYPE: rand-m9-n4-mstd0.5-inc1 76 | NUM_GPUS: 8 77 | DATA_LOADER: 78 | NUM_WORKERS: 12 79 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaformer/tadaformer_b16_ssv2_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaformer_b16.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: ssv2 9 | BATCH_SIZE: 256 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: clip 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: ssv2 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ssv2/videos_mp4/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/ssv2/labels/ 21 | SAMPLING_MODE: segment_based 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 224 26 | TEST_CROP_SIZE: 224 27 | MEAN: [0.48145466, 0.4578275, 0.40821073] 28 | STD: [0.26862954, 0.26130258, 0.27577711] 29 | VIDEO: 30 | BACKBONE: 31 | TEMP_ENHANCE: true 32 | DOUBLE_TADA: true 33 | HEAD: 34 | NUM_CLASSES: 174 35 | DROPOUT_RATE: 0.5 36 | 37 | OUTPUT_DIR: output/tadaformer_b16_ssv2_16f 38 | 39 | OPTIMIZER: 40 | BASE_LR: 5e-4 41 | ADJUST_LR: false 42 | LR_POLICY: cosine_v2 43 | COSINE_END_LR: 1e-6 44 | COSINE_AFTER_WARMUP: true 45 | MAX_EPOCH: 24 46 | MOMENTUM: 0.9 47 | WEIGHT_DECAY: 0.05 48 | WARMUP_EPOCHS: 4 49 | WARMUP_START_LR: 1e-8 50 | 
OPTIM_METHOD: adamw 51 | DAMPENING: 0.0 52 | NESTEROV: true 53 | HEAD_LRMULT: 10 54 | NEW_PARAMS: ["tada"] 55 | NEW_PARAMS_MULT: 10 56 | LAYER_WISE_LR_DECAY: 0.7 57 | AUGMENTATION: 58 | COLOR_AUG: true 59 | GRAYSCALE: 0.2 60 | COLOR_P: 0.0 61 | CONSISTENT: true 62 | SHUFFLE: true 63 | GRAY_FIRST: false 64 | IS_SPLIT: false 65 | USE_GPU: false 66 | SSV2_FLIP: true 67 | RATIO: [0.75, 1.333] 68 | MIXUP: 69 | ENABLE: false 70 | CUTMIX: 71 | ENABLE: false 72 | RANDOM_ERASING: 73 | ENABLE: false 74 | LABEL_SMOOTHING: 0.1 75 | AUTOAUGMENT: 76 | ENABLE: true 77 | BEFORE_CROP: true 78 | TYPE: rand-m9-n4-mstd0.5-inc1 79 | NUM_GPUS: 8 80 | DATA_LOADER: 81 | NUM_WORKERS: 12 82 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaformer/tadaformer_l14_k400_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaformer_l14.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: kinetics400 9 | BATCH_SIZE: 64 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: clip 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: kinetics400 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 21 | SAMPLING_MODE: segment_based 22 | NUM_INPUT_FRAMES: 16 23 | TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 224 26 | TEST_CROP_SIZE: 224 27 | MEAN: [0.48145466, 0.4578275, 0.40821073] 28 | STD: [0.26862954, 0.26130258, 0.27577711] 29 | VIDEO: 30 | HEAD: 31 | NUM_CLASSES: 400 32 | DROPOUT_RATE: 0.5 33 | 34 | OUTPUT_DIR: output/tadaformer_l14_k400_16f 35 | 36 | OPTIMIZER: 37 | BASE_LR: 2e-5 38 | ADJUST_LR: false 39 | LR_POLICY: cosine_v2 40 | COSINE_END_LR: 1e-6 41 | COSINE_AFTER_WARMUP: true 42 | MAX_EPOCH: 24 43 | MOMENTUM: 0.9 44 | WEIGHT_DECAY: 0.05 45 | WARMUP_EPOCHS: 5 46 | WARMUP_START_LR: 1e-6 47 | OPTIM_METHOD: adamw 48 | DAMPENING: 0.0 49 | NESTEROV: true 50 | HEAD_LRMULT: 10 51 | NEW_PARAMS: ["tada"] 52 | NEW_PARAMS_MULT: 10 53 | LAYER_WISE_LR_DECAY: 0.85 54 | AUGMENTATION: 55 | COLOR_AUG: true 56 | GRAYSCALE: 0.2 57 | COLOR_P: 0.0 58 | CONSISTENT: true 59 | SHUFFLE: true 60 | GRAY_FIRST: false 61 | IS_SPLIT: false 62 | USE_GPU: false 63 | SSV2_FLIP: true 64 | RATIO: [0.75, 1.333] 65 | MIXUP: 66 | ENABLE: false 67 | CUTMIX: 68 | ENABLE: false 69 | RANDOM_ERASING: 70 | ENABLE: false 71 | LABEL_SMOOTHING: 0.1 72 | AUTOAUGMENT: 73 | ENABLE: true 74 | BEFORE_CROP: true 75 | TYPE: rand-m9-n4-mstd0.5-inc1 76 | NUM_GPUS: 16 77 | DATA_LOADER: 78 | NUM_WORKERS: 12 79 | PIN_MEMORY: true -------------------------------------------------------------------------------- /configs/projects/tadaformer/tadaformer_l14_ssv2_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tadaformer_l14.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: ssv2 9 | BATCH_SIZE: 128 10 | FINE_TUNE: true 11 | LR_REDUCE: true 12 | INIT: clip 13 | CHECKPOINT_FILE_PATH: "" 14 | TEST: 15 | ENABLE: true 16 | DATASET: ssv2 17 | BATCH_SIZE: 256 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ssv2/videos_mp4/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/ssv2/labels/ 21 | SAMPLING_MODE: segment_based 22 | NUM_INPUT_FRAMES: 16 23 | 
TRAIN_JITTER_SCALES: [0.08, 1.0] 24 | TRAIN_CROP_SIZE: 224 25 | TEST_SCALE: 224 26 | TEST_CROP_SIZE: 224 27 | MEAN: [0.48145466, 0.4578275, 0.40821073] 28 | STD: [0.26862954, 0.26130258, 0.27577711] 29 | VIDEO: 30 | BACKBONE: 31 | DROP_PATH: 0.2 32 | TEMP_ENHANCE: true 33 | DOUBLE_TADA: true 34 | HEAD: 35 | NUM_CLASSES: 174 36 | DROPOUT_RATE: 0.5 37 | 38 | OUTPUT_DIR: output/tadaformer_l14_ssv2_16f 39 | 40 | OPTIMIZER: 41 | BASE_LR: 2.5e-4 42 | ADJUST_LR: false 43 | LR_POLICY: cosine_v2 44 | COSINE_END_LR: 1e-6 45 | COSINE_AFTER_WARMUP: true 46 | MAX_EPOCH: 24 47 | MOMENTUM: 0.9 48 | WEIGHT_DECAY: 0.05 49 | WARMUP_EPOCHS: 4 50 | WARMUP_START_LR: 1e-8 51 | OPTIM_METHOD: adamw 52 | DAMPENING: 0.0 53 | NESTEROV: true 54 | HEAD_LRMULT: 10 55 | NEW_PARAMS: ["tada"] 56 | NEW_PARAMS_MULT: 10 57 | LAYER_WISE_LR_DECAY: 0.85 58 | AUGMENTATION: 59 | COLOR_AUG: true 60 | GRAYSCALE: 0.2 61 | COLOR_P: 0.0 62 | CONSISTENT: true 63 | SHUFFLE: true 64 | GRAY_FIRST: false 65 | IS_SPLIT: false 66 | USE_GPU: false 67 | SSV2_FLIP: true 68 | RATIO: [0.75, 1.333] 69 | MIXUP: 70 | ENABLE: false 71 | CUTMIX: 72 | ENABLE: false 73 | RANDOM_ERASING: 74 | ENABLE: false 75 | LABEL_SMOOTHING: 0.1 76 | AUTOAUGMENT: 77 | ENABLE: true 78 | BEFORE_CROP: true 79 | TYPE: rand-m9-n4-mstd0.5-inc1 80 | NUM_GPUS: 8 81 | DATA_LOADER: 82 | NUM_WORKERS: 12 83 | PIN_MEMORY: true -------------------------------------------------------------------------------- /projects/epic-kitchen-ar/README.md: -------------------------------------------------------------------------------- 1 | # Towards training stronger video vision transformers for epic-kitchens-100 action recognition (CVPR 2021 Workshop) 2 | [Ziyuan Huang](https://huang-ziyuan.github.io/), [Zhiwu Qing](https://scholar.google.com/citations?user=q9refl4AAAAJ&hl=zh-CN), Xiang Wang, Yutong Feng, [Shiwei Zhang](https://scholar.google.com/citations?user=ZO3OQ-8AAAAJ&hl=zh-CN&authuser=1), Jianwen Jiang, Zhurong Xia, Mingqian Tang, Nong Sang, and [Marcelo Ang](https://www.eng.nus.edu.sg/me/staff/ang-jr-marcelo-h/).
3 | In arXiv, 2021. [[Paper]](https://arxiv.org/pdf/2106.05058). 4 | 5 | # Running instructions 6 | Action recognition on Epic-Kitchens-100 shares the same pipeline as classification. Refer to `configs/projects/epic-kitchen-ar/vivit_fac_enc_ek100.yaml` for more details. We also include some trained weights in the [MODEL ZOO](../../MODEL_ZOO.md). 7 | 8 | For an example run, set the `DATA_ROOT_DIR`, `ANNO_DIR` and `NUM_GPUS` in `configs/projects/epic-kitchen-ar/vivit_fac_enc_ek100.yaml`, and run the command 9 | 10 | ``` 11 | python runs/run.py --cfg configs/projects/epic-kitchen-ar/ek100/vivit_fac_enc.yaml 12 | ``` 13 | 14 | # Citing this report 15 | If you find the training setting useful, please consider citing the paper as follows: 16 | ```BibTeX 17 | @article{huang2021towards, 18 | title={Towards training stronger video vision transformers for epic-kitchens-100 action recognition}, 19 | author={Huang, Ziyuan and Qing, Zhiwu and Wang, Xiang and Feng, Yutong and Zhang, Shiwei and Jiang, Jianwen and Xia, Zhurong and Tang, Mingqian and Sang, Nong and Ang Jr, Marcelo H}, 20 | journal={arXiv preprint arXiv:2106.05058}, 21 | year={2021} 22 | } 23 | ``` -------------------------------------------------------------------------------- /projects/epic-kitchen-tal/README.md: -------------------------------------------------------------------------------- 1 | 2 | # A Stronger Baseline for Ego-Centric Action Detection (CVPR 2021 Workshop) 3 | 4 | 5 | # Running instructions 6 | To train the action localization model, set the `_BASE_RUN` to point to `configs/pool/run/training/localization.yaml`. See `configs/projects/epic-kitchen-tal/bmn_epic.yaml` for more details. Alternatively, you can also find some pre-trained models in `MODEL_ZOO.md`. 7 | 8 | For detailed explanations on the approach itself, please refer to the [paper](https://arxiv.org/pdf/2106.06942). 9 | 10 | To prepare the dataset, please download the [features](), [classification results]() and [dataset annotations]().
11 | 12 | 13 | For an example run, set the `DATA_ROOT_DIR`, `ANNO_DIR`, `CLASSIFIER_ROOT_DIR` and `NUM_GPUS` in `configs/projects/epic-kitchen-tal/bmn_epic.yaml`, and run the command 14 | 15 | ``` 16 | python runs/run.py --cfg configs/projects/epic-kitchen-tal/bmn-epic/vivit-os-local.yaml 17 | ``` 18 | 19 | 20 | # Citing this report 21 | If you find this report useful for your research, please consider citing the paper as follows: 22 | ```BibTeX 23 | @article{qing2021stronger, 24 | title={A Stronger Baseline for Ego-Centric Action Detection}, 25 | author={Qing, Zhiwu and Huang, Ziyuan and Wang, Xiang and Feng, Yutong and Zhang, Shiwei and Jiang, Jianwen and Tang, Mingqian and Gao, Changxin and Ang Jr, Marcelo H and Sang, Nong}, 26 | journal={arXiv preprint arXiv:2106.06942}, 27 | year={2021} 28 | } 29 | ``` 30 | -------------------------------------------------------------------------------- /projects/mosi/MoSI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alibaba-mmai-research/TAdaConv/75b7839b37fc94d98d4fe5f2aff4b3df4e347dfb/projects/mosi/MoSI.png -------------------------------------------------------------------------------- /projects/mosi/README.md: -------------------------------------------------------------------------------- 1 | # Self-supervised Motion Learning from Static Images (CVPR 2021) 2 | [Ziyuan Huang](https://huang-ziyuan.github.io/), [Shiwei Zhang](https://scholar.google.com/citations?user=ZO3OQ-8AAAAJ&hl=zh-CN&authuser=1), Jianwen Jiang, Mingqian Tang, 3 | [Rong Jin](https://www.cse.msu.edu/~rongjin/), [Marcelo Ang](https://www.eng.nus.edu.sg/me/staff/ang-jr-marcelo-h/),
4 | In CVPR, 2021. 5 | 6 | [[Paper](https://openaccess.thecvf.com/content/CVPR2021/papers/Huang_Self-Supervised_Motion_Learning_From_Static_Images_CVPR_2021_paper.pdf)] 7 | 8 | # Running instructions 9 | To train the model with MoSI, set the `_BASE_RUN` to point to `configs/pool/run/training/mosi.yaml`. See `configs/projects/mosi/mosi_*.yaml` for more details. Alternatively, you can also find some pre-trained models in `MODEL_ZOO.md`. 10 | 11 | For detailed explanations on the approach itself, please refer to the [paper](https://openaccess.thecvf.com/content/CVPR2021/papers/Huang_Self-Supervised_Motion_Learning_From_Static_Images_CVPR_2021_paper.pdf). 12 | 13 | For an example run, set the `DATA_ROOT_DIR`, `ANNO_DIR` and `NUM_GPUS` in `configs/projects/mosi/mosi_r2d3ds_hmdb.yaml`, and run the command 14 | 15 | ``` 16 | python runs/run.py --cfg configs/projects/mosi/pt-hmdb/r2d3ds.yaml 17 | ``` 18 | 
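MoSI pre-trains the backbone by synthesising pseudo motion from static images and asking the network to classify the motion it is shown. The toy function below only illustrates the sliding-crop idea behind such pseudo motion (a crop window moving across a single image in a known direction, which then serves as the self-supervised label); the actual label space, crop parameters and sampling logic live in the repository's pretrain pipeline and differ from this sketch.

```python
import numpy as np

def pseudo_motion_clip(image: np.ndarray, num_frames: int = 8,
                       crop: int = 112, direction: str = "right") -> np.ndarray:
    """Build a clip from one static image by sliding a crop window in a known direction."""
    h, w = image.shape[:2]
    max_y, max_x = h - crop, w - crop
    frames = []
    for t in np.linspace(0.0, 1.0, num_frames):
        if direction == "right":
            x, y = int(t * max_x), max_y // 2
        elif direction == "down":
            x, y = max_x // 2, int(t * max_y)
        else:
            raise ValueError(f"unsupported direction: {direction}")
        frames.append(image[y:y + crop, x:x + crop])
    return np.stack(frames)  # (num_frames, crop, crop, channels)

clip = pseudo_motion_clip(np.zeros((224, 224, 3), dtype=np.uint8), direction="down")
print(clip.shape)  # (8, 112, 112, 3)
```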
24 | 25 | # Citing MoSI 26 | If you find MoSI useful for your research, please consider citing the paper as follows: 27 | ```BibTeX 28 | @inproceedings{mosi2021, 29 | title={Self-supervised motion learning from static images}, 30 | author={Huang, Ziyuan and Zhang, Shiwei and Jiang, Jianwen and Tang, Mingqian and Jin, Rong and Ang, Marcelo H}, 31 | booktitle={{CVPR}}, 32 | pages={1276--1285}, 33 | year={2021} 34 | } 35 | ``` -------------------------------------------------------------------------------- /projects/tada/README.md: -------------------------------------------------------------------------------- 1 | # TAda! Temporally-Adaptive Convolutions for Video Understanding (ICLR 2022) 2 | [Ziyuan Huang](https://huang-ziyuan.github.io/), [Shiwei Zhang](https://scholar.google.com/citations?user=ZO3OQ-8AAAAJ&hl=zh-CN&authuser=1), [Liang Pan](https://scholar.google.com/citations?user=lSDISOcAAAAJ&hl=zh-CN&authuser=1), [Zhiwu Qing](https://scholar.google.com/citations?user=q9refl4AAAAJ&hl=zh-CN&authuser=1), 3 | Mingqian Tang, [Ziwei Liu](https://liuziwei7.github.io/), [Marcelo Ang](https://www.eng.nus.edu.sg/me/staff/ang-jr-marcelo-h/),
4 | In ICLR, 2022. 5 | 6 | [[Paper](https://arxiv.org/pdf/2110.06178)][[Project homepage](https://tadaconv-iclr2022.github.io)] 7 | 8 | # Running instructions 9 | To train TAda2D networks, set the `_BASE_MODEL` to point to `configs/pool/backbone/tada2d.yaml`. See `configs/projects/tada/tada2d_*.yaml` for more details. 10 | TAda2D networks trained on Kinetics and Something-Something can be found in [`MODEL_ZOO.md`](../../MODEL_ZOO.md). 11 | 12 | For an example run, set the `DATA_ROOT_DIR`, `ANNO_DIR` and `NUM_GPUS` in `configs/projects/tada/tada2d_k400.yaml`, and run the command 13 | 14 | ``` 15 | python runs/run.py --cfg configs/projects/tada/k400/tada2d_8x8.yaml 16 | ``` 17 | 18 |
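The operator behind TAda2D is the temporally-adaptive convolution: a single shared kernel whose weights are calibrated separately for every frame by a lightweight branch that looks at the frame's descriptor. The module below is a deliberately tiny toy of that idea, not the repository's TAdaConv (which also uses temporal context and a factorised calibration; see the `tadaconv/models` package); it only shows how a per-frame, per-sample channel scale can be folded into a grouped 2D convolution.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyTAdaConv2d(nn.Module):
    """Toy temporally-adaptive conv: one shared base kernel, rescaled per frame and per sample."""
    def __init__(self, channels: int, kernel_size: int = 3):
        super().__init__()
        self.base_weight = nn.Parameter(
            torch.randn(channels, channels, kernel_size, kernel_size) * 0.02)
        # Tiny calibration branch: per-frame channel descriptor -> per-channel scale in (0, 1).
        self.calibrate = nn.Sequential(nn.Linear(channels, channels), nn.Sigmoid())
        self.padding = kernel_size // 2

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, channels, time, height, width)
        b, c, t, h, w = x.shape
        out = []
        for i in range(t):
            frame = x[:, :, i]                      # (B, C, H, W)
            desc = frame.mean(dim=(2, 3))           # global average pool -> (B, C)
            alpha = self.calibrate(desc)            # calibration scale per sample and channel
            # Rescale the shared kernel, then run all samples as one grouped convolution.
            weight = self.base_weight.unsqueeze(0) * alpha.view(b, 1, c, 1, 1)
            merged = frame.reshape(1, b * c, h, w)
            weight = weight.reshape(b * c, c, *self.base_weight.shape[2:])
            y = F.conv2d(merged, weight, padding=self.padding, groups=b)
            out.append(y.reshape(b, c, h, w))
        return torch.stack(out, dim=2)              # (B, C, T, H, W)

video = torch.randn(2, 8, 4, 32, 32)                # 2 clips, 8 channels, 4 frames
print(ToyTAdaConv2d(8)(video).shape)                 # torch.Size([2, 8, 4, 32, 32])
```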
23 | 24 | 25 | # Model Zoo 26 | 27 | | Dataset | architecture | #frames x sampling rate | top1 | top5| checkpoint | config | 28 | | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | 29 | | K400 | TAda2D-R50 | 8 x 8 | 76.7 | 92.6 | [[google drive](https://drive.google.com/file/d/1YsbTKLoDwxtStAsP5oxUMbIsw85NvY0O/view?usp=sharing)][[baidu](https://pan.baidu.com/s/1rPPZtVDlEoftkg-r_Di59w)(code:p06d)] | [tada2d_8x8.yaml](../../configs/projects/tada/k400/tada2d_8x8.yaml) | 30 | | K400 | TAda2D-R50 | 16 x 5 | 77.4 | 93.1 | [[google drive](https://drive.google.com/file/d/1UQDurxakmnDxa5D2tBuTqTH60BVyW3XM/view?usp=sharing)][[baidu](https://pan.baidu.com/s/1MzFCZU1G1JR2ur9gWd3hCg)(code:6k8h)] | [tada2d_16x5.yaml](../../configs/projects/tada/k400/tada2d_16x5.yaml) | 31 | | SSV2 | TAda2D-R50 | 8 | 64.0 | 88.0 | [[google drive](https://drive.google.com/file/d/16y6dDf-hcMmJ2jDCV9tRla8aRJZKJXSk/view?usp=sharing)][[baidu](https://pan.baidu.com/s/1CWy35SlWMbKnYqZXESndKg)(code:dlil)] | [tada2d_8f.yaml](../../configs/projects/tada/ssv2/tada2d_8f.yaml) | 32 | | SSV2 | TAda2D-R50 | 16 | 65.6 | 89.1 | [[google drive](https://drive.google.com/file/d/1xwCxuFW6DZ0xpEsp_tFJYQRGuHPJe4uS/view?usp=sharing)][[baidu](https://pan.baidu.com/s/1GKUKyDytaKKeCBAerh-4IQ)(code:f857)] | [tada2d_16f.yaml](../../configs/projects/tada/ssv2/tada2d_16f.yaml) | 33 | 34 | # Citing TAda! 35 | If you find TAdaConv or TAda2D useful for your research, please consider citing the paper as follows: 36 | ```BibTeX 37 | @inproceedings{huang2021tada, 38 | title={TAda! Temporally-Adaptive Convolutions for Video Understanding}, 39 | author={Huang, Ziyuan and Zhang, Shiwei and Pan, Liang and Qing, Zhiwu and Tang, Mingqian and Liu, Ziwei and Ang Jr, Marcelo H}, 40 | booktitle={{ICLR}}, 41 | year={2022} 42 | } 43 | ``` -------------------------------------------------------------------------------- /projects/tada/TAda2D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alibaba-mmai-research/TAdaConv/75b7839b37fc94d98d4fe5f2aff4b3df4e347dfb/projects/tada/TAda2D.png -------------------------------------------------------------------------------- /projects/tadaconvv2/TAdaConvV2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alibaba-mmai-research/TAdaConv/75b7839b37fc94d98d4fe5f2aff4b3df4e347dfb/projects/tadaconvv2/TAdaConvV2.png -------------------------------------------------------------------------------- /runs/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 
3 | 4 | """Entry file for training, evaluating and testing a video model.""" 5 | 6 | import os 7 | import sys 8 | import time 9 | sys.path.append(os.path.abspath(os.curdir)) 10 | 11 | from tadaconv.utils.launcher import launch_task 12 | 13 | from test import test 14 | from train import train 15 | from test_epic_localization import test_epic_localization 16 | from submission_test import submission_test 17 | 18 | from tadaconv.utils.config import Config 19 | 20 | def _prepare_data(cfg): 21 | if cfg.TASK_TYPE in ['classification']: 22 | train_func = train 23 | test_func = test 24 | elif cfg.TASK_TYPE in ['localization']: 25 | train_func = train 26 | test_func = test_epic_localization 27 | elif cfg.TASK_TYPE in ["submission"]: 28 | cfg.TRAIN.ENABLE = False 29 | cfg.TEST.ENABLE = False 30 | train_func = None 31 | test_func = None 32 | submission_func = submission_test 33 | else: 34 | raise ValueError("unknown TASK_TYPE {}".format(cfg.TASK_TYPE)) 35 | 36 | run_list = [] 37 | if cfg.TRAIN.ENABLE: 38 | # Training process is performed by the entry function defined above. 39 | run_list.append([cfg.deep_copy(), train_func]) 40 | 41 | if cfg.TEST.ENABLE: 42 | # Test is performed by the entry function defined above. 43 | run_list.append([cfg.deep_copy(), test_func]) 44 | if cfg.TEST.AUTOMATIC_MULTI_SCALE_TEST: 45 | """ 46 | By default, test_func performs single view test. 47 | AUTOMATIC_MULTI_SCALE_TEST automatically performs multi-view test after the single view test. 48 | """ 49 | cfg.LOG_MODEL_INFO = False 50 | cfg.LOG_CONFIG_INFO = False 51 | 52 | cfg.TEST.NUM_ENSEMBLE_VIEWS = 10 53 | cfg.TEST.NUM_SPATIAL_CROPS = 1 54 | 55 | if "kinetics" in cfg.TEST.DATASET or "epickitchen" in cfg.TEST.DATASET: 56 | cfg.TEST.NUM_SPATIAL_CROPS = 3 57 | if "imagenet" in cfg.TEST.DATASET and not cfg.PRETRAIN.ENABLE: 58 | cfg.TEST.NUM_ENSEMBLE_VIEWS = 1 59 | cfg.TEST.NUM_SPATIAL_CROPS = 3 60 | if "ssv2" in cfg.TEST.DATASET: 61 | cfg.TEST.NUM_ENSEMBLE_VIEWS = 2 62 | cfg.TEST.NUM_SPATIAL_CROPS = 3 63 | cfg.TEST.LOG_FILE = "val_{}clipsx{}crops.log".format( 64 | cfg.TEST.NUM_ENSEMBLE_VIEWS, cfg.TEST.NUM_SPATIAL_CROPS 65 | ) 66 | run_list.append([cfg.deep_copy(), test_func]) 67 | 68 | if cfg.SUBMISSION.ENABLE: 69 | # currently only supports epic kitchen submission 70 | cfg.LOG_MODEL_INFO = False 71 | cfg.TEST.NUM_ENSEMBLE_VIEWS = 10 72 | cfg.TEST.NUM_SPATIAL_CROPS = 3 73 | 74 | cfg.TEST.LOG_FILE = "test_{}clipsx{}crops.log".format( 75 | cfg.TEST.NUM_ENSEMBLE_VIEWS, cfg.TEST.NUM_SPATIAL_CROPS 76 | ) 77 | run_list.append([cfg.deep_copy(), submission_func]) 78 | 79 | return run_list 80 | 81 | def main(): 82 | """ 83 | Entry function for spawning all the function processes. 
84 | """ 85 | cfg = Config(load=True) 86 | 87 | # get the list of configs and functions for running 88 | run_list = _prepare_data(cfg) 89 | 90 | for run in run_list: 91 | launch_task(cfg=run[0], init_method=run[0].get_args().init_method, func=run[1]) 92 | 93 | print("Finish running with config: {}".format(cfg.args.cfg_file)) 94 | 95 | 96 | if __name__ == "__main__": 97 | main() 98 | -------------------------------------------------------------------------------- /tadaconv/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alibaba-mmai-research/TAdaConv/75b7839b37fc94d98d4fe5f2aff4b3df4e347dfb/tadaconv/datasets/__init__.py -------------------------------------------------------------------------------- /tadaconv/datasets/base/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | from .ucf101 import Ucf101 5 | from .hmdb51 import Hmdb51 6 | from .kinetics400 import Kinetics400 7 | from .ssv2 import Ssv2 8 | from .imagenet import Imagenet 9 | from .epickitchen100_feature import Epickitchen100localization 10 | from .epickitchen100 import Epickitchen100 11 | -------------------------------------------------------------------------------- /tadaconv/datasets/base/builder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Builder for the dataloader.""" 5 | 6 | import itertools 7 | import numpy as np 8 | import torch 9 | import tadaconv.utils.misc as misc 10 | from tadaconv.utils.sampler import MultiFoldDistributedSampler 11 | from torch.utils.data._utils.collate import default_collate 12 | from torch.utils.data.distributed import DistributedSampler 13 | from torch.utils.data.sampler import RandomSampler 14 | from tadaconv.utils.val_dist_sampler import MultiSegValDistributedSampler 15 | from tadaconv.datasets.utils.collate_functions import COLLATE_FN_REGISTRY 16 | 17 | 18 | from tadaconv.utils.registry import Registry 19 | 20 | DATASET_REGISTRY = Registry("DATASET") 21 | 22 | def get_sampler(cfg, dataset, split, shuffle): 23 | """ 24 | Returns the sampler object for the dataset. 25 | Args: 26 | dataset (Dataset): constructed dataset. 27 | split (str): which split is the dataset for. 28 | shuffle (bool): whether or not to shuffle the dataset. 29 | Returns: 30 | sampler (Sampler): dataset sampler. 31 | """ 32 | if misc.get_num_gpus(cfg) > 1: 33 | if split == "train" and cfg.TRAIN.NUM_FOLDS > 1: 34 | return MultiFoldDistributedSampler( 35 | dataset, cfg.TRAIN.NUM_FOLDS 36 | ) 37 | elif cfg.USE_MULTISEG_VAL_DIST and cfg.TRAIN.ENABLE is False: 38 | return MultiSegValDistributedSampler(dataset, shuffle=False) 39 | else: 40 | return DistributedSampler( 41 | dataset, 42 | shuffle=shuffle 43 | ) 44 | else: 45 | return None 46 | 47 | def build_loader(cfg, split): 48 | """ 49 | Constructs the data loader for the given dataset. 50 | Args: 51 | cfg (Configs): global config object. details in utils/config.py 52 | split (str): the split of the data loader. Options include `train`, 53 | `val`, `test`, and `submission`. 54 | Returns: 55 | loader object. 
56 | """ 57 | assert split in ["train", "val", "test", "submission"] 58 | if split in ["train"]: 59 | dataset_name = cfg.TRAIN.DATASET 60 | batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 61 | shuffle = True 62 | drop_last = True 63 | elif split in ["val"]: 64 | dataset_name = cfg.TEST.DATASET 65 | batch_size = int(cfg.TEST.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 66 | shuffle = False 67 | drop_last = False 68 | elif split in ["test", "submission"]: 69 | dataset_name = cfg.TEST.DATASET 70 | batch_size = int(cfg.TEST.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 71 | shuffle = False 72 | drop_last = False 73 | 74 | # Construct the dataset 75 | dataset = build_dataset(dataset_name, cfg, split) 76 | 77 | # Create a sampler for multi-process training 78 | sampler = get_sampler(cfg, dataset, split, shuffle) 79 | # Create a loader 80 | if hasattr(cfg.DATA_LOADER, "COLLATE_FN") and cfg.DATA_LOADER.COLLATE_FN is not None: 81 | collate_fn = COLLATE_FN_REGISTRY.get(cfg.DATA_LOADER.COLLATE_FN)(cfg) 82 | else: 83 | collate_fn = None 84 | loader = torch.utils.data.DataLoader( 85 | dataset, 86 | batch_size=batch_size, 87 | shuffle=(False if sampler else shuffle), 88 | sampler=sampler, 89 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 90 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 91 | drop_last=drop_last, 92 | collate_fn=collate_fn 93 | ) 94 | return loader 95 | 96 | 97 | def shuffle_dataset(loader, cur_epoch): 98 | """" 99 | Shuffles the sampler for the dataset. 100 | Args: 101 | loader (loader): data loader to perform shuffle. 102 | cur_epoch (int): number of the current epoch. 103 | """ 104 | sampler = loader.sampler 105 | assert isinstance( 106 | sampler, (RandomSampler, DistributedSampler, MultiFoldDistributedSampler) 107 | ), "Sampler type '{}' not supported".format(type(sampler)) 108 | # RandomSampler handles shuffling automatically 109 | if isinstance(sampler, (DistributedSampler, MultiFoldDistributedSampler)): 110 | # DistributedSampler shuffles data based on epoch 111 | sampler.set_epoch(cur_epoch) 112 | 113 | def build_dataset(dataset_name, cfg, split): 114 | """ 115 | Builds a dataset according to the "dataset_name". 116 | Args: 117 | dataset_name (str): the name of the dataset to be constructed. 118 | cfg (Config): global config object. 119 | split (str): the split of the data loader. 120 | Returns: 121 | Dataset (Dataset): a dataset object constructed for the specified dataset_name. 122 | """ 123 | name = dataset_name.capitalize() 124 | return DATASET_REGISTRY.get(name)(cfg, split) 125 | -------------------------------------------------------------------------------- /tadaconv/datasets/base/epickitchen100.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Epic-Kitchens dataset. 
""" 5 | 6 | import os 7 | import random 8 | import torch 9 | import torch.utils.data 10 | import tadaconv.utils.logging as logging 11 | 12 | import time 13 | import oss2 as oss 14 | 15 | import tadaconv.utils.bucket as bu 16 | from tadaconv.datasets.base.builder import DATASET_REGISTRY 17 | from tadaconv.datasets.base.base_dataset import BaseVideoDataset 18 | 19 | logger = logging.get_logger(__name__) 20 | 21 | @DATASET_REGISTRY.register() 22 | class Epickitchen100(BaseVideoDataset): 23 | def __init__(self, cfg, split): 24 | super(Epickitchen100, self).__init__(cfg, split) 25 | if (self.split == "test" or self.split == "submission") and self.cfg.PRETRAIN.ENABLE == False: 26 | self._pre_transformation_config_required = True 27 | 28 | def _get_dataset_list_name(self): 29 | """ 30 | Returns the list for the dataset. 31 | Returns: 32 | dataset_list_name (str) 33 | """ 34 | if self.split == "train": 35 | if self.cfg.TRAIN.TRAIN_VAL_COMBINE: 36 | train_list = "train_val" 37 | else: 38 | train_list = "train" 39 | name = "EPIC_100_{}.csv".format( 40 | train_list if self.split == "train" else "validation" if not self.split == "submission" else "test_timestamps", 41 | ) 42 | logger.info("Reading video list from file: {}".format(name)) 43 | return name 44 | 45 | def _get_sample_info(self, index): 46 | """ 47 | Returns the sample info corresponding to the index. 48 | Args: 49 | index (int): target index 50 | Returns: 51 | sample_info (dict): contains different informations to be used later 52 | "name": the name of the video 53 | "path": the path of the video for the specified index 54 | "verb_class": verb label of the video 55 | "noun_class": noun label of the video 56 | """ 57 | if not self.split == "submission": 58 | video_name = self._samples[index][0] 59 | verb_class = self._samples[index][10] 60 | noun_class = self._samples[index][12] 61 | video_path = os.path.join(self.data_root_dir, video_name+".MP4") 62 | else: 63 | # if the split is submission, then no label is available 64 | # we simply set the verb class and the noun class to zero 65 | video_name = self._samples[index][0] 66 | verb_class = 0 67 | noun_class = 0 68 | video_path = os.path.join(self.data_root_dir, video_name+".MP4") 69 | 70 | if self.cfg.DATA.MULTI_LABEL or not hasattr(self.cfg.DATA, "TRAIN_VERSION"): 71 | supervised_label = { 72 | "verb_class": verb_class, 73 | "noun_class": noun_class 74 | } 75 | else: 76 | if self.cfg.DATA.TRAIN_VERSION == "only_train_verb": 77 | supervised_label = verb_class 78 | elif self.cfg.DATA.TRAIN_VERSION == "only_train_noun": 79 | supervised_label = noun_class 80 | 81 | sample_info = { 82 | "name": video_name, 83 | "path": video_path, 84 | "supervised_label": supervised_label 85 | } 86 | return sample_info 87 | 88 | def _pre_transformation_config(self): 89 | """ 90 | Set transformation parameters if required. 91 | """ 92 | self.resize_video.set_spatial_index(self.spatial_idx) 93 | 94 | def _custom_sampling(self, vid_length, vid_fps, clip_idx, num_clips, num_frames, interval=2, random_sample=True): 95 | pass # making python happy 96 | 97 | def _get_ssl_label(self): 98 | pass # making python happy -------------------------------------------------------------------------------- /tadaconv/datasets/base/hmdb51.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ HMDB51 dataset. 
""" 5 | 6 | import os 7 | import random 8 | import time 9 | 10 | import oss2 as oss 11 | import tadaconv.utils.bucket as bu 12 | import tadaconv.utils.logging as logging 13 | import torch 14 | import torch.utils.data 15 | from tadaconv.datasets.base.base_dataset import BaseVideoDataset 16 | from tadaconv.datasets.base.builder import DATASET_REGISTRY 17 | 18 | logger = logging.get_logger(__name__) 19 | 20 | @DATASET_REGISTRY.register() 21 | class Hmdb51(BaseVideoDataset): 22 | def __init__(self, cfg, split): 23 | super(Hmdb51, self).__init__(cfg, split) 24 | if self.split == "test" and self.cfg.PRETRAIN.ENABLE == False: 25 | self._pre_transformation_config_required = True 26 | 27 | 28 | def _get_dataset_list_name(self): 29 | """ 30 | Returns the list for the dataset. 31 | Returns: 32 | name (str): name of the list to be read 33 | """ 34 | name = "hmdb51_{}_list.txt".format( 35 | "train" if "train" in self.split else "test", 36 | ) 37 | logger.info("Reading video list from file: {}".format(name)) 38 | return name 39 | 40 | def _get_sample_info(self, index): 41 | """ 42 | Returns the sample info corresponding to the index. 43 | Args: 44 | index (int): target index 45 | Returns: 46 | sample_info (dict): contains different informations to be used later 47 | "path": indicating the target's path w.r.t. index 48 | "supervised_label": indicating the class of the target 49 | """ 50 | video_path, class_, = self._samples[index].strip().split(" ") 51 | class_ = int(class_) 52 | video_path = os.path.join(self.data_root_dir, video_path) 53 | sample_info = { 54 | "path": video_path, 55 | "supervised_label": class_, 56 | } 57 | return sample_info 58 | 59 | def _pre_transformation_config(self): 60 | """ 61 | Set transformation parameters if required. 62 | """ 63 | self.resize_video.set_spatial_index(self.spatial_idx) 64 | 65 | def _custom_sampling(self, vid_length, vid_fps, clip_idx, num_clips, num_frames, interval=2, random_sample=True): 66 | return self._interval_based_sampling(vid_length, vid_fps, clip_idx, num_clips, num_frames, interval) 67 | 68 | def _get_ssl_label(self): 69 | pass # making python happy 70 | -------------------------------------------------------------------------------- /tadaconv/datasets/base/kinetics400.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Kinetics400 dataset. """ 5 | 6 | import os 7 | import random 8 | import torch 9 | import torch.utils.data 10 | import tadaconv.utils.logging as logging 11 | 12 | import time 13 | import oss2 as oss 14 | 15 | import tadaconv.utils.bucket as bu 16 | from tadaconv.datasets.base.builder import DATASET_REGISTRY 17 | from tadaconv.datasets.base.base_dataset import BaseVideoDataset 18 | 19 | logger = logging.get_logger(__name__) 20 | 21 | @DATASET_REGISTRY.register() 22 | class Kinetics400(BaseVideoDataset): 23 | def __init__(self, cfg, split): 24 | super(Kinetics400, self).__init__(cfg, split) 25 | if self.split == "test" and self.cfg.PRETRAIN.ENABLE == False: 26 | self._pre_transformation_config_required = True 27 | 28 | def _get_dataset_list_name(self): 29 | """ 30 | Returns the list for the dataset. 
31 | Returns: 32 | name (str): name of the list to be read 33 | """ 34 | name = "kinetics400_{}_list.txt".format( 35 | self.split, 36 | ) 37 | logger.info("Reading video list from file: {}".format(name)) 38 | return name 39 | 40 | def _get_sample_info(self, index): 41 | """ 42 | Returns the sample info corresponding to the index. 43 | Args: 44 | index (int): target index 45 | Returns: 46 | sample_info (dict): contains different informations to be used later 47 | "path": indicating the target's path w.r.t. index 48 | "supervised_label": indicating the class of the target 49 | """ 50 | video_path, class_, = self._samples[index].strip().split(" ") 51 | class_ = int(class_) 52 | video_path = os.path.join(self.data_root_dir, video_path) 53 | sample_info = { 54 | "path": video_path, 55 | "supervised_label": class_, 56 | } 57 | return sample_info 58 | 59 | def _pre_transformation_config(self): 60 | """ 61 | Set transformation parameters if required. 62 | """ 63 | self.resize_video.set_spatial_index(self.spatial_idx) 64 | 65 | def _custom_sampling(self, vid_length, vid_fps, clip_idx, num_clips, num_frames, interval=2, random_sample=True): 66 | return self._interval_based_sampling(vid_length, vid_fps, clip_idx, num_clips, num_frames, interval) 67 | 68 | def _get_ssl_label(self): 69 | pass # making python happy -------------------------------------------------------------------------------- /tadaconv/datasets/base/ssv2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Something-Something-V2 dataset. """ 5 | 6 | import os 7 | import random 8 | import torch 9 | import torch.utils.data 10 | import tadaconv.utils.logging as logging 11 | 12 | import time 13 | import oss2 as oss 14 | 15 | import tadaconv.utils.bucket as bu 16 | from tadaconv.datasets.base.builder import DATASET_REGISTRY 17 | from tadaconv.datasets.base.base_dataset import BaseVideoDataset 18 | 19 | logger = logging.get_logger(__name__) 20 | 21 | @DATASET_REGISTRY.register() 22 | class Ssv2(BaseVideoDataset): 23 | def __init__(self, cfg, split): 24 | super(Ssv2, self).__init__(cfg, split) 25 | if self.split == "test" and self.cfg.PRETRAIN.ENABLE == False: 26 | self._pre_transformation_config_required = True 27 | 28 | def _get_dataset_list_name(self): 29 | """ 30 | Returns the list for the dataset. 31 | Returns: 32 | name (str): name of the list to be read 33 | """ 34 | name = "something-something-v2-{}-with-label.json".format( 35 | "train" if self.split == "train" else "validation", 36 | ) 37 | logger.info("Reading video list from file: {}".format(name)) 38 | return name 39 | 40 | def _get_sample_info(self, index): 41 | """ 42 | Returns the sample info corresponding to the index. 43 | Args: 44 | index (int): target index 45 | Returns: 46 | sample_info (dict): contains different informations to be used later 47 | "path": indicating the target's path w.r.t. index 48 | "supervised_label": indicating the class of the target 49 | """ 50 | class_ = self._samples[index]["label_idx"] 51 | video_path = os.path.join(self.data_root_dir, self._samples[index]["id"]+".mp4") 52 | sample_info = { 53 | "path": video_path, 54 | "supervised_label": class_, 55 | } 56 | return sample_info 57 | 58 | def _pre_transformation_config(self): 59 | """ 60 | Set transformation parameters if required. 
61 | """ 62 | self.resize_video.set_spatial_index(self.spatial_idx) 63 | 64 | def _custom_sampling(self, vid_length, vid_fps, clip_idx, num_clips, num_frames, interval=2, random_sample=True): 65 | pass # making python happy 66 | 67 | def _get_ssl_label(self): 68 | pass # making python happy -------------------------------------------------------------------------------- /tadaconv/datasets/base/ucf101.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ UCF101 dataset. """ 5 | 6 | import os 7 | import random 8 | import torch 9 | import torch.utils.data 10 | import tadaconv.utils.logging as logging 11 | 12 | import time 13 | import oss2 as oss 14 | 15 | import tadaconv.utils.bucket as bu 16 | from tadaconv.datasets.base.builder import DATASET_REGISTRY 17 | from tadaconv.datasets.base.base_dataset import BaseVideoDataset 18 | 19 | logger = logging.get_logger(__name__) 20 | 21 | @DATASET_REGISTRY.register() 22 | class Ucf101(BaseVideoDataset): 23 | def __init__(self, cfg, split): 24 | super(Ucf101, self).__init__(cfg, split) 25 | if self.split == "test" and self.cfg.PRETRAIN.ENABLE == False: 26 | self._pre_transformation_config_required = True 27 | 28 | def _get_dataset_list_name(self): 29 | """ 30 | Returns the list for the dataset. 31 | Returns: 32 | name (str): name of the list to be read 33 | """ 34 | name = "ucf101_{}_list.txt".format( 35 | "train" if "train" in self.split else "test", 36 | ) 37 | logger.info("Reading video list from file: {}".format(name)) 38 | return name 39 | 40 | def _get_sample_info(self, index): 41 | """ 42 | Returns the sample info corresponding to the index. 43 | Args: 44 | index (int): target index 45 | Returns: 46 | sample_info (dict): contains different informations to be used later 47 | "path": indicating the target's path w.r.t. index 48 | "supervised_label": indicating the class of the target 49 | """ 50 | video_path, class_, = self._samples[index].strip().split(" ") 51 | class_ = int(class_) 52 | video_path = os.path.join(self.data_root_dir, video_path) 53 | sample_info = { 54 | "path": video_path, 55 | "supervised_label": class_, 56 | } 57 | return sample_info 58 | 59 | def _pre_transformation_config(self): 60 | """ 61 | Set transformation parameters if required. 62 | """ 63 | self.resize_video.set_spatial_index(self.spatial_idx) 64 | 65 | def _custom_sampling(self, vid_length, vid_fps, clip_idx, num_clips, num_frames, interval=2, random_sample=True): 66 | return self._interval_based_sampling(vid_length, vid_fps, clip_idx, num_clips, num_frames, interval) 67 | 68 | def _get_ssl_label(self): 69 | pass # making python happy 70 | -------------------------------------------------------------------------------- /tadaconv/datasets/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alibaba-mmai-research/TAdaConv/75b7839b37fc94d98d4fe5f2aff4b3df4e347dfb/tadaconv/datasets/utils/__init__.py -------------------------------------------------------------------------------- /tadaconv/datasets/utils/collate_functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Collate functions. 
""" 5 | 6 | import random 7 | from tadaconv.utils.registry import Registry 8 | from torch.utils.data._utils.collate import default_collate 9 | import torch.nn.functional as F 10 | 11 | COLLATE_FN_REGISTRY = Registry() 12 | 13 | @COLLATE_FN_REGISTRY.register() 14 | class ZeroShotCollate(object): 15 | def __init__(self, cfg): 16 | self.cfg = cfg 17 | 18 | def __call__(self, batch): 19 | batch = default_collate(batch) 20 | batch[0]["text_embedding"] = batch[0]["text_embedding"][0].unsqueeze(0) 21 | return batch -------------------------------------------------------------------------------- /tadaconv/datasets/utils/preprocess_ssv2.py: -------------------------------------------------------------------------------- 1 | 2 | from email.policy import default 3 | import os 4 | import sys 5 | import json 6 | import tqdm 7 | import argparse 8 | 9 | 10 | # ---- config 11 | 12 | anno_path = "" # where you put your annotation files 13 | data_path = "" # where you put original webm videos 14 | data_out_path = "" # where to put the converted mp4 videos 15 | 16 | def main( 17 | anno_conversion, data_conversion, num_splits, split_id, split, 18 | anno_path, data_path, data_out_path 19 | ): 20 | 21 | # ---- anno conversion 22 | 23 | if anno_conversion: 24 | 25 | with open(os.path.join(anno_path, "something-something-v2-labels.json"), "r") as f: 26 | labels = json.load(f) 27 | 28 | print(f"Converting file: {os.path.join(anno_path, 'something-something-v2-train.json')}.") 29 | trainset_samples = [] 30 | with open(os.path.join(anno_path, "something-something-v2-train.json"), "r") as f: 31 | lines = json.load(f) 32 | for line in lines: 33 | line['label_idx'] = int(labels[line['template'].replace('[', '').replace(']', '') ]) 34 | trainset_samples.append(line) 35 | 36 | with open(os.path.join(anno_path, "something-something-v2-train-with-label.json"), "w") as f: 37 | json.dump(trainset_samples, f, indent=4) 38 | 39 | print(f"Converting file: {os.path.join(anno_path, 'something-something-v2-validation.json')}.") 40 | val_samples = [] 41 | with open(os.path.join(anno_path, "something-something-v2-validation.json"), "r") as f: 42 | lines = json.load(f) 43 | for line in lines: 44 | line['label_idx'] = int(labels[line['template'].replace('[', '').replace(']', '') ]) 45 | val_samples.append(line) 46 | 47 | 48 | with open(os.path.join(anno_path, "something-something-v2-validation-with-label.json"), "w") as f: 49 | json.dump(val_samples, f, indent=4) 50 | 51 | # ---- convert files 52 | 53 | if data_conversion: 54 | 55 | if not os.path.exists(data_out_path): 56 | os.mkdir(data_out_path) 57 | 58 | if not anno_conversion: 59 | print("Loading train samples") 60 | trainset_samples = [] 61 | with open(os.path.join(anno_path, "something-something-v2-train.json"), "r") as f: 62 | lines = json.load(f) 63 | for line in lines: 64 | trainset_samples.append(line) 65 | print("Loading val samples") 66 | val_samples = [] 67 | with open(os.path.join(anno_path, "something-something-v2-validation.json"), "r") as f: 68 | lines = json.load(f) 69 | for line in lines: 70 | val_samples.append(line) 71 | print(len(trainset_samples)) 72 | print(len(val_samples)) 73 | 74 | if split_id < num_splits-1: 75 | trainset_samples_torun = trainset_samples[ 76 | split_id * round(len(trainset_samples)/num_splits): (split_id+1) * round(len(trainset_samples)/num_splits) 77 | ] 78 | val_samples_torun = val_samples[ 79 | split_id * round(len(val_samples)/num_splits): (split_id+1) * round(len(val_samples)/num_splits) 80 | ] 81 | else: 82 | trainset_samples_torun = 
trainset_samples[ 83 | split_id * round(len(trainset_samples)/num_splits): 84 | ] 85 | val_samples_torun = val_samples[ 86 | split_id * round(len(val_samples)/num_splits): 87 | ] 88 | 89 | if split in ['all', 'train']: 90 | print("converting train samples") 91 | for i, sample in enumerate(tqdm.tqdm(trainset_samples_torun)): 92 | name = sample['id'] 93 | input_file = f'{name}.webm' 94 | output_file = f'{name}.mp4' 95 | cmd = f"ffmpeg -i {data_path}/{input_file} -vf 'pad=ceil(iw/2)*2:ceil(ih/2)*2' {data_out_path}/{output_file} -loglevel error -y" 96 | os.system(cmd) 97 | 98 | if split in ['all', 'val']: 99 | print("converting val samples") 100 | for i, sample in enumerate(tqdm.tqdm(val_samples_torun)): 101 | name = sample['id'] 102 | input_file = f'{name}.webm' 103 | output_file = f'{name}.mp4' 104 | cmd = f"ffmpeg -i {data_path}/{input_file} -vf 'pad=ceil(iw/2)*2:ceil(ih/2)*2' {data_out_path}/{output_file} -loglevel error -y" 105 | os.system(cmd) 106 | 107 | if __name__ == "__main__": 108 | parser = argparse.ArgumentParser(description='Process SSV2 annos and data.') 109 | parser.add_argument('--anno', action='store_true') 110 | parser.add_argument('--data', action='store_true') 111 | parser.add_argument('--num_splits', type=int, default=1) 112 | parser.add_argument('--split_id', type=int, default=0) 113 | parser.add_argument('--split', type=str, default="all") 114 | parser.add_argument('--anno_path', type=str, default=anno_path) 115 | parser.add_argument('--data_path', type=str, default=data_path) 116 | parser.add_argument('--data_out_path', type=str, default=data_out_path) 117 | args = parser.parse_args() 118 | main( 119 | args.anno, args.data, args.num_splits, args.split_id, args.split, 120 | args.anno_path, args.data_path, args.data_out_path 121 | ) -------------------------------------------------------------------------------- /tadaconv/datasets/utils/random_erasing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ 5 | Random erasing classes. 6 | This file is modified from https://github.com/rwightman/pytorch-image-models/blob/master/timm/data/random_erasing.py. 7 | """ 8 | 9 | import random 10 | import math 11 | import torch 12 | 13 | 14 | def _get_pixels(per_pixel, rand_color, patch_size, dtype=torch.float32, device='cuda'): 15 | # NOTE I've seen CUDA illegal memory access errors being caused by the normal_() 16 | # paths, flip the order so normal is run on CPU if this becomes a problem 17 | # Issue has been fixed in master https://github.com/pytorch/pytorch/issues/19508 18 | if per_pixel: 19 | return torch.empty(patch_size, dtype=dtype, device=device).normal_() 20 | elif rand_color: 21 | return torch.empty((patch_size[0], 1, 1, 1), dtype=dtype, device=device).normal_() 22 | else: 23 | return torch.zeros((patch_size[0], 1, 1, 1), dtype=dtype, device=device) 24 | 25 | 26 | class RandomErasing: 27 | """ Randomly selects a rectangle region in an image and erases its pixels. 28 | 'Random Erasing Data Augmentation' by Zhong et al. 29 | See https://arxiv.org/pdf/1708.04896.pdf 30 | 31 | This variant of RandomErasing is intended to be applied to either a batch 32 | or single image tensor after it has been normalized by dataset mean and std. 33 | Args: 34 | probability: Probability that the Random Erasing operation will be performed. 35 | min_area: Minimum percentage of erased area wrt input image area. 
36 | max_area: Maximum percentage of erased area wrt input image area. 37 | min_aspect: Minimum aspect ratio of erased area. 38 | mode: pixel color mode, one of 'const', 'rand', or 'pixel' 39 | 'const' - erase block is constant color of 0 for all channels 40 | 'rand' - erase block is same per-channel random (normal) color 41 | 'pixel' - erase block is per-pixel random (normal) color 42 | max_count: maximum number of erasing blocks per image, area per box is scaled by count. 43 | per-image count is randomly chosen between 1 and this value. 44 | """ 45 | 46 | def __init__(self, cfg,): 47 | """ 48 | Args: 49 | cfg (Config): global config object. 50 | """ 51 | self.enable = cfg.AUGMENTATION.RANDOM_ERASING.ENABLE 52 | self.probability = cfg.AUGMENTATION.RANDOM_ERASING.PROB 53 | self.min_area, self.max_area = cfg.AUGMENTATION.RANDOM_ERASING.AREA_RANGE 54 | 55 | min_aspect = cfg.AUGMENTATION.RANDOM_ERASING.MIN_ASPECT 56 | max_aspect = 1 / min_aspect 57 | self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) 58 | 59 | self.min_count, self.max_count = cfg.AUGMENTATION.RANDOM_ERASING.COUNT 60 | self.num_splits = cfg.AUGMENTATION.RANDOM_ERASING.NUM_SPLITS 61 | mode = cfg.AUGMENTATION.RANDOM_ERASING.MODE.lower() 62 | self.rand_color = False 63 | self.per_pixel = False 64 | if mode == 'rand': 65 | self.rand_color = True # per block random normal 66 | elif mode == 'pixel': 67 | self.per_pixel = True # per pixel random normal 68 | else: 69 | assert not mode or mode == 'const' 70 | 71 | def _erase(self, img, chan, num_frames, img_h, img_w, dtype): 72 | if random.random() > self.probability: 73 | return 74 | area = img_h * img_w 75 | count = self.min_count if self.min_count == self.max_count else \ 76 | random.randint(self.min_count, self.max_count) 77 | for _ in range(count): 78 | for attempt in range(10): 79 | target_area = random.uniform(self.min_area, self.max_area) * area / count 80 | aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) 81 | h = int(round(math.sqrt(target_area * aspect_ratio))) 82 | w = int(round(math.sqrt(target_area / aspect_ratio))) 83 | if w < img_w and h < img_h: 84 | top = random.randint(0, img_h - h) 85 | left = random.randint(0, img_w - w) 86 | img[:, :, top:top + h, left:left + w] = _get_pixels( 87 | self.per_pixel, self.rand_color, (chan, num_frames, h, w), 88 | dtype=dtype, device=img.device) 89 | break 90 | 91 | def __call__(self, input): 92 | if self.enable: 93 | if len(input.size()) == 4: 94 | self._erase(input, *input.size(), input.dtype) 95 | else: 96 | batch_size, chan, num_frames, img_h, img_w = input.size() 97 | # skip first slice of batch if num_splits is set (for clean portion of samples) 98 | batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 99 | for i in range(batch_start, batch_size): 100 | self._erase(input[i], chan, num_frames, img_h, img_w, input.dtype) 101 | return input -------------------------------------------------------------------------------- /tadaconv/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alibaba-mmai-research/TAdaConv/75b7839b37fc94d98d4fe5f2aff4b3df4e347dfb/tadaconv/models/__init__.py -------------------------------------------------------------------------------- /tadaconv/models/base/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 
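# Note: importing the module zoo and the base modules below is what populates the
# registries (e.g. BRANCH_REGISTRY, STEM_REGISTRY, HEAD_REGISTRY), since each module
# registers itself at import time via the corresponding register() decorator.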
3 | 4 | import tadaconv.models.module_zoo 5 | from tadaconv.models.base.base_blocks import BaseHead, Base3DStem 6 | import tadaconv.models.base.transformer 7 | import tadaconv.models.base.slowfast -------------------------------------------------------------------------------- /tadaconv/models/base/builder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Builder for video models. """ 5 | 6 | import sys 7 | import torch 8 | import torch.nn as nn 9 | 10 | import traceback 11 | 12 | import tadaconv.utils.logging as logging 13 | 14 | from tadaconv.models.base.models import BaseVideoModel, MODEL_REGISTRY 15 | from tadaconv.models.utils.model_ema import ModelEmaV2 16 | 17 | logger = logging.get_logger(__name__) 18 | 19 | def build_model(cfg, gpu_id=None): 20 | """ 21 | Builds the video model. 22 | Args: 23 | cfg (Config): global config object that provides specifics to construct the model. 24 | gpu_id (Optional[int]): specify the gpu index to build model. 25 | Returns: 26 | model: constructed model 27 | model_ema: copied model for ema 28 | """ 29 | # Construct the model 30 | if MODEL_REGISTRY.get(cfg.MODEL.NAME) == None: 31 | # attempt to find standard models 32 | model = BaseVideoModel(cfg) 33 | else: 34 | # if the model is explicitly defined, 35 | # it is directly constructed from the model pool 36 | model = MODEL_REGISTRY.get(cfg.MODEL.NAME)(cfg) 37 | 38 | if torch.cuda.is_available(): 39 | assert ( 40 | cfg.NUM_GPUS <= torch.cuda.device_count() 41 | ), "Cannot use more GPU devices than available" 42 | else: 43 | assert ( 44 | cfg.NUM_GPUS == 0 45 | ), "Cuda is not available. Please set `NUM_GPUS: 0 for running on CPUs." 46 | 47 | if cfg.NUM_GPUS: 48 | if gpu_id is None: 49 | # Determine the GPU used by the current process 50 | cur_device = torch.cuda.current_device() 51 | else: 52 | cur_device = gpu_id 53 | model = model.cuda(device=cur_device) 54 | 55 | model_ema = None 56 | if cfg.MODEL.EMA.ENABLE: 57 | model_ema = ModelEmaV2(model, decay=cfg.MODEL.EMA.DECAY) 58 | 59 | try: 60 | # convert batchnorm to be synchronized across 61 | # different GPUs if needed 62 | sync_bn = cfg.BN.SYNC 63 | if sync_bn == True and cfg.NUM_GPUS * cfg.NUM_SHARDS > 1: 64 | model = nn.SyncBatchNorm.convert_sync_batchnorm(model) 65 | except: 66 | sync_bn = None 67 | 68 | # Use multi-process data parallel model in the multi-gpu setting 69 | if cfg.NUM_GPUS*cfg.NUM_SHARDS > 1: 70 | # Make model replica operate on the current device 71 | if cfg.PAI: 72 | # Support distributed training on the cluster 73 | model = torch.nn.parallel.DistributedDataParallel( 74 | module=model 75 | ) 76 | else: 77 | model = torch.nn.parallel.DistributedDataParallel( 78 | module=model, device_ids=[cur_device], output_device=cur_device 79 | ) 80 | 81 | return model, model_ema -------------------------------------------------------------------------------- /tadaconv/models/base/models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | import torch 5 | import torch.nn as nn 6 | from tadaconv.utils.registry import Registry 7 | from tadaconv.models.base.backbone import BACKBONE_REGISTRY 8 | from tadaconv.models.base.base_blocks import HEAD_REGISTRY 9 | 10 | MODEL_REGISTRY = Registry("Model") 11 | 12 | class BaseVideoModel(nn.Module): 13 | """ 14 | Standard video model. 
15 | The model is divided into the backbone and the head, where the backbone 16 | extracts features and the head performs classification. 17 | 18 | The backbones can be defined in model/base/backbone.py or anywhere else 19 | as long as the backbone is registered by the BACKBONE_REGISTRY. 20 | The heads can be defined in model/module_zoo/heads/ or anywhere else 21 | as long as the head is registered by the HEAD_REGISTRY. 22 | 23 | The registries automatically finds the registered modules and construct 24 | the base video model. 25 | """ 26 | def __init__(self, cfg): 27 | """ 28 | Args: 29 | cfg (Config): global config object. 30 | """ 31 | super(BaseVideoModel, self).__init__() 32 | self.cfg = cfg 33 | 34 | # the backbone is created according to meta-architectures 35 | # defined in models/base/backbone.py 36 | self.backbone = BACKBONE_REGISTRY.get(cfg.VIDEO.BACKBONE.META_ARCH)(cfg=cfg) 37 | 38 | # the head is created according to the heads 39 | # defined in models/module_zoo/heads 40 | self.head = HEAD_REGISTRY.get(cfg.VIDEO.HEAD.NAME)(cfg=cfg) 41 | 42 | def forward(self, x): 43 | x = self.backbone(x) 44 | x = self.head(x) 45 | return x 46 | 47 | def train(self, mode=True): 48 | r"""Sets the module in training mode. 49 | 50 | This has any effect only on certain modules. See documentations of 51 | particular modules for details of their behaviors in training/evaluation 52 | mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, 53 | etc. 54 | 55 | Args: 56 | mode (bool): whether to set training mode (``True``) or evaluation 57 | mode (``False``). Default: ``True``. 58 | 59 | Returns: 60 | Module: self 61 | """ 62 | self.training = mode 63 | super(BaseVideoModel, self).train(mode) 64 | for module in self.modules(): 65 | if isinstance(module, (nn.BatchNorm2d, nn.BatchNorm3d, nn.LayerNorm)) and self.cfg.BN.FREEZE: 66 | module.train(False) 67 | return self 68 | 69 | @MODEL_REGISTRY.register() 70 | class MoSINet(BaseVideoModel): 71 | def __init__(self, cfg): 72 | super(MoSINet, self).__init__(cfg) 73 | 74 | def forward(self, x): 75 | if isinstance(x, dict): 76 | x_data = x["video"] 77 | else: 78 | x_data = x 79 | b, n, c, t, h, w = x_data.shape 80 | x_data = x_data.reshape(b*n, c, t, h, w) 81 | res, logits = super(MoSINet, self).forward(x_data) 82 | pred = {} 83 | if isinstance(res, dict): 84 | for k, v in res.items(): 85 | pred[k] = v 86 | else: 87 | pred["move_joint"] = res 88 | return pred, logits -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | from tadaconv.models.module_zoo.heads import * 5 | from tadaconv.models.module_zoo.stems import * 6 | from tadaconv.models.module_zoo.branches import * 7 | from tadaconv.models.module_zoo.ops import * -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/branches/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 
3 | 4 | from tadaconv.models.module_zoo.branches.r2plus1d_branch import R2Plus1DBranch 5 | from tadaconv.models.module_zoo.branches.r2d3d_branch import R2D3DBranch 6 | from tadaconv.models.module_zoo.branches.csn_branch import CSNBranch 7 | from tadaconv.models.module_zoo.branches.slowfast_branch import SlowfastBranch 8 | from tadaconv.models.module_zoo.branches.s3dg_branch import STConv3d 9 | from tadaconv.models.module_zoo.branches.non_local import NonLocal 10 | from tadaconv.models.module_zoo.branches.tada_branch import TAda2DBlock 11 | from tadaconv.models.module_zoo.branches.tadaformer import TAdaFormerBlock 12 | from tadaconv.models.module_zoo.branches.tadaconvnextv2 import TAdaConvNeXtV2Block -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/branches/csn_branch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ CSN Branch. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from tadaconv.models.base.base_blocks import BaseBranch, Base3DStem, BaseHead 10 | from tadaconv.models.base.base_blocks import BRANCH_REGISTRY 11 | 12 | @BRANCH_REGISTRY.register() 13 | class CSNBranch(BaseBranch): 14 | """ 15 | The ir-CSN branch. 16 | 17 | See Du Tran et al. 18 | Video Classification with Channel-Separated Convolutional Networks. 19 | """ 20 | def __init__(self, cfg, block_idx): 21 | """ 22 | Args: 23 | cfg (Config): global config object. 24 | block_idx (list): list of [stage_id, block_id], both starting from 0. 25 | """ 26 | super(CSNBranch, self).__init__(cfg, block_idx) 27 | 28 | def _construct_bottleneck(self): 29 | self.a = nn.Conv3d( 30 | in_channels = self.dim_in, 31 | out_channels = self.num_filters//self.expansion_ratio, 32 | kernel_size = 1, 33 | stride = 1, 34 | padding = 0, 35 | bias = False 36 | ) 37 | self.a_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 38 | self.a_relu = nn.ReLU(inplace=True) 39 | 40 | self.b = nn.Conv3d( 41 | in_channels = self.num_filters//self.expansion_ratio, 42 | out_channels = self.num_filters//self.expansion_ratio, 43 | kernel_size = self.kernel_size, 44 | stride = self.stride, 45 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 46 | bias = False, 47 | groups = self.num_filters//self.expansion_ratio, 48 | ) 49 | self.b_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 50 | self.b_relu = nn.ReLU(inplace=True) 51 | 52 | self.c = nn.Conv3d( 53 | in_channels = self.num_filters//self.expansion_ratio, 54 | out_channels = self.num_filters, 55 | kernel_size = 1, 56 | stride = 1, 57 | padding = 0, 58 | bias = False 59 | ) 60 | self.c_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 61 | 62 | def forward(self, x): 63 | if self.transformation == 'bottleneck': 64 | x = self.a(x) 65 | x = self.a_bn(x) 66 | x = self.a_relu(x) 67 | 68 | x = self.b(x) 69 | x = self.b_bn(x) 70 | x = self.b_relu(x) 71 | 72 | x = self.c(x) 73 | x = self.c_bn(x) 74 | return x 75 | -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/branches/non_local.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ NonLocal block. 
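    Given an input of shape (N, C, T, H, W), the block projects it to query, key and
    value with 1x1x1 convolutions (to C/2 channels), computes scaled dot-product
    attention over all T*H*W positions, re-projects the aggregated values back to the
    input channel dimension, and adds the result to the input as a residual connection.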
""" 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from tadaconv.models.base.base_blocks import BaseBranch, BRANCH_REGISTRY 10 | 11 | @BRANCH_REGISTRY.register() 12 | class NonLocal(BaseBranch): 13 | """ 14 | Non-local block. 15 | 16 | See Xiaolong Wang et al. 17 | Non-local Neural Networks. 18 | """ 19 | 20 | def __init__(self, cfg, block_idx): 21 | super(NonLocal, self).__init__(cfg, block_idx) 22 | 23 | self.dim_middle = self.dim_in // 2 24 | 25 | self.qconv = nn.Conv3d( 26 | self.dim_in, 27 | self.dim_middle, 28 | kernel_size=1, 29 | stride=1, 30 | padding=0 31 | ) 32 | 33 | self.kconv = nn.Conv3d( 34 | self.dim_in, 35 | self.dim_middle, 36 | kernel_size=1, 37 | stride=1, 38 | padding=0 39 | ) 40 | 41 | self.vconv = nn.Conv3d( 42 | self.dim_in, 43 | self.dim_middle, 44 | kernel_size=1, 45 | stride=1, 46 | padding=0 47 | ) 48 | 49 | self.out_conv = nn.Conv3d( 50 | self.dim_middle, 51 | self.num_filters, 52 | kernel_size=1, 53 | stride=1, 54 | padding=0, 55 | ) 56 | self.out_bn = nn.BatchNorm3d(self.num_filters, eps=1e-5, momentum=self.bn_mmt) 57 | 58 | def forward(self, x): 59 | n,c,t,h,w = x.shape 60 | 61 | query = self.qconv(x).view(n, self.dim_middle, -1) 62 | key = self.kconv(x).view(n, self.dim_middle, -1) 63 | value = self.vconv(x).view(n, self.dim_middle, -1) 64 | 65 | attn = torch.einsum("nct,ncp->ntp", (query, key)) 66 | attn = attn * (self.dim_middle ** -0.5) 67 | attn = F.softmax(attn, dim=2) 68 | 69 | out = torch.einsum("ntg,ncg->nct", (attn, value)) 70 | out = out.view(n, self.dim_middle, t, h, w) 71 | out = self.out_conv(out) 72 | out = self.out_bn(out) 73 | return x + out 74 | 75 | 76 | -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/branches/r2d3d_branch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ R2D3D branch. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from tadaconv.models.base.base_blocks import BaseBranch, BaseHead 10 | from tadaconv.models.base.base_blocks import BRANCH_REGISTRY 11 | 12 | @BRANCH_REGISTRY.register() 13 | class R2D3DBranch(BaseBranch): 14 | """ 15 | The R2D3D Branch. 16 | 17 | Essentially the MCx model in 18 | Du Tran et al. 19 | A Closer Look at Spatiotemporal Convoluitions for Action Recognition. 20 | 21 | The model is used in DPC, MemDPC for self-supervised video 22 | representation learning. 23 | """ 24 | def __init__(self, cfg, block_idx): 25 | """ 26 | Args: 27 | cfg (Config): global config object. 28 | block_idx (list): list of [stage_id, block_id], both starting from 0. 
29 | """ 30 | super(R2D3DBranch, self).__init__(cfg, block_idx) 31 | 32 | def _construct_simple_block(self): 33 | self.a = nn.Conv3d( 34 | in_channels = self.dim_in, 35 | out_channels = self.num_filters, 36 | kernel_size = self.kernel_size, 37 | stride = self.stride, 38 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 39 | bias = False 40 | ) 41 | self.a_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 42 | self.a_relu = nn.ReLU(inplace=True) 43 | 44 | self.b = nn.Conv3d( 45 | in_channels = self.num_filters, 46 | out_channels = self.num_filters, 47 | kernel_size = self.kernel_size, 48 | stride = 1, 49 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 50 | bias = False 51 | ) 52 | self.b_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 53 | 54 | def _construct_bottleneck(self): 55 | self.a = nn.Conv3d( 56 | in_channels = self.dim_in, 57 | out_channels = self.num_filters//self.expansion_ratio, 58 | kernel_size = 1, 59 | stride = 1, 60 | padding = 0, 61 | bias = False 62 | ) 63 | self.a_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 64 | self.a_relu = nn.ReLU(inplace=True) 65 | 66 | self.b = nn.Conv3d( 67 | in_channels = self.num_filters//self.expansion_ratio, 68 | out_channels = self.num_filters//self.expansion_ratio, 69 | kernel_size = self.kernel_size, 70 | stride = self.stride, 71 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 72 | bias = False 73 | ) 74 | self.b_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 75 | self.b_relu = nn.ReLU(inplace=True) 76 | 77 | self.c = nn.Conv3d( 78 | in_channels = self.num_filters//self.expansion_ratio, 79 | out_channels = self.num_filters, 80 | kernel_size = 1, 81 | stride = 1, 82 | padding = 0, 83 | bias = False 84 | ) 85 | self.c_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 86 | 87 | def forward(self, x): 88 | if self.transformation == 'simple_block': 89 | x = self.a(x) 90 | x = self.a_bn(x) 91 | x = self.a_relu(x) 92 | 93 | x = self.b(x) 94 | x = self.b_bn(x) 95 | return x 96 | elif self.transformation == 'bottleneck': 97 | x = self.a(x) 98 | x = self.a_bn(x) 99 | x = self.a_relu(x) 100 | 101 | x = self.b(x) 102 | x = self.b_bn(x) 103 | x = self.b_relu(x) 104 | 105 | x = self.c(x) 106 | x = self.c_bn(x) 107 | return x 108 | 109 | -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/branches/s3dg_branch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ S3D/S3DG branch. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from tadaconv.models.base.base_blocks import ( 10 | BRANCH_REGISTRY, InceptionBaseConv3D 11 | ) 12 | 13 | class InceptionBlock3D(nn.Module): 14 | """ 15 | Element constructing the S3D/S3DG. 16 | See models/base/backbone.py L99-186. 17 | 18 | Modifed from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. 
19 | """ 20 | def __init__(self, cfg, in_planes, out_planes): 21 | super(InceptionBlock3D, self).__init__() 22 | 23 | _gating = cfg.VIDEO.BACKBONE.BRANCH.GATING 24 | 25 | assert len(out_planes) == 6 26 | assert isinstance(out_planes, list) 27 | 28 | [num_out_0_0a, 29 | num_out_1_0a, num_out_1_0b, 30 | num_out_2_0a, num_out_2_0b, 31 | num_out_3_0b] = out_planes 32 | 33 | self.branch0 = nn.Sequential( 34 | InceptionBaseConv3D(cfg, in_planes, num_out_0_0a, kernel_size=1, stride=1), 35 | ) 36 | self.branch1 = nn.Sequential( 37 | InceptionBaseConv3D(cfg, in_planes, num_out_1_0a, kernel_size=1, stride=1), 38 | BRANCH_REGISTRY.get(cfg.VIDEO.BACKBONE.BRANCH.NAME)(cfg, num_out_1_0a, num_out_1_0b, kernel_size=3, stride=1, padding=1), 39 | ) 40 | self.branch2 = nn.Sequential( 41 | InceptionBaseConv3D(cfg, in_planes, num_out_2_0a, kernel_size=1, stride=1), 42 | BRANCH_REGISTRY.get(cfg.VIDEO.BACKBONE.BRANCH.NAME)(cfg, num_out_2_0a, num_out_2_0b, kernel_size=3, stride=1, padding=1), 43 | ) 44 | self.branch3 = nn.Sequential( 45 | nn.MaxPool3d(kernel_size=(3, 3, 3), stride=1, padding=1), 46 | InceptionBaseConv3D(cfg, in_planes, num_out_3_0b, kernel_size=1, stride=1), 47 | ) 48 | 49 | self.out_channels = sum([num_out_0_0a, num_out_1_0b, num_out_2_0b, num_out_3_0b]) 50 | 51 | self.gating = _gating 52 | if _gating: 53 | self.gating_b0 = SelfGating(num_out_0_0a) 54 | self.gating_b1 = SelfGating(num_out_1_0b) 55 | self.gating_b2 = SelfGating(num_out_2_0b) 56 | self.gating_b3 = SelfGating(num_out_3_0b) 57 | 58 | 59 | def forward(self, x): 60 | x0 = self.branch0(x) 61 | x1 = self.branch1(x) 62 | x2 = self.branch2(x) 63 | x3 = self.branch3(x) 64 | if self.gating: 65 | x0 = self.gating_b0(x0) 66 | x1 = self.gating_b1(x1) 67 | x2 = self.gating_b2(x2) 68 | x3 = self.gating_b3(x3) 69 | 70 | out = torch.cat((x0, x1, x2, x3), 1) 71 | 72 | return out 73 | 74 | class SelfGating(nn.Module): 75 | def __init__(self, input_dim): 76 | super(SelfGating, self).__init__() 77 | self.fc = nn.Linear(input_dim, input_dim) 78 | 79 | def forward(self, input_tensor): 80 | """Feature gating as used in S3D-G""" 81 | spatiotemporal_average = torch.mean(input_tensor, dim=[2, 3, 4]) 82 | weights = self.fc(spatiotemporal_average) 83 | weights = torch.sigmoid(weights) 84 | return weights[:, :, None, None, None] * input_tensor 85 | 86 | @BRANCH_REGISTRY.register() 87 | class STConv3d(nn.Module): 88 | """ 89 | Element constructing the S3D/S3DG. 90 | See models/base/backbone.py L99-186. 91 | 92 | Modifed from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. 
93 | """ 94 | def __init__(self,cfg,in_planes,out_planes,kernel_size,stride,padding=0): 95 | super(STConv3d, self).__init__() 96 | if isinstance(stride, tuple): 97 | t_stride = stride[0] 98 | stride = stride[-1] 99 | else: # int 100 | t_stride = stride 101 | 102 | self.bn_mmt = cfg.BN.MOMENTUM 103 | self.bn_eps = cfg.BN.EPS 104 | self._construct_branch( 105 | cfg, 106 | in_planes, 107 | out_planes, 108 | kernel_size, 109 | stride, 110 | t_stride, 111 | padding 112 | ) 113 | 114 | def _construct_branch( 115 | self, 116 | cfg, 117 | in_planes, 118 | out_planes, 119 | kernel_size, 120 | stride, 121 | t_stride, 122 | padding=0 123 | ): 124 | self.conv1 = nn.Conv3d(in_planes, out_planes, kernel_size=(1,kernel_size,kernel_size), 125 | stride=(1,stride,stride),padding=(0,padding,padding), bias=False) 126 | self.conv2 = nn.Conv3d(out_planes,out_planes,kernel_size=(kernel_size,1,1), 127 | stride=(t_stride,1,1),padding=(padding,0,0), bias=False) 128 | 129 | self.bn1=nn.BatchNorm3d(out_planes, eps=self.bn_eps, momentum=self.bn_mmt) 130 | self.bn2=nn.BatchNorm3d(out_planes, eps=self.bn_eps, momentum=self.bn_mmt) 131 | self.relu = nn.ReLU(inplace=True) 132 | 133 | # init 134 | self.conv1.weight.data.normal_(mean=0, std=0.01) # original s3d is truncated normal within 2 std 135 | self.conv2.weight.data.normal_(mean=0, std=0.01) # original s3d is truncated normal within 2 std 136 | self.bn1.weight.data.fill_(1) 137 | self.bn1.bias.data.zero_() 138 | self.bn2.weight.data.fill_(1) 139 | self.bn2.bias.data.zero_() 140 | 141 | def forward(self,x): 142 | x=self.conv1(x) 143 | x=self.bn1(x) 144 | x=self.relu(x) 145 | x=self.conv2(x) 146 | x=self.bn2(x) 147 | x=self.relu(x) 148 | return x 149 | 150 | 151 | -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/branches/slowfast_branch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ SlowFast architectures. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from tadaconv.models.base.base_blocks import BaseBranch 10 | from tadaconv.models.base.base_blocks import BRANCH_REGISTRY 11 | from tadaconv.models.utils.init_helper import _init_convnet_weights 12 | 13 | @BRANCH_REGISTRY.register() 14 | class SlowfastBranch(BaseBranch): 15 | """ 16 | Constructs SlowFast conv branch. 17 | 18 | See Christoph Feichtenhofer et al. 19 | SlowFast Networks for Video Recognition. 
20 | """ 21 | def __init__(self, cfg, block_idx): 22 | super(SlowfastBranch, self).__init__(cfg, block_idx) 23 | 24 | def _construct_simple_block(self): 25 | self.a = nn.Conv3d( 26 | in_channels = self.dim_in, 27 | out_channels = self.num_filters, 28 | kernel_size = self.kernel_size, 29 | stride = self.stride, 30 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 31 | bias = False 32 | ) 33 | self.a_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 34 | self.a_relu = nn.ReLU(inplace=True) 35 | 36 | self.b = nn.Conv3d( 37 | in_channels = self.num_filters, 38 | out_channels = self.num_filters, 39 | kernel_size = self.kernel_size, 40 | stride = 1, 41 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 42 | bias = False 43 | ) 44 | self.b_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 45 | self.b_bn.transform_final_bn = True 46 | 47 | def _construct_bottleneck(self): 48 | self.a = nn.Conv3d( 49 | in_channels = self.dim_in, 50 | out_channels = self.num_filters//self.expansion_ratio, 51 | kernel_size = [3, 1, 1] if self.cfg.VIDEO.BACKBONE.TEMPORAL_CONV_BOTTLENECK[self.stage_id] else 1, 52 | stride = 1, 53 | padding = [1, 0, 0] if self.cfg.VIDEO.BACKBONE.TEMPORAL_CONV_BOTTLENECK[self.stage_id] else 0, 54 | bias = False 55 | ) 56 | self.a_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 57 | self.a_relu = nn.ReLU(inplace=True) 58 | 59 | self.b = nn.Conv3d( 60 | in_channels = self.num_filters//self.expansion_ratio, 61 | out_channels = self.num_filters//self.expansion_ratio, 62 | kernel_size = self.kernel_size, 63 | stride = self.stride, 64 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 65 | bias = False 66 | ) 67 | self.b_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 68 | self.b_relu = nn.ReLU(inplace=True) 69 | 70 | self.c = nn.Conv3d( 71 | in_channels = self.num_filters//self.expansion_ratio, 72 | out_channels = self.num_filters, 73 | kernel_size = 1, 74 | stride = 1, 75 | padding = 0, 76 | bias = False 77 | ) 78 | self.c_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 79 | self.c_bn.transform_final_bn = True 80 | 81 | def forward(self, x): 82 | if self.transformation == 'simple_block': 83 | x = self.a(x) 84 | x = self.a_bn(x) 85 | x = self.a_relu(x) 86 | 87 | x = self.b(x) 88 | x = self.b_bn(x) 89 | return x 90 | elif self.transformation == 'bottleneck': 91 | x = self.a(x) 92 | x = self.a_bn(x) 93 | x = self.a_relu(x) 94 | 95 | x = self.b(x) 96 | x = self.b_bn(x) 97 | x = self.b_relu(x) 98 | 99 | x = self.c(x) 100 | x = self.c_bn(x) 101 | return x -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/branches/tada_branch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ TAda Branch. 
""" 5 | 6 | import math 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.nn.modules.utils import _triple 11 | 12 | from tadaconv.models.base.base_blocks import BaseBranch, Base3DStem, BaseHead 13 | from tadaconv.models.base.base_blocks import BRANCH_REGISTRY 14 | from tadaconv.models.module_zoo.ops.tadaconv import RouteFuncMLP, TAdaConv2d 15 | 16 | @BRANCH_REGISTRY.register() 17 | class TAda2DBlock(BaseBranch): 18 | """ 19 | The TAdaConv branch with average pooling as the feature aggregation scheme. 20 | 21 | For details, see 22 | Ziyuan Huang, Shiwei Zhang, Liang Pan, Zhiwu Qing, Mingqian Tang, Ziwei Liu, and Marcelo H. Ang Jr. 23 | "TAda! Temporally-Adaptive Convolutions for Video Understanding." 24 | 25 | """ 26 | def __init__(self, cfg, block_idx): 27 | super(TAda2DBlock, self).__init__(cfg, block_idx, construct_branch=False) 28 | 29 | self._construct_branch() 30 | 31 | def _construct_bottleneck(self): 32 | self.a = nn.Conv3d( 33 | in_channels = self.dim_in, 34 | out_channels = self.num_filters//self.expansion_ratio, 35 | kernel_size = 1, 36 | stride = 1, 37 | padding = 0, 38 | bias = False 39 | ) 40 | self.a_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 41 | self.a_relu = nn.ReLU(inplace=True) 42 | 43 | self.b = TAdaConv2d( 44 | in_channels = self.num_filters//self.expansion_ratio, 45 | out_channels = self.num_filters//self.expansion_ratio, 46 | kernel_size = [1, self.kernel_size[1], self.kernel_size[2]], 47 | stride = [1, self.stride[1], self.stride[2]], 48 | padding = [0, self.kernel_size[1]//2, self.kernel_size[2]//2], 49 | bias = False 50 | ) 51 | self.b_rf = RouteFuncMLP( 52 | c_in=self.num_filters//self.expansion_ratio, 53 | ratio=self.cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_R, 54 | kernels=self.cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_K, 55 | ) 56 | self.b_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 57 | 58 | self.b_avgpool = nn.AvgPool3d( 59 | kernel_size=[ 60 | self.cfg.VIDEO.BACKBONE.BRANCH.POOL_K[0], 61 | self.cfg.VIDEO.BACKBONE.BRANCH.POOL_K[1], 62 | self.cfg.VIDEO.BACKBONE.BRANCH.POOL_K[2] 63 | ], 64 | stride=1, 65 | padding=[ 66 | self.cfg.VIDEO.BACKBONE.BRANCH.POOL_K[0]//2, 67 | self.cfg.VIDEO.BACKBONE.BRANCH.POOL_K[1]//2, 68 | self.cfg.VIDEO.BACKBONE.BRANCH.POOL_K[2]//2 69 | ], 70 | ) 71 | self.b_avgpool_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 72 | self.b_avgpool_bn.skip_init=True 73 | self.b_avgpool_bn.weight.data.zero_() 74 | self.b_avgpool_bn.bias.data.zero_() 75 | 76 | self.b_relu = nn.ReLU(inplace=True) 77 | 78 | self.c = nn.Conv3d( 79 | in_channels = self.num_filters//self.expansion_ratio, 80 | out_channels = self.num_filters, 81 | kernel_size = 1, 82 | stride = 1, 83 | padding = 0, 84 | bias = False 85 | ) 86 | self.c_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 87 | 88 | def forward(self, x): 89 | if self.transformation == 'bottleneck': 90 | x = self.a(x) 91 | x = self.a_bn(x) 92 | x = self.a_relu(x) 93 | 94 | x = self.b(x, self.b_rf(x)) 95 | x = self.b_bn(x) + self.b_avgpool_bn(self.b_avgpool(x)) 96 | x = self.b_relu(x) 97 | 98 | x = self.c(x) 99 | x = self.c_bn(x) 100 | return x -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/branches/tadaconvnextv2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 
Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ TAdaConvNeXtV2 block. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from collections import OrderedDict 10 | 11 | from tadaconv.models.module_zoo.ops.misc import QuickGELU 12 | from tadaconv.models.utils.init_helper import trunc_normal_ 13 | from tadaconv.models.base.base_blocks import BRANCH_REGISTRY, DropPath 14 | from tadaconv.models.module_zoo.ops.tadaconv_v2 import TAdaConv2dV2, RouteFuncwTransformer 15 | from tadaconv.models.module_zoo.ops.misc import LayerNorm 16 | 17 | @BRANCH_REGISTRY.register() 18 | class TAdaConvNeXtV2Block(nn.Module): 19 | r""" TAdaConvNeXtV2 Block. 20 | Args: 21 | cfg (Config): the global config object. 22 | drop_path (float): Stochastic depth rate. Default: 0.0 23 | layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 24 | """ 25 | def __init__(self, cfg, dim, drop_path=0., layer_scale_init_value=1e-6): 26 | super().__init__() 27 | self.dwconv = TAdaConv2dV2( 28 | dim, dim, kernel_size=(1,7,7), padding=(0,3,3), groups=dim, 29 | cal_dim="cout", 30 | internal_rf_func=False, 31 | internal_temp_aggr=False 32 | ) 33 | self.dwconv_rf = RouteFuncwTransformer( 34 | c_in=dim, 35 | ratio=cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_R, 36 | kernels=cfg.VIDEO.BACKBONE.BRANCH.ROUTE_FUNC_K, 37 | with_bias_cal=self.dwconv.bias is not None, 38 | zero_init_cal=True, 39 | head_dim=cfg.VIDEO.BACKBONE.BRANCH.HEAD_DIM if hasattr(cfg.VIDEO.BACKBONE.BRANCH, "HEAD_DIM") else 48 40 | ) 41 | self.norm = LayerNorm(dim, eps=1e-6) 42 | self.avgpool = nn.AvgPool3d(kernel_size=(3,1,1),stride=(1,1,1),padding=(1,0,0)) 43 | self.norm_avgpool = LayerNorm(dim, eps=1e-6) 44 | self.norm_avgpool.weight.data.zero_() 45 | self.norm_avgpool.bias.data.zero_() 46 | self.pwconv1 = nn.Linear(dim, 4 * dim) 47 | self.act = QuickGELU() 48 | self.pwconv2 = nn.Linear(4 * dim, dim) 49 | self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), 50 | requires_grad=True) if layer_scale_init_value > 0 else None 51 | self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() 52 | 53 | def forward(self, x): 54 | input = x 55 | 56 | x = self.dwconv(x, reshape_required=False, alpha=self.dwconv_rf(x)) 57 | 58 | # temporal aggregation 59 | norm_avgpool_x = self.avgpool(x) 60 | x = x.permute(0, 2, 3, 4, 1) # (N, C, T, H, W) -> (N, T, H, W, C) 61 | norm_avgpool_x = norm_avgpool_x.permute(0, 2, 3, 4, 1) # (N, C, T, H, W) -> (N, T, H, W, C) 62 | x = self.norm(x) + self.norm_avgpool(norm_avgpool_x) 63 | 64 | x = self.pwconv1(x) 65 | x = self.act(x) 66 | x = self.pwconv2(x) 67 | if self.gamma is not None: 68 | x = self.gamma * x 69 | x = x.permute(0, 4, 1, 2, 3) # (N, T, H, W, C) -> (N, C, T, H, W) 70 | 71 | x = input + self.drop_path(x) 72 | return x -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/heads/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | from .mosi_head import MoSIHeadJoint 5 | from .slowfast_head import SlowFastHead 6 | from .transformer_head import TransformerHead 7 | from .bmn_head import BaseBMN 8 | -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/heads/transformer_head.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 
3 | 4 | """ Transformer heads. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from tadaconv.models.base.base_blocks import BaseHead 10 | from tadaconv.models.base.base_blocks import HEAD_REGISTRY 11 | 12 | from collections import OrderedDict 13 | from tadaconv.models.utils.init_helper import lecun_normal_, trunc_normal_, _init_transformer_weights 14 | 15 | @HEAD_REGISTRY.register() 16 | class TransformerHead(BaseHead): 17 | """ 18 | Construct head for video vision transformers. 19 | """ 20 | def __init__(self, cfg): 21 | """ 22 | Args: 23 | cfg (Config): global config object. 24 | """ 25 | super(TransformerHead, self).__init__(cfg) 26 | self.apply(_init_transformer_weights) 27 | 28 | def _construct_head( 29 | self, 30 | dim, 31 | num_classes, 32 | dropout_rate, 33 | activation_func, 34 | ): 35 | if self.cfg.VIDEO.HEAD.PRE_LOGITS: 36 | self.pre_logits = nn.Sequential(OrderedDict([ 37 | ('fc', nn.Linear(dim, dim)), 38 | ('act', nn.Tanh()) 39 | ])) 40 | 41 | self.linear = nn.Linear(dim, num_classes) 42 | 43 | if dropout_rate > 0.0: 44 | self.dropout = nn.Dropout(dropout_rate) 45 | 46 | if activation_func == "softmax": 47 | self.activation = nn.Softmax(dim=-1) 48 | elif activation_func == "sigmoid": 49 | self.activation = nn.Sigmoid() 50 | elif activation_func == "identity": 51 | self.activation = nn.Identity() 52 | else: 53 | raise NotImplementedError( 54 | "{} is not supported as an activation" 55 | "function.".format(activation_func) 56 | ) 57 | 58 | def forward(self, x): 59 | """ 60 | Returns: 61 | x (Tensor): classification predictions. 62 | logits (Tensor): global average pooled features. 63 | """ 64 | if hasattr(self, "dropout"): 65 | out = self.dropout(x) 66 | else: 67 | out = x 68 | if hasattr(self, "pre_logits"): 69 | out = self.pre_logits(out) 70 | out = self.linear(out) 71 | 72 | if not self.training: 73 | out = self.activation(out) 74 | return out, x 75 | 76 | @HEAD_REGISTRY.register() 77 | class TransformerHeadx2(BaseHead): 78 | """ 79 | The Transformer head for EPIC-KITCHENS dataset. 80 | """ 81 | def __init__(self, cfg): 82 | """ 83 | Args: 84 | cfg (Config): global config object. 85 | """ 86 | super(TransformerHeadx2, self).__init__(cfg) 87 | self.apply(_init_transformer_weights) 88 | 89 | def _construct_head( 90 | self, 91 | dim, 92 | num_classes, 93 | dropout_rate, 94 | activation_func, 95 | ): 96 | if self.cfg.VIDEO.HEAD.PRE_LOGITS: 97 | self.pre_logits1 = nn.Sequential(OrderedDict([ 98 | ('fc', nn.Linear(dim, dim)), 99 | ('act', nn.Tanh()) 100 | ])) 101 | self.pre_logits2 = nn.Sequential(OrderedDict([ 102 | ('fc', nn.Linear(dim, dim)), 103 | ('act', nn.Tanh()) 104 | ])) 105 | self.linear1 = nn.Linear(dim, num_classes[0], bias=True) 106 | self.linear2 = nn.Linear(dim, num_classes[1], bias=True) 107 | 108 | if dropout_rate > 0.0: 109 | self.dropout = nn.Dropout(dropout_rate) 110 | 111 | if activation_func == "softmax": 112 | self.activation = nn.Softmax(dim=-1) 113 | elif activation_func == "sigmoid": 114 | self.activation = nn.Sigmoid() 115 | elif activation_func == "identity": 116 | self.activation = nn.Identity() 117 | else: 118 | raise NotImplementedError( 119 | "{} is not supported as an activation" 120 | "function.".format(activation_func) 121 | ) 122 | 123 | def forward(self, x): 124 | """ 125 | Returns: 126 | x (dict): dictionary of classification predictions, 127 | with keys "verb_class" and "noun_class" indicating 128 | the predictions on the verb and noun. 129 | logits (Tensor): global average pooled features. 
130 | """ 131 | if hasattr(self, "dropout"): 132 | out1 = self.dropout(x) 133 | out2 = self.dropout(x) 134 | else: 135 | out1 = x 136 | out2 = x 137 | 138 | if hasattr(self, "pre_logits1"): 139 | out1 = self.pre_logits1(out1) 140 | out2 = self.pre_logits2(out2) 141 | 142 | out1 = self.linear1(out1) 143 | out2 = self.linear2(out2) 144 | 145 | if not self.training: 146 | out1 = self.activation(out1) 147 | out2 = self.activation(out2) 148 | return {"verb_class": out1, "noun_class": out2}, x -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/ops/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | from .tadaconv import TAdaConv2d 5 | from .tadaconv_v2 import TAdaConv2dV2 6 | from .misc import LayerNorm -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/ops/misc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Micellaneous operations. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | class LayerNorm(nn.Module): 11 | r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. 12 | The ordering of the dimensions in the inputs. channels_last corresponds to inputs with 13 | shape (batch_size, height, width, channels) while channels_first corresponds to inputs 14 | with shape (batch_size, channels, height, width). 15 | """ 16 | def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): 17 | super().__init__() 18 | self.weight = nn.Parameter(torch.ones(normalized_shape)) 19 | self.bias = nn.Parameter(torch.zeros(normalized_shape)) 20 | self.eps = eps 21 | self.data_format = data_format 22 | if self.data_format not in ["channels_last", "channels_first"]: 23 | raise NotImplementedError 24 | self.normalized_shape = (normalized_shape, ) 25 | 26 | def forward(self, x): 27 | if self.data_format == "channels_last": 28 | return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) 29 | elif self.data_format == "channels_first": 30 | u = x.mean(1, keepdim=True) 31 | s = (x - u).pow(2).mean(1, keepdim=True) 32 | x = (x - u) / torch.sqrt(s + self.eps) 33 | if len(x.shape) == 5: 34 | x = self.weight[:, None, None, None] * x + self.bias[:, None, None, None] 35 | elif len(x.shape) == 3: 36 | x = self.weight[:, None] * x + self.bias[:, None] 37 | return x 38 | 39 | class QuickGELU(nn.Module): 40 | def forward(self, x: torch.Tensor): 41 | return x * torch.sigmoid(1.702 * x) -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/stems/__init__.py: -------------------------------------------------------------------------------- 1 | from .downsample_stem import DownSampleStem 2 | from .r2plus1d_stem import R2Plus1DStem 3 | from .embedding_stem import PatchEmbedStem, TubeletEmbeddingStem -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/stems/downsample_stem.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Downsample Stem. 
""" 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from tadaconv.models.base.base_blocks import Base3DStem 10 | from tadaconv.models.base.base_blocks import STEM_REGISTRY 11 | 12 | @STEM_REGISTRY.register() 13 | class DownSampleStem(Base3DStem): 14 | """ 15 | Inherits base 3D stem and adds a maxpool as downsampling. 16 | """ 17 | def __init__(self, cfg): 18 | super(DownSampleStem, self).__init__(cfg) 19 | self.maxpool = nn.MaxPool3d( 20 | kernel_size = (1, 3, 3), 21 | stride = (1, 2, 2), 22 | padding = (0, 1, 1) 23 | ) 24 | 25 | def forward(self, x): 26 | x = self.a(x) 27 | x = self.a_bn(x) 28 | x = self.a_relu(x) 29 | x = self.maxpool(x) 30 | return x 31 | 32 | -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/stems/embedding_stem.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Embedding stems. """ 5 | 6 | import math 7 | import torch 8 | from torch import nn, einsum 9 | import torch.nn.functional as F 10 | from einops import rearrange, repeat 11 | from tadaconv.models.base.backbone import BACKBONE_REGISTRY 12 | from tadaconv.models.base.base_blocks import ( 13 | STEM_REGISTRY, BRANCH_REGISTRY, HEAD_REGISTRY, DropPath, BaseHead 14 | ) 15 | 16 | @STEM_REGISTRY.register() 17 | class PatchEmbedStem(nn.Module): 18 | """ 19 | Video to Patch Embedding. 20 | """ 21 | def __init__(self, cfg): 22 | """ 23 | Args: 24 | cfg (Config): global config object. 25 | """ 26 | super().__init__() 27 | image_size = cfg.DATA.TRAIN_CROP_SIZE if cfg is not None else 224 # default 224 28 | channels = cfg.DATA.NUM_INPUT_CHANNELS if cfg is not None else 3 # default 3 29 | num_frames = cfg.DATA.NUM_INPUT_FRAMES if cfg is not None else 16 30 | patch_size = cfg.VIDEO.BACKBONE.PATCH_SIZE if cfg is not None else 16 # default 16 31 | dim = cfg.VIDEO.BACKBONE.NUM_FEATURES if cfg is not None else 768 # default 768 32 | 33 | num_patches_per_image = (image_size // patch_size) ** 2 34 | num_patches = num_patches_per_image * num_frames 35 | 36 | self.image_size = image_size 37 | self.patch_size = patch_size 38 | self.num_frames = num_frames 39 | self.num_patches = num_patches 40 | 41 | self.conv1 = nn.Conv3d( 42 | in_channels =channels, 43 | out_channels =dim, 44 | kernel_size =[1, patch_size, patch_size], 45 | stride =[1, patch_size, patch_size], 46 | ) 47 | 48 | def forward(self, x): 49 | b, c, t, h, w, p = *x.shape, self.patch_size 50 | assert h % p == 0 and w % p == 0, f'height {h} and width {w} of video must be divisible by the patch size {p}' 51 | x = self.conv1(x) 52 | # b, c, t, h, w -> b, c, p (p: num patches) 53 | x = x.reshape(x.shape[0], x.shape[1], -1) 54 | # b, c, p -> b, p, c 55 | x = x.permute(0, 2, 1) 56 | return x 57 | 58 | @STEM_REGISTRY.register() 59 | class TubeletEmbeddingStem(nn.Module): 60 | """ 61 | Video to Tubelet Embedding. 62 | """ 63 | def __init__(self, cfg): 64 | """ 65 | Args: 66 | cfg (Config): global config object. 
67 | """ 68 | super().__init__() 69 | image_size = cfg.DATA.TRAIN_CROP_SIZE if cfg is not None else 224 # default 224 70 | channels = cfg.DATA.NUM_INPUT_CHANNELS if cfg is not None else 3 # default 3 71 | num_frames = cfg.DATA.NUM_INPUT_FRAMES if cfg is not None else 16 72 | patch_size = cfg.VIDEO.BACKBONE.PATCH_SIZE if cfg is not None else 16 # default 16 73 | dim = cfg.VIDEO.BACKBONE.NUM_FEATURES if cfg is not None else 768 # default 768 74 | tubelet_size = cfg.VIDEO.BACKBONE.TUBELET_SIZE if cfg is not None else 2 75 | 76 | num_patches_per_image = (image_size // patch_size) ** 2 77 | num_patches = num_patches_per_image * num_frames 78 | 79 | self.image_size = image_size 80 | self.patch_size = patch_size 81 | self.num_frames = num_frames 82 | self.num_patches = num_patches 83 | 84 | self.conv1 = nn.Conv3d( 85 | in_channels =channels, 86 | out_channels =dim, 87 | kernel_size =[tubelet_size, patch_size, patch_size], 88 | stride =[tubelet_size, patch_size, patch_size], 89 | ) 90 | 91 | def forward(self, x): 92 | b, c, t, h, w, p = *x.shape, self.patch_size 93 | assert h % p == 0 and w % p == 0, f'height {h} and width {w} of video must be divisible by the patch size {p}' 94 | x = self.conv1(x) 95 | # b, c, t, h, w -> b, c, p (p: num patches) 96 | x = x.reshape(x.shape[0], x.shape[1], -1) 97 | # b, c, p -> b, p, c 98 | x = x.permute(0, 2, 1) 99 | return x -------------------------------------------------------------------------------- /tadaconv/models/module_zoo/stems/r2plus1d_stem.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ R2Plus1D stem. """ 5 | 6 | import math 7 | import torch 8 | import torch.nn as nn 9 | 10 | from tadaconv.models.base.base_blocks import Base3DStem 11 | from tadaconv.models.base.base_blocks import STEM_REGISTRY 12 | 13 | @STEM_REGISTRY.register() 14 | class R2Plus1DStem(Base3DStem): 15 | """ 16 | R(2+1)D Stem. 
17 | """ 18 | def __init__( 19 | self, 20 | cfg 21 | ): 22 | super(R2Plus1DStem, self).__init__(cfg) 23 | 24 | def _construct_block( 25 | self, 26 | cfg, 27 | dim_in, 28 | num_filters, 29 | kernel_sz, 30 | stride, 31 | bn_eps=1e-5, 32 | bn_mmt=0.1 33 | ): 34 | 35 | mid_dim = int( 36 | math.floor((kernel_sz[0] * kernel_sz[1] * kernel_sz[2] * dim_in * num_filters) / \ 37 | (kernel_sz[1] * kernel_sz[2] * dim_in + kernel_sz[0] * num_filters))) 38 | 39 | self.a1 = nn.Conv3d( 40 | in_channels = dim_in, 41 | out_channels = mid_dim, 42 | kernel_size = [1, kernel_sz[1], kernel_sz[2]], 43 | stride = [1, stride[1], stride[2]], 44 | padding = [0, kernel_sz[1]//2, kernel_sz[2]//2], 45 | bias = False 46 | ) 47 | self.a1_bn = nn.BatchNorm3d(mid_dim, eps=bn_eps, momentum=bn_mmt) 48 | self.a1_relu = nn.ReLU(inplace=True) 49 | 50 | self.a2 = nn.Conv3d( 51 | in_channels = mid_dim, 52 | out_channels = num_filters, 53 | kernel_size = [kernel_sz[0], 1, 1], 54 | stride = [stride[0], 1, 1], 55 | padding = [kernel_sz[0]//2, 0, 0], 56 | bias = False 57 | ) 58 | self.a2_bn = nn.BatchNorm3d(num_filters, eps=bn_eps, momentum=bn_mmt) 59 | self.a2_relu = nn.ReLU(inplace=True) 60 | 61 | def forward(self, x): 62 | x = self.a1(x) 63 | x = self.a1_bn(x) 64 | x = self.a1_relu(x) 65 | 66 | x = self.a2(x) 67 | x = self.a2_bn(x) 68 | x = self.a2_relu(x) 69 | return x -------------------------------------------------------------------------------- /tadaconv/models/utils/lars.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # From https://github.com/open-mmlab/OpenSelfSup/blob/1db69ecebbc129e8fa90cdcea6f2082f0a4e3d17/openselfsup/utils/optimizers.py 3 | 4 | import torch 5 | from torch.optim.optimizer import Optimizer, required 6 | from torch.optim import * 7 | 8 | 9 | class LARS(Optimizer): 10 | r"""Implements layer-wise adaptive rate scaling for SGD. 11 | 12 | Args: 13 | params (iterable): iterable of parameters to optimize or dicts defining 14 | parameter groups 15 | lr (float): base learning rate (\gamma_0) 16 | momentum (float, optional): momentum factor (default: 0) ("m") 17 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 18 | ("\beta") 19 | dampening (float, optional): dampening for momentum (default: 0) 20 | eta (float, optional): LARS coefficient 21 | nesterov (bool, optional): enables Nesterov momentum (default: False) 22 | 23 | Based on Algorithm 1 of the following paper by You, Gitman, and Ginsburg. 
24 | Large Batch Training of Convolutional Networks: 25 | https://arxiv.org/abs/1708.03888 26 | 27 | Example: 28 | >>> optimizer = LARS(model.parameters(), lr=0.1, momentum=0.9, 29 | >>> weight_decay=1e-4, eta=1e-3) 30 | >>> optimizer.zero_grad() 31 | >>> loss_fn(model(input), target).backward() 32 | >>> optimizer.step() 33 | """ 34 | 35 | def __init__(self, 36 | params, 37 | lr=required, 38 | momentum=0, 39 | dampening=0, 40 | weight_decay=0, 41 | eta=0.001, 42 | nesterov=False): 43 | if lr is not required and lr < 0.0: 44 | raise ValueError("Invalid learning rate: {}".format(lr)) 45 | if momentum < 0.0: 46 | raise ValueError("Invalid momentum value: {}".format(momentum)) 47 | if weight_decay < 0.0: 48 | raise ValueError( 49 | "Invalid weight_decay value: {}".format(weight_decay)) 50 | if eta < 0.0: 51 | raise ValueError("Invalid LARS coefficient value: {}".format(eta)) 52 | 53 | defaults = dict( 54 | lr=lr, momentum=momentum, dampening=dampening, 55 | weight_decay=weight_decay, nesterov=nesterov, eta=eta) 56 | if nesterov and (momentum <= 0 or dampening != 0): 57 | raise ValueError("Nesterov momentum requires a momentum and zero dampening") 58 | 59 | super(LARS, self).__init__(params, defaults) 60 | 61 | def __setstate__(self, state): 62 | super(LARS, self).__setstate__(state) 63 | for group in self.param_groups: 64 | group.setdefault('nesterov', False) 65 | 66 | @torch.no_grad() 67 | def step(self, closure=None): 68 | """Performs a single optimization step. 69 | 70 | Args: 71 | closure (callable, optional): A closure that reevaluates the model 72 | and returns the loss. 73 | """ 74 | loss = None 75 | if closure is not None: 76 | with torch.enable_grad(): 77 | loss = closure() 78 | 79 | for group in self.param_groups: 80 | weight_decay = group['weight_decay'] 81 | momentum = group['momentum'] 82 | dampening = group['dampening'] 83 | eta = group['eta'] 84 | nesterov = group['nesterov'] 85 | lr = group['lr'] 86 | lars_exclude = group.get('lars_exclude', False) 87 | 88 | for p in group['params']: 89 | if p.grad is None: 90 | continue 91 | 92 | d_p = p.grad 93 | 94 | if lars_exclude: 95 | local_lr = 1. 96 | else: 97 | weight_norm = torch.norm(p).item() 98 | grad_norm = torch.norm(d_p).item() 99 | # Compute local learning rate for this layer 100 | local_lr = eta * weight_norm / \ 101 | (grad_norm + weight_decay * weight_norm) 102 | 103 | actual_lr = local_lr * lr 104 | d_p = d_p.add(p, alpha=weight_decay).mul(actual_lr) 105 | if momentum != 0: 106 | param_state = self.state[p] 107 | if 'momentum_buffer' not in param_state: 108 | buf = param_state['momentum_buffer'] = \ 109 | torch.clone(d_p).detach() 110 | else: 111 | buf = param_state['momentum_buffer'] 112 | buf.mul_(momentum).add_(d_p, alpha=1 - dampening) 113 | if nesterov: 114 | d_p = d_p.add(buf, alpha=momentum) 115 | else: 116 | d_p = buf 117 | p.add_(-d_p) 118 | 119 | return loss -------------------------------------------------------------------------------- /tadaconv/models/utils/lr_policy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # From https://github.com/facebookresearch/SlowFast/blob/master/slowfast/utils/lr_policy.py 4 | 5 | """Learning rate policy.""" 6 | 7 | import math 8 | 9 | 10 | def get_lr_at_epoch(cfg, cur_epoch): 11 | """ 12 | Retrieve the learning rate of the current epoch with the option to perform 13 | warm up in the beginning of the training stage. 
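# Numeric illustration of the layer-wise scaling in LARS.step() above: the local
# learning rate is eta * ||w|| / (||grad|| + weight_decay * ||w||), so a layer whose
# gradient is small relative to its weight norm takes a proportionally larger step.
# The tensors and hyper-parameters below are arbitrary.
import torch

w = torch.full((10,), 2.0)                   # ||w|| = sqrt(40)  ~ 6.32
g = torch.full((10,), 0.1)                   # ||g|| = sqrt(0.1) ~ 0.32
eta, weight_decay, base_lr = 0.001, 1e-4, 0.1
local_lr = eta * w.norm() / (g.norm() + weight_decay * w.norm())
actual_lr = float(local_lr) * base_lr        # ~ 0.02 * 0.1 = 2.0e-3
assert 1.9e-3 < actual_lr < 2.1e-3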
14 | Args: 15 | cfg (Config): global config object. 16 | cur_epoch (float): the number of epoch of the current training stage. 17 | """ 18 | lr = get_lr_func(cfg.OPTIMIZER.LR_POLICY)(cfg, cur_epoch) 19 | # Perform warm up. 20 | if cur_epoch < cfg.OPTIMIZER.WARMUP_EPOCHS: 21 | lr_start = cfg.OPTIMIZER.WARMUP_START_LR 22 | lr_end = get_lr_func(cfg.OPTIMIZER.LR_POLICY)( 23 | cfg, cfg.OPTIMIZER.WARMUP_EPOCHS 24 | ) 25 | alpha = (lr_end - lr_start) / cfg.OPTIMIZER.WARMUP_EPOCHS 26 | lr = cur_epoch * alpha + lr_start 27 | return lr 28 | 29 | 30 | def lr_func_cosine(cfg, cur_epoch): 31 | """ 32 | Retrieve the learning rate to specified values at specified epoch with the 33 | cosine learning rate schedule. Details can be found in: 34 | Ilya Loshchilov, and Frank Hutter 35 | SGDR: Stochastic Gradient Descent With Warm Restarts. 36 | Args: 37 | cfg (Config): global config object. 38 | cur_epoch (float): the number of epoch of the current training stage. 39 | """ 40 | return ( 41 | cfg.OPTIMIZER.BASE_LR 42 | * (math.cos(math.pi * cur_epoch / cfg.OPTIMIZER.MAX_EPOCH) + 1.0) 43 | * 0.5 44 | ) 45 | 46 | def lr_func_cosine_v2(cfg, cur_epoch): 47 | """ 48 | Retrieve the learning rate to specified values at specified epoch with the 49 | cosine learning rate schedule. Details can be found in: 50 | Ilya Loshchilov, and Frank Hutter 51 | SGDR: Stochastic Gradient Descent With Warm Restarts. 52 | Args: 53 | cfg (CfgNode): configs. Details can be found in 54 | slowfast/config/defaults.py 55 | cur_epoch (float): the number of epoch of the current training stage. 56 | """ 57 | offset = cfg.OPTIMIZER.WARMUP_EPOCHS if cfg.OPTIMIZER.COSINE_AFTER_WARMUP else 0.0 58 | assert cfg.OPTIMIZER.COSINE_END_LR < cfg.OPTIMIZER.BASE_LR 59 | return ( 60 | cfg.OPTIMIZER.COSINE_END_LR 61 | + (cfg.OPTIMIZER.BASE_LR - cfg.OPTIMIZER.COSINE_END_LR) 62 | * ( 63 | math.cos( 64 | math.pi * (cur_epoch - offset) / (cfg.OPTIMIZER.MAX_EPOCH - offset) 65 | ) 66 | + 1.0 67 | ) 68 | * 0.5 69 | ) 70 | 71 | def lr_func_steps_with_relative_lrs(cfg, cur_epoch): 72 | """ 73 | Retrieve the learning rate to specified values at specified epoch with the 74 | steps with relative learning rate schedule. 75 | Args: 76 | cfg (Config): global config object. 77 | cur_epoch (float): the number of epoch of the current training stage. 78 | """ 79 | ind = get_step_index(cfg, cur_epoch) 80 | return cfg.OPTIMIZER.LRS[ind] * cfg.OPTIMIZER.BASE_LR 81 | 82 | 83 | def get_step_index(cfg, cur_epoch): 84 | """ 85 | Retrieves the lr step index for the given epoch. 86 | Args: 87 | cfg (Config): global config object. 88 | cur_epoch (float): the number of epoch of the current training stage. 89 | """ 90 | steps = cfg.OPTIMIZER.STEPS + [cfg.OPTIMIZER.MAX_EPOCH] 91 | for ind, step in enumerate(steps): # NoQA 92 | if cur_epoch < step: 93 | break 94 | return ind - 1 95 | 96 | 97 | def get_lr_func(lr_policy): 98 | """ 99 | Given the configs, retrieve the specified lr policy function. 100 | Args: 101 | lr_policy (string): the learning rate policy to use for the job. 
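# Standalone check of the scheduling logic above, without a cfg object: the cosine
# policy decays BASE_LR towards zero over MAX_EPOCH, and during warmup the learning
# rate is interpolated linearly from WARMUP_START_LR to the cosine value reached at
# WARMUP_EPOCHS. The hyper-parameters are arbitrary.
import math

base_lr, max_epoch = 0.1, 100.0
warmup_epochs, warmup_start_lr = 5.0, 0.01

def cosine(cur_epoch):
    return base_lr * (math.cos(math.pi * cur_epoch / max_epoch) + 1.0) * 0.5

assert abs(cosine(0.0) - base_lr) < 1e-9       # starts at BASE_LR
assert abs(cosine(max_epoch)) < 1e-9           # decays to zero
lr_end = cosine(warmup_epochs)                 # target of the warmup ramp
alpha = (lr_end - warmup_start_lr) / warmup_epochs
warmup_lr = 2.0 * alpha + warmup_start_lr      # lr at epoch 2, still warming up
assert warmup_start_lr < warmup_lr < lr_end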
102 | """ 103 | policy = "lr_func_" + lr_policy 104 | if policy not in globals(): 105 | raise NotImplementedError("Unknown LR policy: {}".format(lr_policy)) 106 | else: 107 | return globals()[policy] 108 | -------------------------------------------------------------------------------- /tadaconv/models/utils/model_ema.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/utils/model_ema.py 3 | # thanks for the nice implementation 4 | 5 | import torch 6 | import torch.nn as nn 7 | from copy import deepcopy 8 | 9 | class ModelEmaV2(nn.Module): 10 | """ Model Exponential Moving Average V2 11 | Keep a moving average of everything in the model state_dict (parameters and buffers). 12 | V2 of this module is simpler, it does not match params/buffers based on name but simply 13 | iterates in order. It works with torchscript (JIT of full model). 14 | This is intended to allow functionality like 15 | https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage 16 | A smoothed version of the weights is necessary for some training schemes to perform well. 17 | E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use 18 | RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA 19 | smoothing of weights to match results. Pay attention to the decay constant you are using 20 | relative to your update count per epoch. 21 | To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but 22 | disable validation of the EMA weights. Validation will have to be done manually in a separate 23 | process, or after the training stops converging. 24 | This class is sensitive where it is initialized in the sequence of model init, 25 | GPU assignment and distributed training wrappers. 26 | """ 27 | def __init__(self, model, decay=0.9999, device=None): 28 | super(ModelEmaV2, self).__init__() 29 | # make a copy of the model for accumulating moving average of weights 30 | self.module = deepcopy(model) 31 | self.module.eval() 32 | self.decay = decay 33 | self.device = device # perform ema on different device from model if set 34 | if self.device is not None: 35 | self.module.to(device=device) 36 | 37 | def _update(self, model, update_fn): 38 | with torch.no_grad(): 39 | for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): 40 | if self.device is not None: 41 | model_v = model_v.to(device=self.device) 42 | ema_v.copy_(update_fn(ema_v, model_v)) 43 | 44 | def update(self, model): 45 | self._update(model, update_fn=lambda e, m: self.decay * e + (1. - self.decay) * m) 46 | 47 | def set(self, model): 48 | self._update(model, update_fn=lambda e, m: m) -------------------------------------------------------------------------------- /tadaconv/models/utils/params.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Params. """ 5 | 6 | def update_3d_conv_params(cfg, conv, idx): 7 | """ 8 | Automatically decodes parameters for 3D convolution blocks according to the config and its index in the model. 9 | Args: 10 | cfg (Config): Config object that contains model parameters such as channel dimensions, whether to downsampling or not, etc. 11 | conv (BaseBranch): Branch whose parameters needs to be specified. 
12 | idx (list): List containing the index of the current block. ([stage_id, block_id]) 13 | """ 14 | # extract current block location 15 | stage_id, block_id = idx 16 | conv.stage_id = stage_id 17 | conv.block_id = block_id 18 | 19 | # extract basic info 20 | if block_id == 0: 21 | conv.dim_in = cfg.VIDEO.BACKBONE.NUM_FILTERS[stage_id-1] 22 | if hasattr(cfg.VIDEO.BACKBONE, "ADD_FUSION_CHANNEL") and cfg.VIDEO.BACKBONE.ADD_FUSION_CHANNEL: 23 | conv.dim_in = conv.dim_in * cfg.VIDEO.BACKBONE.SLOWFAST.CONV_CHANNEL_RATIO // cfg.VIDEO.BACKBONE.SLOWFAST.BETA + conv.dim_in 24 | conv.downsampling = cfg.VIDEO.BACKBONE.DOWNSAMPLING[stage_id] 25 | conv.downsampling_temporal = cfg.VIDEO.BACKBONE.DOWNSAMPLING_TEMPORAL[stage_id] 26 | else: 27 | conv.downsampling = False 28 | conv.dim_in = cfg.VIDEO.BACKBONE.NUM_FILTERS[stage_id] 29 | conv.num_filters = cfg.VIDEO.BACKBONE.NUM_FILTERS[stage_id] 30 | conv.bn_mmt = cfg.BN.MOMENTUM 31 | conv.bn_eps = cfg.BN.EPS 32 | conv.kernel_size = cfg.VIDEO.BACKBONE.KERNEL_SIZE[stage_id] 33 | conv.expansion_ratio = cfg.VIDEO.BACKBONE.EXPANSION_RATIO if hasattr(cfg.VIDEO.BACKBONE, "EXPANSION_RATIO") else None 34 | 35 | # configure downsampling 36 | if conv.downsampling: 37 | if conv.downsampling_temporal: 38 | conv.stride = [2, 2, 2] 39 | else: 40 | conv.stride = [1, 2, 2] 41 | else: 42 | conv.stride = [1, 1, 1] 43 | 44 | # define transformation 45 | if isinstance(cfg.VIDEO.BACKBONE.DEPTH, str): 46 | conv.transformation = 'bottleneck' 47 | else: 48 | if cfg.VIDEO.BACKBONE.DEPTH <= 34: 49 | conv.transformation = 'simple_block' 50 | else: 51 | conv.transformation = 'bottleneck' 52 | 53 | # calculate the input size 54 | num_downsampling_spatial = sum( 55 | cfg.VIDEO.BACKBONE.DOWNSAMPLING[:stage_id+(block_id>0)] 56 | ) 57 | if 'DownSample' in cfg.VIDEO.BACKBONE.STEM.NAME: 58 | num_downsampling_spatial += 1 59 | num_downsampling_temporal = sum( 60 | cfg.VIDEO.BACKBONE.DOWNSAMPLING_TEMPORAL[:stage_id+(block_id>0)] 61 | ) 62 | conv.h = cfg.DATA.TRAIN_CROP_SIZE // 2**num_downsampling_spatial \ 63 | + (cfg.DATA.TRAIN_CROP_SIZE//2**(num_downsampling_spatial-1))%2 64 | conv.w = conv.h 65 | conv.t = cfg.DATA.NUM_INPUT_FRAMES // 2**num_downsampling_temporal -------------------------------------------------------------------------------- /tadaconv/sslgenerators/__init__.py: -------------------------------------------------------------------------------- 1 | from .mosi.mosi_generator import MoSIGenerator -------------------------------------------------------------------------------- /tadaconv/sslgenerators/builder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Builder for self-supervised generator.""" 5 | 6 | from tadaconv.utils.registry import Registry 7 | 8 | SSL_GENERATOR_REGISTRY = Registry("SSL_Methods") 9 | 10 | def build_ssl_generator(cfg, split): 11 | """ 12 | Entry point to registered self-supervised learning methods. 13 | Returns transformed frames and the self-supervised label. 14 | Args: 15 | split (str): training, validation or test. 
16 | """ 17 | ssl_generator = SSL_GENERATOR_REGISTRY.get(cfg.PRETRAIN.GENERATOR)(cfg, split) 18 | return ssl_generator 19 | -------------------------------------------------------------------------------- /tadaconv/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alibaba-mmai-research/TAdaConv/75b7839b37fc94d98d4fe5f2aff4b3df4e347dfb/tadaconv/utils/__init__.py -------------------------------------------------------------------------------- /tadaconv/utils/bboxes_1d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def ioa_with_anchors(anchors_min, anchors_max, box_min, box_max): 5 | """ 6 | calculate the overlap proportion between the anchor and all bbox for supervise signal, 7 | Args: 8 | anchors_min (np.ndarry): 1d anchors start position, shape is N. 9 | anchors_max (np.ndarry): 1d anchors end position, shape: N. 10 | box_min (np.ndarry): 1d boxes start position, shape: N. 11 | box_max (np.ndarry): 1d boxes end position, shape: N. 12 | Returns: 13 | scores: (np.ndarry) 14 | """ 15 | len_anchors = anchors_max - anchors_min 16 | int_xmin = np.maximum(anchors_min, box_min) 17 | int_xmax = np.minimum(anchors_max, box_max) 18 | inter_len = np.maximum(int_xmax - int_xmin, 0.) 19 | scores = np.divide(inter_len, len_anchors) 20 | return scores 21 | 22 | 23 | def iou_with_anchors(anchors_min, anchors_max, box_min, box_max): 24 | """ 25 | Compute jaccard score between a box and the anchors. 26 | Args: 27 | anchors_min (np.ndarry): 1d anchors start position, shape is N. 28 | anchors_max (np.ndarry): 1d anchors end position, shape: N. 29 | box_min (np.ndarry): 1d boxes start position, shape: N. 30 | box_max (np.ndarry): 1d boxes end position, shape: N. 31 | Returns: 32 | jaccard: (np.ndarry) 33 | """ 34 | len_anchors = anchors_max - anchors_min 35 | int_xmin = np.maximum(anchors_min, box_min) 36 | int_xmax = np.minimum(anchors_max, box_max) 37 | inter_len = np.maximum(int_xmax - int_xmin, 0.) 38 | union_len = len_anchors - inter_len + box_max - box_min 39 | # print inter_len,union_len 40 | jaccard = np.divide(inter_len, union_len) 41 | return jaccard -------------------------------------------------------------------------------- /tadaconv/utils/eval_tal/eval_tal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | import sys 5 | from .eval_epic_detection import Epicdetection 6 | from tadaconv.utils import logging 7 | import numpy as np 8 | import json 9 | logger = logging.get_logger(__name__) 10 | 11 | 12 | def evaluate_detection(video_anno, detection_result_file, tiou_thresholds=np.linspace(0.5, 0.95, 10)): 13 | """ 14 | Evaluate action detection performance. 15 | Args: 16 | video_anno (str): Annotation file path. 17 | detection_result_file (str): The detection results output by your model. 18 | tiou_thresholds (np.array): Iou thresholds to be tested. 19 | """ 20 | detection = Epicdetection(video_anno, detection_result_file, 21 | tiou_thresholds=tiou_thresholds, 22 | subset='validation', verbose=True, check_status=False) 23 | detection.evaluate() 24 | -------------------------------------------------------------------------------- /tadaconv/utils/launcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ Task launcher. 
""" 4 | 5 | import os 6 | import torch 7 | from tadaconv.utils.misc import get_num_gpus 8 | 9 | def launch_task(cfg, init_method, func): 10 | """ 11 | Launches the task "func" on one or multiple devices. 12 | Args: 13 | cfg (Config): global config object. 14 | init_method (str): initialization method to launch the job with multiple 15 | devices. 16 | func (function): task to run. 17 | """ 18 | torch.cuda.empty_cache() 19 | if get_num_gpus(cfg) > 1: 20 | if cfg.PAI: 21 | # if using the PAI cluster, get info from the environment 22 | cfg.SHARD_ID = int(os.environ['RANK']) 23 | if "VISIBLE_DEVICE_LIST" in os.environ: 24 | cfg.NUM_GPUS = len(os.environ["VISIBLE_DEVICE_LIST"].split(",")) 25 | else: 26 | cfg.NUM_GPUS = torch.cuda.device_count() 27 | cfg.NUM_SHARDS = int(os.environ['WORLD_SIZE']) 28 | 29 | torch.multiprocessing.spawn( 30 | run, 31 | nprocs=cfg.NUM_GPUS, 32 | args=(func, init_method, cfg), 33 | daemon=False, 34 | ) 35 | else: 36 | func(cfg=cfg) 37 | 38 | def run( 39 | local_rank, func, init_method, cfg 40 | ): 41 | """ 42 | Runs a function from a child process. 43 | Args: 44 | local_rank (int): rank of the current process on the current machine. 45 | func (function): function to execute on each of the process. 46 | init_method (string): method to initialize the distributed training. 47 | cfg (Config): global config object. 48 | """ 49 | 50 | num_proc = cfg.NUM_GPUS # number of nodes per machine 51 | shard_id = cfg.SHARD_ID 52 | num_shards = cfg.NUM_SHARDS # number of machines 53 | backend = cfg.DIST_BACKEND # distribued backends ('nccl', 'gloo' or 'mpi') 54 | 55 | world_size = num_proc * num_shards 56 | rank = shard_id * num_proc + local_rank 57 | cfg.LOCAL_RANK = rank 58 | 59 | # dump machine info 60 | print("num_proc (NUM_GPU): {}".format(num_proc)) 61 | print("shard_id (os.environ['RANK']): {}".format(shard_id)) 62 | print("num_shards (os.environ['WORLD_SIZE']): {}".format(num_shards)) 63 | print("rank: {}".format(rank)) 64 | print("local_rank (GPU_ID): {}".format(local_rank)) 65 | 66 | try: 67 | if cfg.PAI == False: 68 | torch.distributed.init_process_group( 69 | backend=backend, 70 | init_method=init_method, 71 | world_size=world_size, 72 | rank=rank, 73 | ) 74 | else: 75 | torch.distributed.init_process_group( 76 | backend=backend, 77 | world_size=world_size, 78 | rank=rank, 79 | ) 80 | except Exception as e: 81 | raise e 82 | 83 | if "VISIBLE_DEVICE_LIST" in os.environ: 84 | torch.cuda.set_device(int(os.environ["VISIBLE_DEVICE_LIST"])) 85 | else: 86 | torch.cuda.set_device(f'cuda:{local_rank}') 87 | os.system(f"CUDA_VISIBLE_DEVICES={local_rank}") 88 | func(cfg) 89 | -------------------------------------------------------------------------------- /tadaconv/utils/logging.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ 5 | Logging. 6 | Modified from https://github.com/facebookresearch/SlowFast/blob/master/slowfast/utils/logging.py. 7 | Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 8 | """ 9 | 10 | import builtins 11 | import decimal 12 | import functools 13 | import logging 14 | import os 15 | import sys 16 | import simplejson 17 | 18 | import tadaconv.utils.distributed as du 19 | 20 | 21 | def _suppress_print(): 22 | """ 23 | Suppresses printing from the current process. 
24 | """ 25 | 26 | def print_pass(*objects, sep=" ", end="\n", file=sys.stdout, flush=False): 27 | pass 28 | 29 | builtins.print = print_pass 30 | 31 | 32 | def setup_logging(cfg, log_file): 33 | """ 34 | Sets up the logging for multiple processes. Only enable the logging for the 35 | master process, and suppress logging for the non-master processes. 36 | """ 37 | if du.is_master_proc(du.get_world_size()): 38 | # Enable logging for the master process. 39 | logging.root.handlers = [] 40 | else: 41 | # Suppress logging for non-master processes. 42 | _suppress_print() 43 | return 44 | 45 | logger = logging.getLogger() 46 | logger.setLevel(logging.INFO) 47 | logger.propagate = False 48 | plain_formatter = logging.Formatter( 49 | "[%(asctime)s][%(levelname)s] %(name)s: %(lineno)4d: %(message)s", 50 | datefmt="%m/%d %H:%M:%S", 51 | ) 52 | 53 | if du.is_master_proc(du.get_world_size()): 54 | ch = logging.StreamHandler(stream=sys.stdout) 55 | ch.setLevel(logging.DEBUG) 56 | ch.setFormatter(plain_formatter) 57 | logger.addHandler(ch) 58 | 59 | if log_file is not None and du.is_master_proc(du.get_world_size()): 60 | filename = os.path.join(cfg.OUTPUT_DIR, log_file) 61 | fh = logging.FileHandler(filename) 62 | fh.setLevel(logging.DEBUG) 63 | fh.setFormatter(plain_formatter) 64 | logger.addHandler(fh) 65 | 66 | 67 | def get_logger(name): 68 | """ 69 | Retrieve the logger with the specified name or, if name is None, return a 70 | logger which is the root logger of the hierarchy. 71 | Args: 72 | name (string): name of the logger. 73 | """ 74 | return logging.getLogger(name) 75 | 76 | 77 | def log_json_stats(stats): 78 | """ 79 | Logs json stats. 80 | Args: 81 | stats (dict): a dictionary of statistical information to log. 82 | """ 83 | stats = { 84 | k: decimal.Decimal("{:.6f}".format(v)) if isinstance(v, float) else v 85 | for k, v in stats.items() 86 | } 87 | json_stats = simplejson.dumps(stats, sort_keys=True, use_decimal=True) 88 | logger = get_logger(__name__) 89 | logger.info("{:s}".format(json_stats)) 90 | -------------------------------------------------------------------------------- /tadaconv/utils/registry.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Registry class. """ 5 | 6 | class Registry(object): 7 | """ 8 | The Registry class provides a registry for all things 9 | To initialize: 10 | REGISTRY = Registry() 11 | 12 | To register a tracker: 13 | @REGISTRY.register() 14 | class Model(): 15 | ... 16 | """ 17 | 18 | def __init__(self, table_name=""): 19 | """ 20 | Initializes the registry. 21 | Args: 22 | table_name (str): specifies the name of the registry 23 | """ 24 | self._entry_map = {} 25 | self.table_name = table_name 26 | 27 | 28 | def _register(self, name, entry): 29 | """ 30 | Registers the instance. 31 | Args: 32 | name (str): name of the entry 33 | entry (): instance of the entry, could be any type 34 | """ 35 | assert type(name) is str 36 | assert (name not in self._entry_map.keys()), "{} {} already registered.".format( 37 | self.table_name, name 38 | ) 39 | self._entry_map[name] = entry 40 | 41 | def register(self): 42 | """ 43 | Wrapper function for registering a module. 44 | """ 45 | def reg(obj): 46 | name = obj.__name__ 47 | self._register(name, obj) 48 | return obj 49 | return reg 50 | 51 | def get(self, name): 52 | """ 53 | Returns the instance specified by the name. 54 | Args: 55 | name (str): name of the specified instance. 
56 | """ 57 | if name not in self._entry_map.keys(): 58 | return None 59 | obj = self._entry_map.get(name) 60 | return obj 61 | 62 | def get_all_registered(self): 63 | """ 64 | Prints all registered class. 65 | """ 66 | return self._entry_map.keys() -------------------------------------------------------------------------------- /tadaconv/utils/sampler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Multi-fold distributed sampler.""" 5 | 6 | import math 7 | import torch 8 | import torch.distributed as dist 9 | from torch.utils.data.sampler import Sampler 10 | 11 | 12 | class MultiFoldDistributedSampler(Sampler): 13 | """Modified from DistributedSampler, which performs multi fold training for 14 | accelerating distributed training with large batches. 15 | 16 | Sampler that restricts data loading to a subset of the dataset. 17 | 18 | It is especially useful in conjunction with 19 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 20 | process can pass a DistributedSampler instance as a DataLoader sampler, 21 | and load a subset of the original dataset that is exclusive to it. 22 | 23 | .. note:: 24 | Dataset is assumed to be of constant size. 25 | 26 | Arguments: 27 | dataset: Dataset used for sampling. 28 | num_replicas (optional): Number of processes participating in 29 | distributed training. 30 | rank (optional): Rank of the current process within num_replicas. 31 | shuffle (optional): If true (default), sampler will shuffle the indices 32 | 33 | .. warning:: 34 | In distributed mode, calling the ``set_epoch`` method is needed to 35 | make shuffling work; each process will use the same random seed 36 | otherwise. 37 | 38 | Example:: 39 | 40 | >>> sampler = DistributedSampler(dataset) if is_distributed else None 41 | >>> loader = DataLoader(dataset, shuffle=(sampler is None), 42 | ... sampler=sampler) 43 | >>> for epoch in range(start_epoch, n_epochs): 44 | ... if is_distributed: 45 | """ 46 | 47 | def __init__(self, dataset, num_folds=1, num_replicas=None, rank=None, shuffle=True): 48 | """ 49 | When num_folds = 1, MultiFoldDistributedSampler degenerates to DistributedSampler. 
50 | """ 51 | if num_replicas is None: 52 | if not dist.is_available(): 53 | raise RuntimeError("Requires distributed package to be available") 54 | num_replicas = dist.get_world_size() 55 | if rank is None: 56 | if not dist.is_available(): 57 | raise RuntimeError("Requires distributed package to be available") 58 | rank = dist.get_rank() 59 | self.dataset = dataset 60 | self.num_folds = num_folds 61 | self.num_replicas = num_replicas 62 | self.rank = rank 63 | self.epoch = 0 64 | self.num_samples = int(math.ceil(len(self.dataset) * self.num_folds * 1.0 / self.num_replicas)) 65 | self.total_size = self.num_samples * self.num_replicas 66 | self.shuffle = shuffle 67 | 68 | def __iter__(self): 69 | # deterministically shuffle based on epoch 70 | indices = [] 71 | for fold_idx in range(self.num_folds): 72 | g = torch.Generator() 73 | g.manual_seed(self.epoch+fold_idx) 74 | if self.shuffle: 75 | indices += torch.randperm(len(self.dataset), generator=g).tolist() 76 | else: 77 | indices += list(range(len(self.dataset))) 78 | 79 | 80 | # add extra samples to make it evenly divisible 81 | indices += indices[:(self.total_size - len(indices))] 82 | assert len(indices) == self.total_size 83 | 84 | # subsample 85 | indices = indices[self.rank:self.total_size:self.num_replicas] 86 | assert len(indices) == self.num_samples 87 | 88 | return iter(indices) 89 | 90 | def __len__(self): 91 | return self.num_samples 92 | 93 | def set_epoch(self, epoch): 94 | self.epoch = epoch -------------------------------------------------------------------------------- /tadaconv/utils/tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def tensor2cuda(data): 5 | """ 6 | Put Tensor in iterable data into gpu. 7 | Args: 8 | data :(tensor or list or dict) 9 | """ 10 | if type(data) == torch.Tensor: 11 | return data.cuda(non_blocking=True) 12 | elif type(data) == dict: 13 | keys = list(data.keys()) 14 | for k in keys: 15 | data[k] = tensor2cuda(data[k]) 16 | elif type(data) == list: 17 | for i in range(len(data)): 18 | data[i] = tensor2cuda(data[i]) 19 | return data 20 | -------------------------------------------------------------------------------- /tadaconv/utils/timer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Timer class. """ 5 | 6 | from time import perf_counter 7 | from typing import Optional 8 | 9 | 10 | class Timer: 11 | """ 12 | A timer which computes the time elapsed since the start/reset of the timer. 13 | """ 14 | 15 | def __init__(self) -> None: 16 | self.reset() 17 | 18 | def reset(self) -> None: 19 | """ 20 | Reset the timer. 21 | """ 22 | self._start = perf_counter() 23 | self._paused: Optional[float] = None 24 | self._total_paused = 0 25 | self._count_start = 1 26 | 27 | def pause(self) -> None: 28 | """ 29 | Pause the timer. 30 | """ 31 | if self._paused is not None: 32 | raise ValueError("Trying to pause a Timer that is already paused!") 33 | self._paused = perf_counter() 34 | 35 | def is_paused(self) -> bool: 36 | """ 37 | Returns: 38 | bool: whether the timer is currently paused 39 | """ 40 | return self._paused is not None 41 | 42 | def resume(self) -> None: 43 | """ 44 | Resume the timer. 
45 | """ 46 | if self._paused is None: 47 | raise ValueError("Trying to resume a Timer that is not paused!") 48 | self._total_paused += perf_counter() - self._paused # pyre-ignore 49 | self._paused = None 50 | self._count_start += 1 51 | 52 | def seconds(self) -> float: 53 | """ 54 | Returns: 55 | (float): the total number of seconds since the start/reset of the 56 | timer, excluding the time when the timer is paused. 57 | """ 58 | if self._paused is not None: 59 | end_time: float = self._paused # type: ignore 60 | else: 61 | end_time = perf_counter() 62 | return end_time - self._start - self._total_paused 63 | 64 | def avg_seconds(self) -> float: 65 | """ 66 | Returns: 67 | (float): the average number of seconds between every start/reset and 68 | pause. 69 | """ 70 | return self.seconds() / self._count_start -------------------------------------------------------------------------------- /tadaconv/utils/val_dist_sampler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Multi-fold distributed sampler.""" 5 | 6 | import math 7 | import torch 8 | import tadaconv.utils.distributed as dist 9 | from torch.utils.data.distributed import DistributedSampler 10 | 11 | import tadaconv.utils.logging as logging 12 | logger = logging.get_logger(__name__) 13 | 14 | 15 | class MultiSegValDistributedSampler(DistributedSampler): 16 | """Modified from DistributedSampler, which performs multi fold training for 17 | accelerating distributed training with large batches. 18 | 19 | Sampler that restricts data loading to a subset of the dataset. 20 | 21 | It is especially useful in conjunction with 22 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 23 | process can pass a DistributedSampler instance as a DataLoader sampler, 24 | and load a subset of the original dataset that is exclusive to it. 25 | 26 | .. note:: 27 | Dataset is assumed to be of constant size. 28 | 29 | Arguments: 30 | dataset: Dataset used for sampling. 31 | num_replicas (optional): Number of processes participating in 32 | distributed training. 33 | rank (optional): Rank of the current process within num_replicas. 34 | shuffle (optional): If true (default), sampler will shuffle the indices 35 | 36 | .. warning:: 37 | In distributed mode, calling the ``set_epoch`` method is needed to 38 | make shuffling work; each process will use the same random seed 39 | otherwise. 40 | 41 | Example:: 42 | 43 | >>> sampler = MultiSegValDistributedSampler(dataset) if is_distributed else None 44 | >>> loader = DataLoader(dataset, shuffle=(sampler is None), 45 | ... sampler=sampler) 46 | >>> for epoch in range(start_epoch, n_epochs): 47 | ... if is_distributed: 48 | """ 49 | 50 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 51 | """ 52 | We divide each video in epic dataset into multiple sliding windows. 53 | Each sliding window is a sample in validation process for efficient. 54 | This function will assign the sliding windows which belong to the same video to a same gpu. 
55 | """ 56 | if num_replicas is None: 57 | num_replicas = dist.get_world_size() 58 | if rank is None: 59 | rank = dist.get_rank() 60 | self.dataset = dataset 61 | self.num_replicas = num_replicas 62 | self.rank = rank 63 | self.epoch = 0 64 | assert shuffle is False 65 | self.shuffle = shuffle 66 | vid_name_dict = {} 67 | self.vid_name_list = [] 68 | self.vid_num_list = [] 69 | for s in dataset._samples: 70 | if s[0] not in vid_name_dict: 71 | vid_name_dict[s[0]] = 0 72 | self.vid_name_list += [s[0]] 73 | self.vid_num_list += [0] 74 | self.vid_num_list[-1] += 1 75 | self.num_samples = int(math.ceil(len(self.vid_name_list) * 1.0 / self.num_replicas)) 76 | self.total_size = self.num_samples * self.num_replicas 77 | self.__init_dist__() 78 | 79 | def __init_dist__(self): 80 | indices = list(range(len(self.vid_name_list))) 81 | # add extra samples to make it evenly divisible 82 | indices += indices[:(self.total_size - len(indices))] 83 | assert len(indices) == self.total_size 84 | 85 | # subsample 86 | indices = indices[self.rank:self.total_size:self.num_replicas] 87 | assert len(indices) == self.num_samples 88 | self.true_indices = [] 89 | for ind in indices: 90 | if ind == 0: 91 | exist_num = 0 92 | else: 93 | exist_num = sum(self.vid_num_list[:ind]) 94 | self.true_indices.extend(list(range(exist_num, exist_num+self.vid_num_list[ind]))) 95 | 96 | def __iter__(self): 97 | return iter(self.true_indices) 98 | 99 | def __len__(self): 100 | return len(self.true_indices) 101 | 102 | def set_epoch(self, epoch): 103 | self.epoch = epoch 104 | --------------------------------------------------------------------------------