├── configs ├── init ├── projects │ ├── mosi │ │ ├── pt-hmdb │ │ │ ├── r2d3ds.yaml │ │ │ └── r2p1d.yaml │ │ ├── pt-ucf │ │ │ ├── r2d3ds.yaml │ │ │ └── r2p1d.yaml │ │ ├── ft-hmdb │ │ │ ├── r2d3ds.yaml │ │ │ ├── r2p1d.yaml │ │ │ ├── r2p1d_test.yaml │ │ │ └── r2d3ds_test.yaml │ │ ├── ft-ucf │ │ │ ├── r2d3ds.yaml │ │ │ ├── r2p1d.yaml │ │ │ ├── r2d3ds_test.yaml │ │ │ └── r2p1d_test.yaml │ │ ├── pt-imagenet │ │ │ └── r2d3ds.yaml │ │ ├── baselines │ │ │ ├── r2p1d_hmdb.yaml │ │ │ ├── r2p1d_ucf.yaml │ │ │ ├── r2d3ds_hmdb.yaml │ │ │ └── r2d3ds_ucf.yaml │ │ ├── ft_r2p1d_ucf.yaml │ │ ├── ft_r2d3ds_hmdb.yaml │ │ ├── ft_r2d3ds_ucf.yaml │ │ ├── ft_r2p1d_hmdb.yaml │ │ ├── mosi_r2p1d_hmdb.yaml │ │ ├── mosi_r2p1d_ucf.yaml │ │ ├── mosi_r2d3ds_hmdb.yaml │ │ ├── mosi_r2d3ds_ucf.yaml │ │ └── mosi_r2d3ds_imagenet.yaml │ ├── tada │ │ ├── ssv2 │ │ │ ├── tada2d_8f.yaml │ │ │ └── tada2d_16f.yaml │ │ ├── k400 │ │ │ ├── tada2d_8x8.yaml │ │ │ └── tada2d_16x5.yaml │ │ ├── tada2d_k400.yaml │ │ ├── tada2d_ssv2.yaml │ │ └── csn_ek100.yaml │ ├── epic-kitchen-ar │ │ ├── k400 │ │ │ ├── vivit_fac_enc_b16x2_test.yaml │ │ │ └── vivit_fac_enc_b16x2.yaml │ │ ├── ek100 │ │ │ ├── csn_submit.yaml │ │ │ ├── csn_test.yaml │ │ │ ├── vivit_fac_enc_submit.yaml │ │ │ ├── vivit_fac_enc_test.yaml │ │ │ ├── csn.yaml │ │ │ └── vivit_fac_enc.yaml │ │ ├── vivit_fac_enc_k400.yaml │ │ ├── vivit_fac_enc_ek100_submission.yaml │ │ ├── csn_ek100_submission.yaml │ │ ├── vivit_fac_enc_ek100.yaml │ │ └── csn_ek100.yaml │ ├── epic-kitchen-tal │ │ ├── bmn-epic │ │ │ └── vivit-os-local.yaml │ │ └── bmn_epic.yaml │ └── hyrsm │ │ ├── OTAM_base.yaml │ │ ├── ssv2_full │ │ ├── HyRSM_SSv2_Full_2shot_v1.yaml │ │ ├── HyRSM_SSv2_Full_3shot_v1.yaml │ │ ├── HyRSM_SSv2_Full_4shot_v1.yaml │ │ ├── HyRSM_SSv2_Full_5shot_v1.yaml │ │ └── HyRSM_SSv2_Full_1shot_v1.yaml │ │ ├── ssv2_small │ │ ├── HyRSM_SSv2_Small_2shot_v1.yaml │ │ ├── HyRSM_SSv2_Small_3shot_v1.yaml │ │ ├── HyRSM_SSv2_Small_4shot_v1.yaml │ │ ├── HyRSM_SSv2_Small_5shot_v1.yaml │ │ └── HyRSM_SSv2_Small_1shot_v1.yaml │ │ ├── ucf101 │ │ ├── HyRSM_UCF101_2shot_v1.yaml │ │ ├── HyRSM_UCF101_3shot_v1.yaml │ │ ├── HyRSM_UCF101_4shot_v1.yaml │ │ ├── HyRSM_UCF101_5shot_v1.yaml │ │ └── HyRSM_UCF101_1shot_v1.yaml │ │ ├── hmdb51 │ │ ├── HyRSM_HMDB51_3shot_v1.yaml │ │ ├── HyRSM_HMDB51_4shot_v1.yaml │ │ ├── HyRSM_HMDB51_5shot_v1.yaml │ │ ├── HyRSM_HMDB51_2shot_v1.yaml │ │ └── HyRSM_HMDB51_1shot_v1.yaml │ │ ├── kinetics100 │ │ ├── HyRSM_K100_2shot_v1.yaml │ │ ├── HyRSM_K100_3shot_v1.yaml │ │ ├── HyRSM_K100_4shot_v1.yaml │ │ ├── HyRSM_K100_5shot_v1.yaml │ │ └── HyRSM_K100_1shot_v1.yaml │ │ └── epic_kitchens │ │ ├── HyRSM_Epic_2shot_v1.yaml │ │ ├── HyRSM_Epic_3shot_v1.yaml │ │ ├── HyRSM_Epic_4shot_v1.yaml │ │ ├── HyRSM_Epic_5shot_v1.yaml │ │ └── HyRSM_Epic_1shot_v1.yaml └── pool │ ├── backbone │ ├── localization-conv.yaml │ ├── s3dg.yaml │ ├── timesformer.yaml │ ├── vivit.yaml │ ├── vivit_fac_enc.yaml │ ├── csn.yaml │ ├── r2d3ds.yaml │ ├── r2p1d.yaml │ ├── tada2d.yaml │ ├── slowfast_8x8.yaml │ └── slowfast_4x16.yaml │ ├── run │ └── training │ │ ├── from_scratch.yaml │ │ ├── from_scratch_large.yaml │ │ ├── finetune.yaml │ │ ├── localization.yaml │ │ └── mosi.yaml │ └── base.yaml ├── models ├── __init__.py ├── module_zoo │ ├── stems │ │ ├── __init__.py │ │ ├── downsample_stem.py │ │ ├── r2plus1d_stem.py │ │ └── embedding_stem.py │ ├── __init__.py │ ├── heads │ │ ├── __init__.py │ │ └── transformer_head.py │ └── branches │ │ ├── __init__.py │ │ ├── non_local.py │ │ ├── csn_branch.py │ │ ├── slowfast_branch.py │ │ └── r2d3d_branch.py 
├── base │ ├── __init__.py │ ├── builder.py │ └── models.py └── utils │ ├── model_ema.py │ ├── lr_policy.py │ ├── params.py │ └── lars.py ├── utils ├── __init__.py ├── tensor.py ├── eval_tal │ └── eval_tal.py ├── bboxes_1d.py ├── registry.py ├── timer.py ├── logging.py └── launcher.py ├── datasets ├── __init__.py ├── utils │ ├── __init__.py │ └── collate_functions.py └── base │ ├── __init__.py │ └── builder.py ├── sslgenerators ├── __init__.py └── builder.py ├── HyRSM_arch.png ├── projects ├── mosi │ ├── MoSI.png │ └── README.md ├── tada │ ├── TAda2D.png │ └── README.md ├── epic-kitchen-tal │ └── README.md └── epic-kitchen-ar │ └── README.md ├── FEATURE_ZOO.md ├── environment.yaml ├── GUIDELINES.md ├── MODEL_ZOO.md ├── runs └── run.py └── README.md /configs/init: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sslgenerators/__init__.py: -------------------------------------------------------------------------------- 1 | from .mosi.mosi_generator import MoSIGenerator -------------------------------------------------------------------------------- /HyRSM_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alibaba-mmai-research/HyRSM/HEAD/HyRSM_arch.png -------------------------------------------------------------------------------- /projects/mosi/MoSI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alibaba-mmai-research/HyRSM/HEAD/projects/mosi/MoSI.png -------------------------------------------------------------------------------- /projects/tada/TAda2D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alibaba-mmai-research/HyRSM/HEAD/projects/tada/TAda2D.png -------------------------------------------------------------------------------- /configs/projects/mosi/pt-hmdb/r2d3ds.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../mosi_r2d3ds_hmdb.yaml 2 | TRAIN: 3 | EVAL_PERIOD: 10 4 | OUTPUT_DIR: output/r2d3ds_pt_hmdb -------------------------------------------------------------------------------- /configs/projects/mosi/pt-ucf/r2d3ds.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../mosi_r2d3ds_ucf.yaml 2 | TRAIN: 3 | EVAL_PERIOD: 10 4 | OUTPUT_DIR: output/r2d3ds_pt_ucf -------------------------------------------------------------------------------- /configs/projects/mosi/pt-ucf/r2p1d.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../mosi_r2p1d_ucf.yaml 2 | TRAIN: 3 | 
EVAL_PERIOD: 10 4 | OUTPUT_DIR: output/r2p1d_pt_ucf 5 | -------------------------------------------------------------------------------- /configs/projects/mosi/pt-hmdb/r2p1d.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../mosi_r2p1d_hmdb.yaml 2 | TRAIN: 3 | EVAL_PERIOD: 10 4 | OUTPUT_DIR: output/r2p1d_pt_hmdb 5 | -------------------------------------------------------------------------------- /models/module_zoo/stems/__init__.py: -------------------------------------------------------------------------------- 1 | from .downsample_stem import DownSampleStem 2 | from .r2plus1d_stem import R2Plus1DStem 3 | from .embedding_stem import PatchEmbedStem, TubeletEmbeddingStem -------------------------------------------------------------------------------- /configs/projects/mosi/ft-hmdb/r2d3ds.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2d3ds_hmdb.yaml 2 | TRAIN: 3 | CHECKPOINT_FILE_PATH: ./checkpoints/r2d3ds_pt_hmdb_mosi_public.pyth 4 | OUTPUT_DIR: output/r2d3ds_mosi_ft_hmdb -------------------------------------------------------------------------------- /configs/projects/mosi/ft-hmdb/r2p1d.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2p1d_hmdb.yaml 2 | TRAIN: 3 | CHECKPOINT_FILE_PATH: ./checkpoints/r2p1d_pt_hmdb_mosi_public.pyth 4 | OUTPUT_DIR: output/r2p1d_mosi_ft_hmdb -------------------------------------------------------------------------------- /configs/projects/mosi/ft-ucf/r2d3ds.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2d3ds_ucf.yaml 2 | TRAIN: 3 | CHECKPOINT_FILE_PATH: ./checkpoints/r2d3ds_pt_ucf_mosi_public.pyth 4 | OUTPUT_DIR: output/r2d3ds_mosi_ft_ucf -------------------------------------------------------------------------------- /configs/projects/mosi/ft-ucf/r2p1d.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2p1d_ucf.yaml 2 | TRAIN: 3 | CHECKPOINT_FILE_PATH: ./checkpoints/r2p1d_pt_ucf_mosi_public.pyth 4 | OUTPUT_DIR: output/r2p1d_mosi_ft_ucf 5 | -------------------------------------------------------------------------------- /configs/projects/mosi/pt-imagenet/r2d3ds.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../mosi_r2d3ds_imagenet.yaml 2 | TRAIN: 3 | EVAL_PERIOD: 10 4 | PRETRAIN: 5 | IMAGENET_DATA_SIZE: 5 6 | OUTPUT_DIR: output/r2d3ds_pt_imagenet -------------------------------------------------------------------------------- /configs/projects/mosi/ft-hmdb/r2p1d_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2p1d_hmdb.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/r2p1d_pt_hmdb_ft_hmdb_5183_public.pyth 5 | OUTPUT_DIR: output/r2p1d_mosi_ft_hmdb_test -------------------------------------------------------------------------------- /configs/projects/mosi/ft-ucf/r2d3ds_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2d3ds_ucf.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/r2d3ds_pt_ucf_ft_ucf_7175_public.pyth 5 | OUTPUT_DIR: output/r2d3ds_mosi_ft_ucf_test -------------------------------------------------------------------------------- /configs/projects/mosi/ft-ucf/r2p1d_test.yaml: -------------------------------------------------------------------------------- 1 | 
_BASE: ../ft_r2p1d_ucf.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/r2p1d_pt_ucf_ft_ucf_8279_public.pyth 5 | OUTPUT_DIR: output/r2p1d_mosi_ft_ucf_test 6 | -------------------------------------------------------------------------------- /configs/projects/mosi/ft-hmdb/r2d3ds_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../ft_r2d3ds_hmdb.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/r2d3ds_pt_hmdb_ft_hmdb_4693_public.pyth 5 | OUTPUT_DIR: output/r2d3ds_mosi_ft_hmdb_test -------------------------------------------------------------------------------- /configs/projects/tada/ssv2/tada2d_8f.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../tada2d_ssv2.yaml 2 | TRAIN: 3 | FINE_TUNE: true 4 | CHECKPOINT_FILE_PATH: "" # pretrained imagenet weights 5 | DATA: 6 | NUM_INPUT_FRAMES: 8 7 | OUTPUT_DIR: output/tada2d_ssv2_8f -------------------------------------------------------------------------------- /models/module_zoo/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | from models.module_zoo.heads import * 5 | from models.module_zoo.stems import * 6 | from models.module_zoo.branches import * 7 | -------------------------------------------------------------------------------- /configs/projects/tada/ssv2/tada2d_16f.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../tada2d_ssv2.yaml 2 | TRAIN: 3 | FINE_TUNE: true 4 | BATCH_SIZE: 8 5 | CHECKPOINT_FILE_PATH: "" # pretrained imagenet weights 6 | DATA: 7 | NUM_INPUT_FRAMES: 16 8 | OUTPUT_DIR: output/tada2d_ssv2_16f -------------------------------------------------------------------------------- /configs/pool/backbone/localization-conv.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: BaseVideoModel 3 | VIDEO: 4 | DIM1D: 256 5 | DIM2D: 128 6 | DIM3D: 512 7 | BACKBONE_LAYER: 2 8 | BACKBONE_GROUPS_NUM: 4 9 | BACKBONE: 10 | META_ARCH: SimpleLocalizationConv -------------------------------------------------------------------------------- /configs/projects/tada/k400/tada2d_8x8.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../tada2d_k400.yaml 2 | TRAIN: 3 | FINE_TUNE: true 4 | CHECKPOINT_FILE_PATH: "" # pretrained imagenet weights 5 | DATA: 6 | SAMPLING_RATE: 8 7 | NUM_INPUT_FRAMES: 8 8 | OUTPUT_DIR: output/tada2d_8x8_k400 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/k400/vivit_fac_enc_b16x2_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../vivit_fac_enc_k400.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: "./checkpoints/vivit_fac_enc_b16x2_k400_32x224x224_7935_public.pyth" 5 | 6 | OUTPUT_DIR: output/vivit_fac_enc_k400_test -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/csn_submit.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../csn_ek100_submission.yaml 2 | TRAIN: 3 | CHECKPOINT_FILE_PATH: ./checkpoints/csn152_pt_k700_ft_ek100_32x224x224_4452_public.pyth 4 | BATCH_SIZE: 8 5 | TEST: 6 | BATCH_SIZE: 8 7 | OUTPUT_DIR: output/csn_ek100_submit
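# Editorial note (hedged): `_BASE` appears to implement single-parent config inheritance,
# i.e. this file pulls every key from ../csn_ek100_submission.yaml and then overrides the
# keys written here (TRAIN.CHECKPOINT_FILE_PATH, both BATCH_SIZE fields, and OUTPUT_DIR).
# A minimal, purely hypothetical override on the same parent would be:
#   _BASE: ../csn_ek100_submission.yaml
#   TEST:
#     BATCH_SIZE: 16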
-------------------------------------------------------------------------------- /models/module_zoo/heads/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | from .mosi_head import MoSIHeadJoint 5 | from .slowfast_head import SlowFastHead 6 | from .transformer_head import TransformerHead 7 | from .bmn_head import BaseBMN 8 | -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/csn_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../csn_ek100.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/csn152_pt_k700_ft_ek100_32x224x224_4452_public.pyth 5 | BN: 6 | WB_LOCK: false 7 | FREEZE: true 8 | OUTPUT_DIR: output/csn_ek100_test 9 | -------------------------------------------------------------------------------- /models/base/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | import models.module_zoo 5 | from models.base.base_blocks import BaseHead, Base3DStem 6 | import models.base.transformer 7 | import models.base.slowfast 8 | import models.base.few_shot -------------------------------------------------------------------------------- /configs/projects/tada/k400/tada2d_16x5.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../tada2d_k400.yaml 2 | TRAIN: 3 | FINE_TUNE: true 4 | BATCH_SIZE: 8 5 | CHECKPOINT_FILE_PATH: "" # pretrained imagenet weights 6 | 7 | OPTIMIZER: 8 | BASE_LR: 0.12 9 | 10 | DATA: 11 | SAMPLING_RATE: 5 12 | NUM_INPUT_FRAMES: 16 13 | 14 | OUTPUT_DIR: output/tada2d_16x5_k400 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-tal/bmn-epic/vivit-os-local.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../bmn_epic.yaml 2 | TRAIN: 3 | ENABLE: true 4 | BATCH_SIZE: 4 5 | CHECKPOINT_FILE_PATH: "" 6 | TEST: 7 | ENABLE: true 8 | BATCH_SIZE: 4 9 | TEST_CHECKPOINT: [9] 10 | CHECKPOINT_FILE_PATH: "" 11 | OUTPUT_DIR: /mnt/data-nas/qingzhiwu/results/checkpoints/epic_tal/vvt-os/ 12 | 13 | -------------------------------------------------------------------------------- /datasets/base/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 
3 | 4 | from .ucf101 import Ucf101 5 | from .hmdb51 import Hmdb51 6 | from .kinetics400 import Kinetics400 7 | from .kinetics700 import Kinetics700 8 | from .ssv2 import Ssv2 9 | from .imagenet import Imagenet 10 | from .epickitchen100_feature import Epickitchen100localization 11 | from .epickitchen100 import Epickitchen100 12 | from .ssv2_few_shot import Ssv2_few_shot 13 | -------------------------------------------------------------------------------- /configs/pool/backbone/s3dg.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: S3DG 3 | VIDEO: 4 | BACKBONE: 5 | META_ARCH: Inception3D 6 | NUM_OUT_FEATURES: 1024 7 | NUM_STREAMS: 1 8 | BRANCH: 9 | NAME: STConv3d 10 | GATING: true 11 | STEM: 12 | NAME: STConv3d 13 | NONLOCAL: 14 | ENABLE: false 15 | STAGES: [5] 16 | MASK_ENABLE: false 17 | HEAD: 18 | NAME: BaseHead 19 | ACTIVATION: softmax 20 | DROPOUT_RATE: 0 21 | NUM_CLASSES: # !!! -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/vivit_fac_enc_submit.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../vivit_fac_enc_ek100_submission.yaml 2 | TRAIN: 3 | CHECKPOINT_PERIOD: 1 4 | EVAL_PERIOD: 1 5 | CHECKPOINT_FILE_PATH: ./checkpoints/vivit_fac_enc_b16x2_pt_k700_ft_ek100_32x224x224_4630_public.pyth 6 | FINE_TUNE: true 7 | BATCH_SIZE: 4 8 | 9 | DATA: 10 | TRAIN_JITTER_SCALES: [336, 448] 11 | TRAIN_CROP_SIZE: 320 12 | TEST_SCALE: 320 13 | TEST_CROP_SIZE: 320 14 | 15 | DATA_LOADER: 16 | NUM_WORKERS: 8 17 | 18 | OUTPUT_DIR: output/vivit_fac_enc_ek100_submit -------------------------------------------------------------------------------- /utils/tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def tensor2cuda(data): 5 | """ 6 | Recursively move all tensors in `data` onto the GPU. 7 | Args: 8 | data (torch.Tensor, list, or dict): possibly nested container of tensors. 9 | """ 10 | if type(data) == torch.Tensor: 11 | return data.cuda(non_blocking=True) 12 | elif type(data) == dict: 13 | keys = list(data.keys()) 14 | for k in keys: 15 | data[k] = tensor2cuda(data[k]) 16 | elif type(data) == list: 17 | for i in range(len(data)): 18 | data[i] = tensor2cuda(data[i]) 19 | return data 20 | -------------------------------------------------------------------------------- /models/module_zoo/branches/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited.
3 | 4 | from models.module_zoo.branches.r2plus1d_branch import R2Plus1DBranch 5 | from models.module_zoo.branches.r2d3d_branch import R2D3DBranch 6 | from models.module_zoo.branches.csn_branch import CSNBranch 7 | from models.module_zoo.branches.slowfast_branch import SlowfastBranch 8 | from models.module_zoo.branches.s3dg_branch import STConv3d 9 | from models.module_zoo.branches.non_local import NonLocal 10 | from models.module_zoo.branches.tada_branch import TAdaConv2d -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/vivit_fac_enc_test.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../vivit_fac_enc_ek100.yaml 2 | TRAIN: 3 | ENABLE: false 4 | CHECKPOINT_FILE_PATH: ./checkpoints/vivit_fac_enc_b16x2_pt_k700_ft_ek100_32x224x224_4630_public.pyth 5 | CHECKPOINT_PRE_PROCESS: 6 | ENABLE: true 7 | POP_HEAD: true 8 | POS_EMBED: super-resolution 9 | PATCH_EMBD: 10 | 11 | DATA: 12 | TRAIN_JITTER_SCALES: [336, 448] 13 | TRAIN_CROP_SIZE: 320 14 | TEST_SCALE: 320 15 | TEST_CROP_SIZE: 320 16 | 17 | DATA_LOADER: 18 | NUM_WORKERS: 8 19 | 20 | OUTPUT_DIR: output/vivit_fac_enc_ek100_test -------------------------------------------------------------------------------- /configs/projects/mosi/baselines/r2p1d_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../../pool/run/training/from_scratch.yaml 2 | _BASE_MODEL: ../../../pool/backbone/r2p1d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: HMDB51 9 | CHECKPOINT_FILE_PATH: "" 10 | BATCH_SIZE: 48 11 | TEST: 12 | ENABLE: true 13 | DATASET: HMDB51 14 | BATCH_SIZE: 48 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 18 | VIDEO: 19 | HEAD: 20 | NUM_CLASSES: 51 21 | DROPOUT_RATE: 0.5 22 | OUTPUT_DIR: output/r2p1d_hmdb_from_scratch -------------------------------------------------------------------------------- /configs/projects/mosi/baselines/r2p1d_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../../pool/run/training/from_scratch.yaml 2 | _BASE_MODEL: ../../../pool/backbone/r2p1d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: UCF101 9 | CHECKPOINT_FILE_PATH: "" 10 | BATCH_SIZE: 48 11 | TEST: 12 | ENABLE: true 13 | DATASET: UCF101 14 | BATCH_SIZE: 48 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 18 | VIDEO: 19 | HEAD: 20 | NUM_CLASSES: 101 21 | DROPOUT_RATE: 0.5 22 | OUTPUT_DIR: output/r2p1d_ucf_from_scratch -------------------------------------------------------------------------------- /configs/projects/mosi/baselines/r2d3ds_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../../pool/run/training/from_scratch.yaml 2 | _BASE_MODEL: ../../../pool/backbone/r2d3ds.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: HMDB51 9 | CHECKPOINT_FILE_PATH: "" 10 | BATCH_SIZE: 128 11 | TEST: 12 | ENABLE: true 13 | DATASET: HMDB51 14 | BATCH_SIZE: 128 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 18 | VIDEO: 19 | HEAD: 20 | NUM_CLASSES: 51 21 | DROPOUT_RATE: 0.5 22 | OUTPUT_DIR: output/r2d3ds_hmdb_from_scratch 23 | 
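# Editorial note (a hedged reading of the config system): project configs such as this one
# appear to be composed from two bases -- `_BASE_RUN` supplies the training schedule
# (pool/run/training/from_scratch.yaml) and `_BASE_MODEL` the architecture
# (pool/backbone/r2d3ds.yaml) -- after which the keys written here (PRETRAIN, TRAIN, TEST,
# DATA, VIDEO.HEAD, OUTPUT_DIR) override the merged result. Swapping the backbone is then
# a one-line, hypothetical change:
#   _BASE_MODEL: ../../../pool/backbone/r2p1d.yaml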
-------------------------------------------------------------------------------- /configs/projects/mosi/baselines/r2d3ds_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../../pool/run/training/from_scratch.yaml 2 | _BASE_MODEL: ../../../pool/backbone/r2d3ds.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: UCF101 9 | CHECKPOINT_FILE_PATH: "" 10 | BATCH_SIZE: 128 11 | TEST: 12 | ENABLE: true 13 | DATASET: UCF101 14 | BATCH_SIZE: 128 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 18 | VIDEO: 19 | HEAD: 20 | NUM_CLASSES: 101 21 | DROPOUT_RATE: 0.5 22 | OUTPUT_DIR: output/r2d3ds_ucf_from_scratch 23 | -------------------------------------------------------------------------------- /configs/projects/mosi/ft_r2p1d_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/finetune.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2p1d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: UCF101 9 | CHECKPOINT_FILE_PATH: "" # !!@2 10 | BATCH_SIZE: 48 11 | TEST: 12 | ENABLE: true 13 | DATASET: UCF101 14 | BATCH_SIZE: 48 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 18 | MINUS_INTERVAL: false 19 | VIDEO: 20 | HEAD: 21 | NUM_CLASSES: 101 22 | DROPOUT_RATE: 0.5 23 | OPTIMIZER: 24 | BASE_LR: 0.0015 25 | WARMUP_START_LR: 0.00015 26 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /sslgenerators/builder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Builder for self-supervised generator.""" 5 | 6 | from utils.registry import Registry 7 | 8 | SSL_GENERATOR_REGISTRY = Registry("SSL_Methods") 9 | 10 | def build_ssl_generator(cfg, split): 11 | """ 12 | Entry point to registered self-supervised learning methods. 13 | Returns transformed frames and the self-supervised label. 14 | Args: 15 | split (str): training, validation or test. 
16 | """ 17 | ssl_generator = SSL_GENERATOR_REGISTRY.get(cfg.PRETRAIN.GENERATOR)(cfg, split) 18 | return ssl_generator 19 | -------------------------------------------------------------------------------- /configs/projects/mosi/ft_r2d3ds_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/finetune.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2d3ds.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: HMDB51 9 | CHECKPOINT_FILE_PATH: "" # !!@2 10 | BATCH_SIZE: 128 11 | TEST: 12 | ENABLE: true 13 | DATASET: HMDB51 14 | BATCH_SIZE: 128 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 18 | MINUS_INTERVAL: false 19 | VIDEO: 20 | HEAD: 21 | NUM_CLASSES: 51 22 | DROPOUT_RATE: 0.5 23 | OPTIMIZER: 24 | BASE_LR: 0.002 25 | WARMUP_START_LR: 0.0002 26 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /configs/projects/mosi/ft_r2d3ds_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/finetune.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2d3ds.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: UCF101 9 | CHECKPOINT_FILE_PATH: "" # !!@2 10 | BATCH_SIZE: 128 11 | TEST: 12 | ENABLE: true 13 | DATASET: UCF101 14 | BATCH_SIZE: 128 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 18 | MINUS_INTERVAL: false 19 | VIDEO: 20 | HEAD: 21 | NUM_CLASSES: 101 22 | DROPOUT_RATE: 0.5 23 | OPTIMIZER: 24 | BASE_LR: 0.004 25 | WARMUP_START_LR: 0.0004 26 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /configs/projects/mosi/ft_r2p1d_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/finetune.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2p1d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: HMDB51 9 | CHECKPOINT_FILE_PATH: "" # !!@2 10 | BATCH_SIZE: 48 11 | TEST: 12 | ENABLE: true 13 | DATASET: HMDB51 14 | BATCH_SIZE: 48 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 18 | MINUS_INTERVAL: false 19 | VIDEO: 20 | HEAD: 21 | NUM_CLASSES: 51 22 | DROPOUT_RATE: 0.5 23 | OPTIMIZER: 24 | BASE_LR: 0.00075 25 | WARMUP_START_LR: 0.000075 26 | NUM_GPUS: 8 -------------------------------------------------------------------------------- /configs/pool/backbone/timesformer.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: timesformer 3 | VIDEO: 4 | BACKBONE: 5 | META_ARCH: Transformer 6 | NUM_FEATURES: 768 7 | NUM_OUT_FEATURES: 768 8 | PATCH_SIZE: 16 9 | DEPTH: 12 10 | NUM_HEADS: 12 11 | DIM_HEAD: 64 12 | ATTN_DROPOUT: 0.1 13 | FF_DROPOUT: 0.1 14 | DROP_PATH: 0.0 15 | PRE_LOGITS: false 16 | STEM: 17 | NAME: PatchEmbedStem 18 | BRANCH: 19 | NAME: TimesformerLayer 20 | NONLOCAL: 21 | ENABLE: false 22 | STAGES: [5] 23 | MASK_ENABLE: false 24 | HEAD: 25 | NAME: TransformerHead 26 | ACTIVATION: softmax 27 | DROPOUT_RATE: 0 28 | NUM_CLASSES: # !!! 
-------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/vivit_fac_enc_k400.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/vivit_fac_enc.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: kinetics400 9 | BATCH_SIZE: 8 10 | CHECKPOINT_FILE_PATH: "" 11 | TEST: 12 | ENABLE: true 13 | DATASET: kinetics400 14 | BATCH_SIZE: 8 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 18 | SAMPLING_RATE: 2 19 | NUM_INPUT_FRAMES: 32 20 | VIDEO: 21 | HEAD: 22 | NUM_CLASSES: 400 23 | DROPOUT_RATE: 0.5 24 | 25 | DATA_LOADER: 26 | NUM_WORKERS: 4 27 | NUM_GPUS: 32 -------------------------------------------------------------------------------- /datasets/utils/collate_functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Collate functions. """ 5 | 6 | import random 7 | from utils.registry import Registry 8 | from torch.utils.data._utils.collate import default_collate 9 | import torch.nn.functional as F 10 | 11 | COLLATE_FN_REGISTRY = Registry() 12 | 13 | @COLLATE_FN_REGISTRY.register() 14 | class ZeroShotCollate(object): 15 | def __init__(self, cfg): 16 | self.cfg = cfg 17 | 18 | def __call__(self, batch): 19 | batch = default_collate(batch) 20 | batch[0]["text_embedding"] = batch[0]["text_embedding"][0].unsqueeze(0) 21 | return batch -------------------------------------------------------------------------------- /FEATURE_ZOO.md: -------------------------------------------------------------------------------- 1 | # FEATURE ZOO 2 | 3 | Here, we provide strong features for temporal action localization on HACS and Epic-Kitchens-100. 4 | 5 | | dataset | model | resolution | features | classification | average mAP | 6 | | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | 7 | | EK100 | ViViT Fact. Enc.-B16x2 | 32 x 2 | [features]() | [classification]() | 18.30 (A) | 8 | | EK100 | TAda2D | 8 x 8 | [features]() | [classification]() | 13.18 | 9 | | HACS | TAda2D | 8 x 8 | [features]() | - | 32.3 | 10 | 11 | Annotations used for temporal action localization with our codebase can be found [here](). 12 | 13 | Pre-trained localization models using these features can be found in [MODEL_ZOO.md](MODEL_ZOO.md). -------------------------------------------------------------------------------- /configs/pool/backbone/vivit.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: vivit 3 | VIDEO: 4 | BACKBONE: 5 | META_ARCH: Transformer 6 | NUM_FEATURES: 768 7 | NUM_OUT_FEATURES: 768 8 | PATCH_SIZE: 16 9 | TUBELET_SIZE: 2 10 | DEPTH: 12 11 | NUM_HEADS: 12 12 | DIM_HEAD: 64 13 | ATTN_DROPOUT: 0.0 14 | FF_DROPOUT: 0.0 15 | DROP_PATH: 0.1 16 | MLP_MULT: 4 17 | STEM: 18 | NAME: TubeletEmbeddingStem 19 | BRANCH: 20 | NAME: BaseTransformerLayer 21 | HEAD: 22 | NAME: TransformerHead 23 | ACTIVATION: softmax 24 | DROPOUT_RATE: 0 25 | NUM_CLASSES: # !!!
26 | PRE_LOGITS: false 27 | TRAIN: 28 | CHECKPOINT_PRE_PROCESS: 29 | ENABLE: true 30 | POP_HEAD: true 31 | POS_EMBED: repeat 32 | PATCH_EMBD: central_frame -------------------------------------------------------------------------------- /configs/pool/backbone/vivit_fac_enc.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: vivit 3 | VIDEO: 4 | BACKBONE: 5 | META_ARCH: FactorizedTransformer 6 | NUM_FEATURES: 768 7 | NUM_OUT_FEATURES: 768 8 | PATCH_SIZE: 16 9 | TUBELET_SIZE: 2 10 | DEPTH: 12 11 | DEPTH_TEMP: 4 12 | NUM_HEADS: 12 13 | DIM_HEAD: 64 14 | ATTN_DROPOUT: 0.0 15 | FF_DROPOUT: 0.0 16 | DROP_PATH: 0.1 17 | MLP_MULT: 4 18 | STEM: 19 | NAME: TubeletEmbeddingStem 20 | BRANCH: 21 | NAME: BaseTransformerLayer 22 | HEAD: 23 | NAME: TransformerHead 24 | ACTIVATION: softmax 25 | DROPOUT_RATE: 0 26 | NUM_CLASSES: # !!! 27 | PRE_LOGITS: false 28 | TRAIN: 29 | CHECKPOINT_PRE_PROCESS: 30 | ENABLE: true 31 | POP_HEAD: true 32 | POS_EMBED: 33 | PATCH_EMBD: central_frame -------------------------------------------------------------------------------- /configs/pool/backbone/csn.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: irCSN 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 152 6 | META_ARCH: ResNet3D 7 | NUM_FILTERS: [64, 256, 512, 1024, 2048] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 2048 10 | KERNEL_SIZE: [ 11 | [3, 7, 7], 12 | [3, 3, 3], 13 | [3, 3, 3], 14 | [3, 3, 3], 15 | [3, 3, 3] 16 | ] 17 | DOWNSAMPLING: [true, false, true, true, true] 18 | DOWNSAMPLING_TEMPORAL: [false, false, true, true, true] 19 | NUM_STREAMS: 1 20 | EXPANSION_RATIO: 4 21 | BRANCH: 22 | NAME: CSNBranch 23 | STEM: 24 | NAME: DownSampleStem 25 | NONLOCAL: 26 | ENABLE: false 27 | STAGES: [5] 28 | MASK_ENABLE: false 29 | HEAD: 30 | NAME: BaseHead 31 | ACTIVATION: softmax 32 | DROPOUT_RATE: 0 33 | NUM_CLASSES: # !!! 34 | -------------------------------------------------------------------------------- /configs/pool/backbone/r2d3ds.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: R2D3D 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 18 6 | META_ARCH: ResNet3D 7 | NUM_FILTERS: [64, 64, 128, 256, 256] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 256 10 | KERNEL_SIZE: [ 11 | [1, 7, 7], 12 | [1, 3, 3], 13 | [1, 3, 3], 14 | [3, 3, 3], 15 | [3, 3, 3] 16 | ] 17 | DOWNSAMPLING: [true, false, true, true, true] 18 | DOWNSAMPLING_TEMPORAL: [false, false, false, true, true] 19 | NUM_STREAMS: 1 20 | EXPANSION_RATIO: 2 21 | BRANCH: 22 | NAME: R2D3DBranch 23 | STEM: 24 | NAME: DownSampleStem 25 | NONLOCAL: 26 | ENABLE: false 27 | STAGES: [5] 28 | MASK_ENABLE: false 29 | HEAD: 30 | NAME: BaseHead 31 | ACTIVATION: softmax 32 | DROPOUT_RATE: 0 33 | NUM_CLASSES: # !!! 
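# Editorial note (hedged): the five entries of KERNEL_SIZE, DOWNSAMPLING and
# DOWNSAMPLING_TEMPORAL above line up with the stem plus the four residual stages. In
# this R2D3D backbone the first three entries use [1, k, k] kernels (purely spatial,
# 2D-style convolutions) while only the last two use full [3, 3, 3] 3D kernels --
# presumably the mixed 2D/3D design the name R2D3D refers to.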
34 | -------------------------------------------------------------------------------- /configs/pool/backbone/r2p1d.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: R2Plus1D 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 10 6 | META_ARCH: ResNet3D 7 | NUM_INPUT_CHANNELS: 3 8 | NUM_FILTERS: [64, 64, 128, 256, 512] 9 | NUM_OUT_FEATURES: 512 10 | KERNEL_SIZE: [ 11 | [3, 7, 7], 12 | [3, 3, 3], 13 | [3, 3, 3], 14 | [3, 3, 3], 15 | [3, 3, 3] 16 | ] 17 | DOWNSAMPLING: [true, false, true, true, true] 18 | DOWNSAMPLING_TEMPORAL: [false, false, true, true, true] 19 | NUM_STREAMS: 1 20 | EXPANSION_RATIO: 2 21 | BRANCH: 22 | NAME: R2Plus1DBranch 23 | STEM: 24 | NAME: R2Plus1DStem 25 | NONLOCAL: 26 | ENABLE: false 27 | STAGES: [5] 28 | MASK_ENABLE: false 29 | HEAD: 30 | NAME: BaseHead 31 | ACTIVATION: softmax 32 | DROPOUT_RATE: 0 33 | NUM_CLASSES: # !!! 34 | -------------------------------------------------------------------------------- /configs/projects/mosi/mosi_r2p1d_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/mosi.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2p1d.yaml 3 | 4 | TRAIN: 5 | ENABLE: true 6 | DATASET: HMDB51 7 | BATCH_SIZE: 5 8 | LOG_FILE: training_log.log 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 20 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: false 20 | DATASET: HMDB51 21 | BATCH_SIZE: 5 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: false 29 | DATA: 30 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 31 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 32 | NUM_GPUS: 16 -------------------------------------------------------------------------------- /configs/projects/mosi/mosi_r2p1d_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/mosi.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2p1d.yaml 3 | 4 | TRAIN: 5 | ENABLE: true 6 | DATASET: UCF101 7 | BATCH_SIZE: 5 8 | LOG_FILE: training_log.log 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 20 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: false 20 | DATASET: UCF101 21 | BATCH_SIZE: 5 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: false 29 | DATA: 30 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 31 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 32 | NUM_GPUS: 16 -------------------------------------------------------------------------------- /configs/projects/mosi/mosi_r2d3ds_hmdb.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/mosi.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2d3ds.yaml 3 | 4 | TRAIN: 5 | ENABLE: true 6 | DATASET: HMDB51 7 | BATCH_SIZE: 10 8 | LOG_FILE: training_log.log 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 20 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | 
CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: false 20 | DATASET: HMDB51 21 | BATCH_SIZE: 10 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: false 29 | DATA: 30 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/hmdb51/videos/ 31 | ANNO_DIR: /mnt/ziyuan/ziyuan/hmdb51/anno_lists/ 32 | NUM_GPUS: 16 -------------------------------------------------------------------------------- /configs/projects/mosi/mosi_r2d3ds_ucf.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/mosi.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2d3ds.yaml 3 | 4 | TRAIN: 5 | ENABLE: true 6 | DATASET: UCF101 7 | BATCH_SIZE: 10 8 | LOG_FILE: training_log.log 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 20 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: false 20 | DATASET: UCF101 21 | BATCH_SIZE: 10 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: false 29 | DATA: 30 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ucf101/videos/ 31 | ANNO_DIR: /mnt/ziyuan/ziyuan/ucf101/annotations/ 32 | NUM_GPUS: 16 -------------------------------------------------------------------------------- /models/module_zoo/stems/downsample_stem.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Downsample Stem. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from models.base.base_blocks import Base3DStem 10 | from models.base.base_blocks import STEM_REGISTRY 11 | 12 | @STEM_REGISTRY.register() 13 | class DownSampleStem(Base3DStem): 14 | """ 15 | Inherits the base 3D stem and adds a max-pooling layer for downsampling. 16 | """ 17 | def __init__(self, cfg): 18 | super(DownSampleStem, self).__init__(cfg) 19 | self.maxpool = nn.MaxPool3d( 20 | kernel_size = (1, 3, 3), 21 | stride = (1, 2, 2), 22 | padding = (0, 1, 1) 23 | ) 24 | 25 | def forward(self, x): 26 | x = self.a(x) 27 | x = self.a_bn(x) 28 | x = self.a_relu(x) 29 | x = self.maxpool(x) 30 | return x 31 | 32 | -------------------------------------------------------------------------------- /utils/eval_tal/eval_tal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | import sys 5 | from .eval_epic_detection import Epicdetection 6 | from utils import logging 7 | import numpy as np 8 | import json 9 | logger = logging.get_logger(__name__) 10 | 11 | 12 | def evaluate_detection(video_anno, detection_result_file, tiou_thresholds=np.linspace(0.5, 0.95, 10)): 13 | """ 14 | Evaluate action detection performance. 15 | Args: 16 | video_anno (str): Annotation file path. 17 | detection_result_file (str): The detection results output by your model. 18 | tiou_thresholds (np.array): IoU thresholds to be tested.
19 | """ 20 | detection = Epicdetection(video_anno, detection_result_file, 21 | tiou_thresholds=tiou_thresholds, 22 | subset='validation', verbose=True, check_status=False) 23 | detection.evaluate() 24 | -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/vivit_fac_enc_ek100_submission.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/vivit_fac_enc.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: false 8 | DATASET: epickitchen100 9 | CHECKPOINT_FILE_PATH: "" 10 | TEST: 11 | ENABLE: false 12 | DATASET: epickitchen100 13 | BATCH_SIZE: 4 14 | SUBMISSION: 15 | ENABLE: true 16 | ACTION_CLASS_ENSUMBLE_METHOD: "sum" # sum or calculate 17 | TASK_TYPE: submission 18 | DATA: 19 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/clips_512/ 20 | ANNO_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/annos/epic-kitchens-100-annotations-master/ 21 | NUM_INPUT_FRAMES: 32 22 | SAMPLING_RATE: 2 23 | MULTI_LABEL: true 24 | TARGET_FPS: 60 25 | VIDEO: 26 | HEAD: 27 | NAME: TransformerHeadx2 28 | NUM_CLASSES: [97, 300] 29 | DROPOUT_RATE: 0.5 30 | 31 | DATA_LOADER: 32 | NUM_WORKERS: 10 33 | NUM_GPUS: 32 -------------------------------------------------------------------------------- /configs/pool/backbone/tada2d.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: TAda2D 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 50 6 | META_ARCH: ResNet3D 7 | NUM_FILTERS: [64, 256, 512, 1024, 2048] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 2048 10 | KERNEL_SIZE: [ 11 | [1, 7, 7], 12 | [1, 3, 3], 13 | [1, 3, 3], 14 | [1, 3, 3], 15 | [1, 3, 3] 16 | ] 17 | DOWNSAMPLING: [true, true, true, true, true] 18 | DOWNSAMPLING_TEMPORAL: [false, false, false, false, false] 19 | NUM_STREAMS: 1 20 | EXPANSION_RATIO: 4 21 | INITIALIZATION: kaiming 22 | STEM: 23 | NAME: Base2DStem 24 | BRANCH: 25 | NAME: TAdaConvBlockAvgPool 26 | ROUTE_FUNC_K: [3, 3] 27 | ROUTE_FUNC_R: 4 28 | POOL_K: [3, 1, 1] 29 | NONLOCAL: 30 | ENABLE: false 31 | STAGES: [5] 32 | MASK_ENABLE: false 33 | HEAD: 34 | NAME: BaseHead 35 | ACTIVATION: softmax 36 | DROPOUT_RATE: 0 37 | NUM_CLASSES: # !!! 
38 | -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/csn_ek100_submission.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/csn.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: false 8 | DATASET: epickitchen100 9 | BATCH_SIZE: 8 10 | CHECKPOINT_FILE_PATH: "" 11 | TEST: 12 | ENABLE: false 13 | DATASET: epickitchen100 14 | BATCH_SIZE: 8 15 | SUBMISSION: 16 | ENABLE: true 17 | ACTION_CLASS_ENSUMBLE_METHOD: "sum" # sum or calculate 18 | TASK_TYPE: submission 19 | DATA: 20 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/clips_512/ 21 | ANNO_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/annos/epic-kitchens-100-annotations-master/ 22 | NUM_INPUT_FRAMES: 32 23 | SAMPLING_RATE: 2 24 | TEST_SCALE: 256 25 | TEST_CROP_SIZE: 256 26 | MULTI_LABEL: true 27 | TARGET_FPS: 60 28 | VIDEO: 29 | HEAD: 30 | NAME: BaseHeadx2 31 | NUM_CLASSES: [97, 300] 32 | DROPOUT_RATE: 0.5 33 | DATA_LOADER: 34 | NUM_WORKERS: 4 35 | NUM_GPUS: 32 -------------------------------------------------------------------------------- /configs/projects/mosi/mosi_r2d3ds_imagenet.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/mosi.yaml 2 | _BASE_MODEL: ../../pool/backbone/r2d3ds.yaml 3 | 4 | PRETRAIN: 5 | IMAGENET_DATA_SIZE: 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: imagenet 9 | BATCH_SIZE: 10 10 | LOG_FILE: training_log.log 11 | EVAL_PERIOD: 5 12 | NUM_FOLDS: 20 13 | AUTO_RESUME: true 14 | CHECKPOINT_PERIOD: 10 15 | CHECKPOINT_FILE_PATH: "" # !!@2p 16 | CHECKPOINT_TYPE: pytorch 17 | CHECKPOINT_INFLATE: false 18 | FINE_TUNE: false 19 | ONLY_LINEAR: false 20 | TEST: 21 | ENABLE: false 22 | DATASET: imagenet 23 | BATCH_SIZE: 10 24 | NUM_SPATIAL_CROPS: 1 25 | SPATIAL_CROPS: cc 26 | NUM_ENSEMBLE_VIEWS: 1 27 | LOG_FILE: val.log 28 | CHECKPOINT_FILE_PATH: "" 29 | CHECKPOINT_TYPE: pytorch 30 | AUTOMATIC_MULTI_SCALE_TEST: false 31 | DATA: 32 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/imagenet/ 33 | ANNO_DIR: /mnt/ziyuan/ziyuan/imagenet/ 34 | MEAN: [0.485, 0.456, 0.406] 35 | STD: [0.229, 0.224, 0.225] 36 | NUM_GPUS: 16 -------------------------------------------------------------------------------- /configs/projects/tada/tada2d_k400.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tada2d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: kinetics400 9 | BATCH_SIZE: 16 10 | FINE_TUNE: true 11 | CHECKPOINT_FILE_PATH: "" # !!@2 12 | TEST: 13 | ENABLE: true 14 | DATASET: kinetics400 15 | BATCH_SIZE: 16 16 | DATA: 17 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 18 | ANNO_DIR: /mnt/ziyuan/ziyuan/kinetics400/ 19 | SAMPLING_RATE: 8 20 | NUM_INPUT_FRAMES: 8 21 | TEST_SCALE: 256 22 | TEST_CROP_SIZE: 256 23 | VIDEO: 24 | HEAD: 25 | NUM_CLASSES: 400 26 | DROPOUT_RATE: 0.5 27 | DATA_LOADER: 28 | NUM_WORKERS: 8 29 | OPTIMIZER: 30 | BASE_LR: 0.24 31 | ADJUST_LR: false 32 | LR_POLICY: cosine 33 | MAX_EPOCH: 100 34 | MOMENTUM: 0.9 35 | WEIGHT_DECAY: 1e-4 36 | WARMUP_EPOCHS: 8 37 | WARMUP_START_LR: 0.01 38 | OPTIM_METHOD: sgd 39 | DAMPENING: 0.0 40 | NESTEROV: true 41 | NUM_GPUS: 2 -------------------------------------------------------------------------------- /configs/projects/tada/tada2d_ssv2.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tada2d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: ssv2 9 | BATCH_SIZE: 16 10 | FINE_TUNE: true 11 | CHECKPOINT_FILE_PATH: "" 12 | TEST: 13 | ENABLE: true 14 | DATASET: ssv2 15 | BATCH_SIZE: 16 16 | DATA: 17 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/ssv2/ 18 | ANNO_DIR: /mnt/ziyuan/ziyuan/ssv2/ 19 | NUM_INPUT_FRAMES: 8 20 | SAMPLING_MODE: segment_based 21 | TEST_SCALE: 256 22 | TEST_CROP_SIZE: 256 23 | VIDEO: 24 | HEAD: 25 | NUM_CLASSES: 174 26 | DROPOUT_RATE: 0.5 27 | DATA_LOADER: 28 | NUM_WORKERS: 8 29 | OPTIMIZER: 30 | BASE_LR: 0.48 31 | ADJUST_LR: false 32 | LR_POLICY: cosine 33 | MAX_EPOCH: 64 34 | MOMENTUM: 0.9 35 | WEIGHT_DECAY: 1e-4 36 | WARMUP_EPOCHS: 4 37 | WARMUP_START_LR: 0.0001 38 | OPTIM_METHOD: sgd 39 | DAMPENING: 0.0 40 | NESTEROV: true 41 | AUGMENTATION: 42 | SSV2_FLIP: true 43 | NUM_GPUS: 16 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/csn.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../csn_ek100.yaml 2 | TRAIN: 3 | CHECKPOINT_PERIOD: 1 4 | CHECKPOINT_FILE_PATH: "" # pretrained weights from K400/K700/IG65M... 5 | FINE_TUNE: true 6 | CHECKPOINT_PRE_PROCESS: 7 | ENABLE: true 8 | POP_HEAD: true 9 | POS_EMBED: 10 | PATCH_EMBD: 11 | AUGMENTATION: 12 | COLOR_AUG: true 13 | BRIGHTNESS: 0.5 14 | CONTRAST: 0.5 15 | SATURATION: 0.5 16 | HUE: 0.25 17 | GRAYSCALE: 0.0 18 | CONSISTENT: true 19 | SHUFFLE: false 20 | GRAY_FIRST: false 21 | USE_GPU: false 22 | MIXUP: 23 | ENABLE: true 24 | ALPHA: 0.2 25 | PROB: 1.0 26 | MODE: batch 27 | SWITCH_PROB: 0.5 28 | CUTMIX: 29 | ENABLE: true 30 | ALPHA: 1.0 31 | MINMAX: 32 | RANDOM_ERASING: 33 | ENABLE: true 34 | PROB: 0.25 35 | MODE: pixel 36 | COUNT: [1, 1] 37 | NUM_SPLITS: 0 38 | AREA_RANGE: [0.02, 0.33] 39 | MIN_ASPECT: 0.3 40 | LABEL_SMOOTHING: 0.2 41 | BN: 42 | WB_LOCK: false 43 | FREEZE: true 44 | OUTPUT_DIR: output/csn_ek100 45 | -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/k400/vivit_fac_enc_b16x2.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../vivit_fac_enc_k400.yaml 2 | TRAIN: 3 | CHECKPOINT_PERIOD: 1 4 | EVAL_PERIOD: 1 5 | CHECKPOINT_FILE_PATH: "" # path to the pretrained ImageNet ViT B16 224 model 6 | FINE_TUNE: true 7 | OPTIMIZER: 8 | BASE_LR: 0.0001 9 | ADJUST_LR: false 10 | LR_POLICY: cosine 11 | MAX_EPOCH: 30 12 | MOMENTUM: 0.9 13 | WEIGHT_DECAY: 0.1 14 | WARMUP_EPOCHS: 2.5 15 | WARMUP_START_LR: 0.000001 16 | OPTIM_METHOD: adamw 17 | DAMPENING: 0.0 18 | NESTEROV: true 19 | MODEL: 20 | EMA: 21 | ENABLE: true 22 | DECAY: 0.999 23 | 24 | AUGMENTATION: 25 | COLOR_AUG: true 26 | BRIGHTNESS: 0.5 27 | CONTRAST: 0.5 28 | SATURATION: 0.5 29 | HUE: 0.25 30 | GRAYSCALE: 0.3 31 | CONSISTENT: true 32 | SHUFFLE: true 33 | GRAY_FIRST: true 34 | USE_GPU: false 35 | MIXUP: 36 | ENABLE: true 37 | ALPHA: 0.2 38 | PROB: 1.0 39 | MODE: batch 40 | SWITCH_PROB: 0.5 41 | LABEL_SMOOTHING: 0.1 42 | 43 | VIDEO: 44 | HEAD: 45 | DROPOUT_RATE: 0.0 46 | 47 | OUTPUT_DIR: output/vivit_fac_enc_k400 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/vivit_fac_enc_ek100.yaml:
-------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/vivit_fac_enc.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: epickitchen100 9 | BATCH_SIZE: 8 10 | CHECKPOINT_FILE_PATH: "" 11 | TEST: 12 | ENABLE: true 13 | DATASET: epickitchen100 14 | BATCH_SIZE: 8 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/clips_512/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/annos/epic-kitchens-100-annotations-master/ 18 | NUM_INPUT_FRAMES: 32 19 | SAMPLING_RATE: 2 20 | MULTI_LABEL: true 21 | TARGET_FPS: 60 22 | VIDEO: 23 | HEAD: 24 | NAME: TransformerHeadx2 25 | NUM_CLASSES: [97, 300] 26 | DROPOUT_RATE: 0.5 27 | 28 | DATA_LOADER: 29 | NUM_WORKERS: 4 30 | 31 | OPTIMIZER: 32 | BASE_LR: 0.0001 33 | ADJUST_LR: false 34 | LR_POLICY: cosine 35 | MAX_EPOCH: 50 36 | MOMENTUM: 0.9 37 | WEIGHT_DECAY: 0.05 38 | WARMUP_EPOCHS: 5 39 | WARMUP_START_LR: 0.000001 40 | OPTIM_METHOD: adamw 41 | DAMPENING: 0.0 42 | NESTEROV: true 43 | NUM_GPUS: 32 -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/csn_ek100.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/csn.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: epickitchen100 9 | BATCH_SIZE: 8 10 | CHECKPOINT_FILE_PATH: "" 11 | TEST: 12 | ENABLE: true 13 | DATASET: epickitchen100 14 | BATCH_SIZE: 8 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/clips_512/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/annos/epic-kitchens-100-annotations-master/ 18 | NUM_INPUT_FRAMES: 32 19 | SAMPLING_RATE: 2 20 | TEST_SCALE: 256 21 | TEST_CROP_SIZE: 256 22 | MULTI_LABEL: true 23 | TARGET_FPS: 60 24 | VIDEO: 25 | HEAD: 26 | NAME: BaseHeadx2 27 | NUM_CLASSES: [97, 300] 28 | DROPOUT_RATE: 0.5 29 | DATA_LOADER: 30 | NUM_WORKERS: 4 31 | OPTIMIZER: 32 | BASE_LR: 0.0001 33 | ADJUST_LR: false 34 | LR_POLICY: cosine 35 | MAX_EPOCH: 50 36 | MOMENTUM: 0.9 37 | WEIGHT_DECAY: 0.05 38 | WARMUP_EPOCHS: 5 39 | WARMUP_START_LR: 0.000001 40 | OPTIM_METHOD: adamw 41 | DAMPENING: 0.0 42 | NESTEROV: true 43 | NUM_GPUS: 32 -------------------------------------------------------------------------------- /configs/projects/tada/csn_ek100.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/from_scratch_large.yaml 2 | _BASE_MODEL: ../../pool/backbone/tada2d.yaml 3 | 4 | PRETRAIN: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: true 8 | DATASET: epickitchen100 9 | BATCH_SIZE: 8 10 | CHECKPOINT_FILE_PATH: "" # !!@2 11 | TEST: 12 | ENABLE: true 13 | DATASET: epickitchen100 14 | BATCH_SIZE: 8 15 | DATA: 16 | DATA_ROOT_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/clips_512/ 17 | ANNO_DIR: /mnt/ziyuan/ziyuan/EPIC-KITCHENS-100/annos/epic-kitchens-100-annotations-master/ 18 | NUM_INPUT_FRAMES: 32 19 | SAMPLING_RATE: 2 20 | TEST_SCALE: 256 21 | TEST_CROP_SIZE: 256 22 | MULTI_LABEL: true 23 | TARGET_FPS: 60 24 | VIDEO: 25 | HEAD: 26 | NAME: BaseHeadx2 27 | NUM_CLASSES: [97, 300] 28 | DROPOUT_RATE: 0.5 29 | DATA_LOADER: 30 | NUM_WORKERS: 4 31 | OPTIMIZER: 32 | BASE_LR: 0.0001 33 | ADJUST_LR: false 34 | LR_POLICY: cosine 35 | MAX_EPOCH: 50 36 | MOMENTUM: 0.9 37 | WEIGHT_DECAY: 0.05 38 | WARMUP_EPOCHS: 5 39 | WARMUP_START_LR: 0.000001 40 | OPTIM_METHOD: adamw 41 | DAMPENING: 0.0 42 | NESTEROV: true 43 | NUM_GPUS: 16 -------------------------------------------------------------------------------- /configs/projects/hyrsm/OTAM_base.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/finetune.yaml 2 | _BASE_MODEL: ../../pool/backbone/localization-conv.yaml 3 | 4 | # MULTI_CARD: true # set to true for single-device, multi-card runs 5 | PRETRAIN: 6 | ENABLE: false 7 | TRAIN: 8 | ENABLE: true 9 | BATCH_SIZE: 2048 10 | DATASET: Talfeature 11 | CHECKPOINT_FILE_PATH: "" 12 | EVAL_PERIOD: 1 13 | NUM_FOLDS: 1 14 | TEST: 15 | ENABLE: true 16 | DATASET: Talfeature 17 | BATCH_SIZE: 2048 18 | TEST_SET: val 19 | AUTOMATIC_MULTI_SCALE_TEST: false 20 | UPLOAD_CLASSIFIER_RESULTS: true 21 | DATA: 22 | NORM_FEATURE: false 23 | USE_AUG_FEATURE: false 24 | AUG: false 25 | LOAD_PROPS: false 26 | TEMPORAL_SCALE: 256 27 | NUM_INPUT_CHANNELS: 2304 28 | LABELS_TYPE: cls 29 | LOAD_TYPE: pickle 30 | DOWNLOAD_FEATURE: true 31 | 32 | LOG_PERIOD: 1 33 | SOLVER: 34 | BASE_LR: 0.0002 35 | LR_POLICY: cosine 36 | MAX_EPOCH: 300 37 | MOMENTUM: 0.9 38 | WEIGHT_DECAY: 1e-3 39 | WARMUP_EPOCHS: 10 40 | WARMUP_START_LR: 0.000001 41 | OPTIM_METHOD: adam 42 | DAMPENING: 0.0 43 | NESTEROV: true 44 | 45 | VIDEO: 46 | HEAD: 47 | NAME: BaseTemporalClassifier 48 | NUM_CLASSES: 200 49 | DROPOUT_RATE: 0.0 50 | BACKBONE: 51 | META_ARCH: Identity 52 | PRE_DOWNLOAD: 53 | ENABLE: false 54 | AUGMENTATION: 55 | USE_GPU: false 56 | -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-ar/ek100/vivit_fac_enc.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../vivit_fac_enc_ek100.yaml 2 | TRAIN: 3 | CHECKPOINT_PERIOD: 1 4 | EVAL_PERIOD: 1 5 | CHECKPOINT_FILE_PATH: "" # directory of pretrained models 6 | FINE_TUNE: true 7 | BATCH_SIZE: 4 8 | CHECKPOINT_PRE_PROCESS: 9 | ENABLE: true 10 | POP_HEAD: true 11 | POS_EMBED: super-resolution 12 | PATCH_EMBD: 13 | 14 | DATA: 15 | TRAIN_JITTER_SCALES: [336, 448] 16 | TRAIN_CROP_SIZE: 320 17 | TEST_SCALE: 320 18 | TEST_CROP_SIZE: 320 19 | 20 | AUGMENTATION: 21 | COLOR_AUG: true 22 | BRIGHTNESS: 0.5 23 | CONTRAST: 0.5 24 | SATURATION: 0.5 25 | HUE: 0.25 26 | GRAYSCALE: 0.0 27 | CONSISTENT: true 28 | SHUFFLE: false 29 | GRAY_FIRST: false 30 | USE_GPU: false 31 | MIXUP: 32 | ENABLE: true 33 | ALPHA: 0.2 34 | PROB: 1.0 35 | MODE: batch 36 | SWITCH_PROB: 0.5 37 | CUTMIX: 38 | ENABLE: true 39 | ALPHA: 1.0 40 | MINMAX: 41 | RANDOM_ERASING: 42 | ENABLE: true 43 | PROB: 0.25 44 | MODE: pixel 45 | COUNT: [1, 1] 46 | NUM_SPLITS: 0 47 | AREA_RANGE: [0.02, 0.33] 48 | MIN_ASPECT: 0.3 49 | LABEL_SMOOTHING: 0.2 50 | 51 | VIDEO: 52 | BACKBONE: 53 | DROP_PATH: 0.2 54 | HEAD: 55 | DROPOUT_RATE: 0.0 56 | 57 | DATA_LOADER: 58 | NUM_WORKERS: 8 59 | 60 | OUTPUT_DIR: output/vivit_fac_enc_ek100 -------------------------------------------------------------------------------- /projects/epic-kitchen-tal/README.md: -------------------------------------------------------------------------------- 1 | 2 | # A Stronger Baseline for Ego-Centric Action Detection (CVPR 2021 Workshop) 3 | 4 | 5 | # Running instructions 6 | To train the action localization model, set the `_BASE_RUN` to point to `configs/pool/run/training/localization.yaml`. See `configs/projects/epic-kitchen-tal/bmn_epic.yaml` for more details. Alternatively, you can also find pre-trained models in `MODEL_ZOO.md`.
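For instance, a minimal project config built on that run base might look like the following. This is a hedged, hypothetical sketch only: `_BASE_RUN` and the four fields come from the instructions in this README and from `bmn_epic.yaml`-style configs in this repo, but the exact nesting of `CLASSIFIER_ROOT_DIR` is assumed and all paths are placeholders.

```yaml
# Hypothetical sketch; the authoritative settings live in bmn_epic.yaml.
_BASE_RUN: ../../pool/run/training/localization.yaml
DATA:
  DATA_ROOT_DIR: /path/to/tal/features/       # placeholder
  ANNO_DIR: /path/to/tal/annotations/         # placeholder
  CLASSIFIER_ROOT_DIR: /path/to/cls/results/  # placeholder; nesting assumed
NUM_GPUS: 8
```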
7 | 8 | For detailed explanations of the approach itself, please refer to the [paper](https://arxiv.org/pdf/2106.06942). 9 | 10 | To prepare the dataset, please download the [features](), [classification results]() and [dataset annotations](). 11 | 12 | 13 | For an example run, set the `DATA_ROOT_DIR`, `ANNO_DIR`, `CLASSIFIER_ROOT_DIR` and `NUM_GPUS` in `configs/projects/epic-kitchen-tal/bmn_epic.yaml`, and run the command 14 | 15 | ``` 16 | python runs/run.py --cfg configs/projects/epic-kitchen-tal/bmn-epic/vivit-os-local.yaml 17 | ``` 18 | 19 | 20 | # Citing this report 21 | If you find this report useful for your research, please consider citing the paper as follows: 22 | ```BibTeX 23 | @article{qing2021stronger, 24 | title={A Stronger Baseline for Ego-Centric Action Detection}, 25 | author={Qing, Zhiwu and Huang, Ziyuan and Wang, Xiang and Feng, Yutong and Zhang, Shiwei and Jiang, Jianwen and Tang, Mingqian and Gao, Changxin and Ang Jr, Marcelo H and Sang, Nong}, 26 | journal={arXiv preprint arXiv:2106.06942}, 27 | year={2021} 28 | } 29 | ``` 30 | -------------------------------------------------------------------------------- /projects/tada/README.md: -------------------------------------------------------------------------------- 1 | # TAda! Temporally-Adaptive Convolutions for Video Understanding (arXiv 2021) 2 | [Ziyuan Huang](https://huang-ziyuan.github.io/), [Shiwei Zhang](https://www.researchgate.net/profile/Shiwei-Zhang-14), Liang Pan, Zhiwu Qing, 3 | Mingqian Tang, [Ziwei Liu](https://liuziwei7.github.io/), [Marcelo Ang](https://www.eng.nus.edu.sg/me/staff/ang-jr-marcelo-h/),
4 | In arXiv, 2021. [[Paper]](https://arxiv.org/pdf/2110.06178). 5 | 6 | # Running instructions 7 | To train TAda2D networks, set the `_BASE_MODEL` to point to `configs/pool/backbone/tada2d.yaml`. See `configs/projects/tada/tada2d_*.yaml` for more details. 8 | TAda2D networks trained on Kinetics and Something-Something can be found in [`MODEL_ZOO.md`](MODEL_ZOO.md). 9 | 10 | For an example run, set the `DATA_ROOT_DIR`, `ANNO_DIR` and `NUM_GPUS` in `configs/projects/tada/tada2d_k400.yaml`, and run the command 11 | 12 | ``` 13 | python runs/run.py --cfg configs/projects/tada/k400/tada2d_8x8.yaml 14 | ``` 15 | 16 |
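To make the `_BASE_MODEL` mechanism described above concrete, the sketch below shows the shape of a project config that pulls in the TAda2D backbone with a single relative-path key; the `TRAIN` keys are copied from `configs/projects/tada/csn_ek100.yaml` earlier in this repository purely for illustration.

```yaml
# Sketch of how a project config selects the TAda2D backbone.
# Everything other than _BASE_MODEL is an illustrative placeholder.
_BASE_RUN: ../../pool/run/training/from_scratch_large.yaml
_BASE_MODEL: ../../pool/backbone/tada2d.yaml

TRAIN:
  DATASET: kinetics400
  BATCH_SIZE: 8
```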
17 | 18 | 19 | 20 |
21 | 22 | # Citing TAda! 23 | If you find TAdaConv or TAda2D useful for your research, please consider citing the paper as follows: 24 | ```BibTeX 25 | @article{huang2021tada, 26 | title={TAda! Temporally-Adaptive Convolutions for Video Understanding}, 27 | author={Huang, Ziyuan and Zhang, Shiwei and Pan, Liang and Qing, Zhiwu and Tang, Mingqian and Liu, Ziwei and Ang Jr, Marcelo H}, 28 | journal={arXiv preprint arXiv:2110.06178}, 29 | year={2021} 30 | } 31 | ``` -------------------------------------------------------------------------------- /configs/pool/backbone/slowfast_8x8.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: SlowFast_8x8 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 50 6 | META_ARCH: Slowfast 7 | NUM_FILTERS: [64, 256, 512, 1024, 2048] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 2048 10 | KERNEL_SIZE: [ 11 | [ 12 | [1, 7, 7], 13 | [1, 3, 3], 14 | [1, 3, 3], 15 | [1, 3, 3], 16 | [1, 3, 3], 17 | ], 18 | [ 19 | [5, 7, 7], 20 | [1, 3, 3], 21 | [1, 3, 3], 22 | [1, 3, 3], 23 | [1, 3, 3], 24 | ], 25 | ] 26 | DOWNSAMPLING: [true, false, true, true, true] 27 | DOWNSAMPLING_TEMPORAL: [false, false, false, false, false] 28 | TEMPORAL_CONV_BOTTLENECK: 29 | [ 30 | [false, false, false, true, true], # slow branch, 31 | [false, true, true, true, true] # fast branch 32 | ] 33 | NUM_STREAMS: 1 34 | EXPANSION_RATIO: 4 35 | BRANCH: 36 | NAME: SlowfastBranch 37 | STEM: 38 | NAME: DownSampleStem 39 | SLOWFAST: 40 | MODE: slowfast 41 | ALPHA: 4 42 | BETA: 8 # slow fast channel ratio 43 | CONV_CHANNEL_RATIO: 2 44 | KERNEL_SIZE: 7 45 | FUSION_CONV_BIAS: false 46 | FUSION_BN: true 47 | FUSION_RELU: true 48 | NONLOCAL: 49 | ENABLE: false 50 | STAGES: [5] 51 | MASK_ENABLE: false 52 | HEAD: 53 | NAME: SlowFastHead 54 | ACTIVATION: softmax 55 | DROPOUT_RATE: 0 56 | NUM_CLASSES: # !!! 57 | DATA: 58 | NUM_INPUT_FRAMES: 32 59 | SAMPLING_RATE: 2 -------------------------------------------------------------------------------- /configs/pool/backbone/slowfast_4x16.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NAME: SlowFast_4x16 3 | VIDEO: 4 | BACKBONE: 5 | DEPTH: 50 6 | META_ARCH: Slowfast 7 | NUM_FILTERS: [64, 256, 512, 1024, 2048] 8 | NUM_INPUT_CHANNELS: 3 9 | NUM_OUT_FEATURES: 2048 10 | KERNEL_SIZE: [ 11 | [ 12 | [1, 7, 7], 13 | [1, 3, 3], 14 | [1, 3, 3], 15 | [1, 3, 3], 16 | [1, 3, 3], 17 | ], 18 | [ 19 | [5, 7, 7], 20 | [1, 3, 3], 21 | [1, 3, 3], 22 | [1, 3, 3], 23 | [1, 3, 3], 24 | ], 25 | ] 26 | DOWNSAMPLING: [true, false, true, true, true] 27 | DOWNSAMPLING_TEMPORAL: [false, false, false, false, false] 28 | TEMPORAL_CONV_BOTTLENECK: 29 | [ 30 | [false, false, false, true, true], # slow branch, 31 | [false, true, true, true, true] # fast branch 32 | ] 33 | NUM_STREAMS: 1 34 | EXPANSION_RATIO: 4 35 | BRANCH: 36 | NAME: SlowfastBranch 37 | STEM: 38 | NAME: DownSampleStem 39 | SLOWFAST: 40 | MODE: slowfast 41 | ALPHA: 8 42 | BETA: 8 # slow fast channel ratio 43 | CONV_CHANNEL_RATIO: 2 44 | KERNEL_SIZE: 5 45 | FUSION_CONV_BIAS: false 46 | FUSION_BN: true 47 | FUSION_RELU: true 48 | NONLOCAL: 49 | ENABLE: false 50 | STAGES: [5] 51 | MASK_ENABLE: false 52 | HEAD: 53 | NAME: SlowFastHead 54 | ACTIVATION: softmax 55 | DROPOUT_RATE: 0 56 | NUM_CLASSES: # !!! 
57 | DATA: 58 | NUM_INPUT_FRAMES: 32 59 | SAMPLING_RATE: 2 60 | -------------------------------------------------------------------------------- /projects/epic-kitchen-ar/README.md: -------------------------------------------------------------------------------- 1 | # Towards training stronger video vision transformers for epic-kitchens-100 action recognition (CVPR 2021 Workshop) 2 | [Ziyuan Huang](https://huang-ziyuan.github.io/), [Zhiwu Qing](https://scholar.google.com/citations?user=q9refl4AAAAJ&hl=zh-CN), Xiang Wang, Yutong Feng, [Shiwei Zhang](https://www.researchgate.net/profile/Shiwei-Zhang-14), Jianwen Jiang, Zhurong Xia, Mingqian Tang, Nong Sang, and [Marcelo Ang](https://www.eng.nus.edu.sg/me/staff/ang-jr-marcelo-h/).
3 | In arXiv, 2021. [[Paper]](https://arxiv.org/pdf/2106.05058). 4 | 5 | # Running instructions 6 | Action recognition on Epic-Kitchens-100 shares the same pipeline as classification. Refer to `configs/projects/epic-kitchen-ar/vivit_fac_enc_ek100.yaml` for more details. We also include some trained weights in the [MODEL ZOO](MODEL_ZOO.md). 7 | 8 | For an example run, set the `DATA_ROOT_DIR`, `ANNO_DIR` and `NUM_GPUS` in `configs/projects/epic-kitchen-ar/vivit_fac_enc_ek100.yaml`, and run the command 9 | 10 | ``` 11 | python runs/run.py --cfg configs/projects/epic-kitchen-ar/ek100/vivit_fac_enc.yaml 12 | ``` 13 | 14 | # Citing this report 15 | If you find the training setting useful, please consider citing the paper as follows: 16 | ```BibTeX 17 | @article{huang2021towards, 18 | title={Towards training stronger video vision transformers for epic-kitchens-100 action recognition}, 19 | author={Huang, Ziyuan and Qing, Zhiwu and Wang, Xiang and Feng, Yutong and Zhang, Shiwei and Jiang, Jianwen and Xia, Zhurong and Tang, Mingqian and Sang, Nong and Ang Jr, Marcelo H}, 20 | journal={arXiv preprint arXiv:2106.05058}, 21 | year={2021} 22 | } 23 | ``` -------------------------------------------------------------------------------- /utils/bboxes_1d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def ioa_with_anchors(anchors_min, anchors_max, box_min, box_max): 5 | """ 6 | Calculates the overlap proportion (intersection over anchor length) between the anchors and a box, used as the supervision signal. 7 | Args: 8 | anchors_min (np.ndarray): 1d anchors start position, shape: N. 9 | anchors_max (np.ndarray): 1d anchors end position, shape: N. 10 | box_min (np.ndarray): 1d boxes start position, shape: N. 11 | box_max (np.ndarray): 1d boxes end position, shape: N. 12 | Returns: 13 | scores (np.ndarray): IoA of each anchor with the box. 14 | """ 15 | len_anchors = anchors_max - anchors_min 16 | int_xmin = np.maximum(anchors_min, box_min) 17 | int_xmax = np.minimum(anchors_max, box_max) 18 | inter_len = np.maximum(int_xmax - int_xmin, 0.) 19 | scores = np.divide(inter_len, len_anchors) 20 | return scores 21 | 22 | 23 | def iou_with_anchors(anchors_min, anchors_max, box_min, box_max): 24 | """ 25 | Computes the Jaccard (IoU) score between a box and the anchors. 26 | Args: 27 | anchors_min (np.ndarray): 1d anchors start position, shape: N. 28 | anchors_max (np.ndarray): 1d anchors end position, shape: N. 29 | box_min (np.ndarray): 1d boxes start position, shape: N. 30 | box_max (np.ndarray): 1d boxes end position, shape: N. 31 | Returns: 32 | jaccard (np.ndarray): IoU of each anchor with the box. 33 | """ 34 | len_anchors = anchors_max - anchors_min 35 | int_xmin = np.maximum(anchors_min, box_min) 36 | int_xmax = np.minimum(anchors_max, box_max) 37 | inter_len = np.maximum(int_xmax - int_xmin, 0.) 38 | union_len = len_anchors - inter_len + box_max - box_min 39 | # IoU = intersection / union 40 | jaccard = np.divide(inter_len, union_len) 41 | return jaccard -------------------------------------------------------------------------------- /projects/mosi/README.md: -------------------------------------------------------------------------------- 1 | # Self-supervised Motion Learning from Static Images (CVPR 2021) 2 | [Ziyuan Huang](https://huang-ziyuan.github.io/), [Shiwei Zhang](https://www.researchgate.net/profile/Shiwei-Zhang-14), Jianwen Jiang, Mingqian Tang, 3 | [Rong Jin](https://www.cse.msu.edu/~rongjin/), [Marcelo Ang](https://www.eng.nus.edu.sg/me/staff/ang-jr-marcelo-h/),
4 | In CVPR, 2021. [[Paper]](https://openaccess.thecvf.com/content/CVPR2021/papers/Huang_Self-Supervised_Motion_Learning_From_Static_Images_CVPR_2021_paper.pdf). 5 | 6 | # Running instructions 7 | To train the model with MoSI, set the `_BASE_RUN` to point to `configs/pool/run/training/mosi.yaml`. See `configs/projects/mosi/mosi_*.yaml` for more details. Alternatively, you can find pre-trained models in `MODEL_ZOO.md`. 8 | 9 | For detailed explanations of the approach itself, please refer to the [paper](https://openaccess.thecvf.com/content/CVPR2021/papers/Huang_Self-Supervised_Motion_Learning_From_Static_Images_CVPR_2021_paper.pdf). 10 | 11 | For an example run, set the `DATA_ROOT_DIR`, `ANNO_DIR` and `NUM_GPUS` in `configs/projects/mosi/mosi_r2d3ds_hmdb.yaml`, and run the command 12 | 13 | ``` 14 | python runs/run.py --cfg configs/projects/mosi/pt-hmdb/r2d3ds.yaml 15 | ``` 16 |
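As with the other projects, switching `_BASE_RUN` to the MoSI schedule is the only structural change needed for pre-training. A minimal sketch of the top of such a config follows; the `DATA` values are placeholders, and `configs/projects/mosi/mosi_r2d3ds_hmdb.yaml` holds the settings actually used.

```yaml
# Sketch: selecting the MoSI pre-training schedule and the R2D3D-S backbone.
# The _BASE_* paths follow the repo convention; the DATA keys are placeholders.
_BASE_RUN: ../../pool/run/training/mosi.yaml
_BASE_MODEL: ../../pool/backbone/r2d3ds.yaml

DATA:
  DATA_ROOT_DIR: /path/to/hmdb51/   # placeholder
  ANNO_DIR: /path/to/hmdb51/annos/  # placeholder
NUM_GPUS: 1
```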
18 |
19 | 20 |
21 |
21 | 22 | # Citing MoSI 23 | If you find MoSI useful for your research, please consider citing the paper as follows: 24 | ```BibTeX 25 | @inproceedings{mosi2021, 26 | title={Self-supervised motion learning from static images}, 27 | author={Huang, Ziyuan and Zhang, Shiwei and Jiang, Jianwen and Tang, Mingqian and Jin, Rong and Ang, Marcelo H}, 28 | booktitle={{CVPR}}, 29 | pages={1276--1285}, 30 | year={2021} 31 | } 32 | ``` -------------------------------------------------------------------------------- /utils/registry.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Registry class. """ 5 | 6 | class Registry(object): 7 | """ 8 | The Registry class provides a name-to-object registry. 9 | To initialize: 10 | REGISTRY = Registry() 11 | 12 | To register a module: 13 | @REGISTRY.register() 14 | class Model(): 15 | ... 16 | """ 17 | 18 | def __init__(self, table_name=""): 19 | """ 20 | Initializes the registry. 21 | Args: 22 | table_name (str): specifies the name of the registry 23 | """ 24 | self._entry_map = {} 25 | self.table_name = table_name 26 | 27 | 28 | def _register(self, name, entry): 29 | """ 30 | Registers the instance. 31 | Args: 32 | name (str): name of the entry 33 | entry (any): instance of the entry; can be of any type 34 | """ 35 | assert type(name) is str 36 | assert (name not in self._entry_map.keys()), "{} {} already registered.".format( 37 | self.table_name, name 38 | ) 39 | self._entry_map[name] = entry 40 | 41 | def register(self): 42 | """ 43 | Wrapper function for registering a module. 44 | """ 45 | def reg(obj): 46 | name = obj.__name__ 47 | self._register(name, obj) 48 | return obj 49 | return reg 50 | 51 | def get(self, name): 52 | """ 53 | Returns the instance specified by the name. 54 | Args: 55 | name (str): name of the specified instance. 56 | """ 57 | if name not in self._entry_map.keys(): 58 | return None 59 | obj = self._entry_map.get(name) 60 | return obj 61 | 62 | def get_all_registered(self): 63 | """ 64 | Returns the names of all registered entries.
65 | """ 66 | return self._entry_map.keys() -------------------------------------------------------------------------------- /configs/projects/hyrsm/ssv2_full/HyRSM_SSv2_Full_2shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | TRAIN: 4 | ENABLE: true 5 | USE_CLASSIFICATION: true 6 | USE_LOCAL: true 7 | USE_CLASSIFICATION_VALUE: 0.6 8 | NUM_CLASS: 64 9 | DATASET: Ssv2_few_shot 10 | META_BATCH: true # meta or not 11 | NUM_SAMPLES: 1000000 12 | WAY: 5 13 | SHOT: 2 14 | QUERY_PER_CLASS: 6 15 | QUERY_PER_CLASS_TEST: 1 16 | NUM_TRAIN_TASKS: 40000 17 | NUM_TEST_TASKS: 10000 18 | VAL_FRE_ITER: 1000 19 | BATCH_SIZE: 4 # Same with NUM_GPUS 20 | BATCH_SIZE_PER_TASK: 4 21 | CHECKPOINT_FILE_PATH: "" 22 | EVAL_PERIOD: 2 23 | NUM_FOLDS: 1 24 | 25 | AUGMENTATION: 26 | COLOR_AUG: false 27 | NO_RANDOM_ERASE: true 28 | SUPPORT_QUERY_DIFF: true 29 | 30 | TEST: 31 | ENABLE: false 32 | DATASET: Ssv2_few_shot 33 | BATCH_SIZE: 4 # Same with NUM_GPUS 34 | TEST_SET: val 35 | NUM_ENSEMBLE_VIEWS: 1 36 | SPATIAL_CROPS: cctltr 37 | AUTOMATIC_MULTI_SCALE_TEST: false 38 | UPLOAD_CLASSIFIER_RESULTS: true 39 | 40 | DATA: 41 | DATA_ROOT_DIR: /mnt/workspace/SSv2/ 42 | ANNO_DIR: ./configs/projects/hyrsm/ssv2_full/ 43 | NUM_INPUT_FRAMES: 8 44 | SAMPLING_RATE: 50 45 | SAMPLING_UNIFORM: false 46 | TRAIN_JITTER_SCALES: [256, 256] 47 | TRAIN_CROP_SIZE: 224 # 84 # 224 48 | TEST_CROP_SIZE: 224 49 | TEST_SCALE: 256 50 | TEST_CENTER_CROP: true 51 | CLIP_INTERVAL: 8 52 | FPS: 12 53 | TARGET_FPS: 12 54 | 55 | LOG_PERIOD: 50 56 | SOLVER: 57 | BASE_LR: 0.0001 # 0.0001 # 0.0005 58 | LR_POLICY: steps_with_relative_lrs 59 | STEPS_ITER: 3000 60 | STEPS: [0, 6, 9] 61 | LRS: [1, 0.1, 0.01] 62 | MAX_EPOCH: 10 63 | MOMENTUM: 0.9 64 | WEIGHT_DECAY: 5e-5 65 | WARMUP_EPOCHS: 1 66 | WARMUP_START_LR: 0.00002 67 | OPTIM_METHOD: adam 68 | DAMPENING: 0.0 69 | NESTEROV: true 70 | # add bn, use same lr settings as server 71 | 72 | VIDEO: 73 | HEAD: 74 | NAME: CNN_HyRSM_5shot 75 | BACKBONE_NAME: "resnet50" 76 | 77 | BACKBONE: 78 | META_ARCH: Identity 79 | 80 | NUM_GPUS: 4 81 | OUTPUT_DIR: output/HyRSM_SSv2_Full_2shot 82 | 83 | 84 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/ssv2_full/HyRSM_SSv2_Full_3shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | TRAIN: 4 | ENABLE: true 5 | USE_CLASSIFICATION: true 6 | USE_LOCAL: true 7 | USE_CLASSIFICATION_VALUE: 0.6 8 | NUM_CLASS: 64 9 | DATASET: Ssv2_few_shot 10 | META_BATCH: true # meta or not 11 | NUM_SAMPLES: 1000000 12 | WAY: 5 13 | SHOT: 3 14 | QUERY_PER_CLASS: 5 15 | QUERY_PER_CLASS_TEST: 1 16 | NUM_TRAIN_TASKS: 40000 17 | NUM_TEST_TASKS: 10000 18 | VAL_FRE_ITER: 1000 19 | BATCH_SIZE: 4 # Same with NUM_GPUS 20 | BATCH_SIZE_PER_TASK: 4 21 | CHECKPOINT_FILE_PATH: "" 22 | EVAL_PERIOD: 2 23 | NUM_FOLDS: 1 24 | 25 | AUGMENTATION: 26 | COLOR_AUG: false 27 | NO_RANDOM_ERASE: true 28 | SUPPORT_QUERY_DIFF: true 29 | 30 | TEST: 31 | ENABLE: false 32 | DATASET: Ssv2_few_shot 33 | BATCH_SIZE: 4 # Same with NUM_GPUS 34 | TEST_SET: val 35 | NUM_ENSEMBLE_VIEWS: 1 36 | SPATIAL_CROPS: cctltr 37 | AUTOMATIC_MULTI_SCALE_TEST: false 38 | UPLOAD_CLASSIFIER_RESULTS: true 39 | 40 | DATA: 41 | DATA_ROOT_DIR: /mnt/workspace/SSv2/ 42 | ANNO_DIR: ./configs/projects/hyrsm/ssv2_full/ 43 | NUM_INPUT_FRAMES: 8 44 | SAMPLING_RATE: 50 45 | SAMPLING_UNIFORM: false 46 | 
TRAIN_JITTER_SCALES: [256, 256] 47 | TRAIN_CROP_SIZE: 224 # 84 # 224 48 | TEST_CROP_SIZE: 224 49 | TEST_SCALE: 256 50 | TEST_CENTER_CROP: true 51 | CLIP_INTERVAL: 8 52 | FPS: 12 53 | TARGET_FPS: 12 54 | 55 | LOG_PERIOD: 50 56 | SOLVER: 57 | BASE_LR: 0.0001 # 0.0001 # 0.0005 58 | LR_POLICY: steps_with_relative_lrs 59 | STEPS_ITER: 3000 60 | STEPS: [0, 6, 9] 61 | LRS: [1, 0.1, 0.01] 62 | MAX_EPOCH: 10 63 | MOMENTUM: 0.9 64 | WEIGHT_DECAY: 5e-5 65 | WARMUP_EPOCHS: 1 66 | WARMUP_START_LR: 0.00002 67 | OPTIM_METHOD: adam 68 | DAMPENING: 0.0 69 | NESTEROV: true 70 | # add bn, use same lr settings as server 71 | 72 | VIDEO: 73 | HEAD: 74 | NAME: CNN_HyRSM_5shot 75 | BACKBONE_NAME: "resnet50" 76 | 77 | BACKBONE: 78 | META_ARCH: Identity 79 | 80 | NUM_GPUS: 4 81 | OUTPUT_DIR: output/HyRSM_SSv2_Full_3shot 82 | 83 | 84 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/ssv2_full/HyRSM_SSv2_Full_4shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | TRAIN: 4 | ENABLE: true 5 | USE_CLASSIFICATION: true 6 | USE_LOCAL: true 7 | USE_CLASSIFICATION_VALUE: 0.6 8 | NUM_CLASS: 64 9 | DATASET: Ssv2_few_shot 10 | META_BATCH: true # meta or not 11 | NUM_SAMPLES: 1000000 12 | WAY: 5 13 | SHOT: 4 14 | QUERY_PER_CLASS: 4 15 | QUERY_PER_CLASS_TEST: 1 16 | NUM_TRAIN_TASKS: 40000 17 | NUM_TEST_TASKS: 10000 18 | VAL_FRE_ITER: 1000 19 | BATCH_SIZE: 4 # Same with NUM_GPUS 20 | BATCH_SIZE_PER_TASK: 4 21 | CHECKPOINT_FILE_PATH: "" 22 | EVAL_PERIOD: 2 23 | NUM_FOLDS: 1 24 | 25 | AUGMENTATION: 26 | COLOR_AUG: false 27 | NO_RANDOM_ERASE: true 28 | SUPPORT_QUERY_DIFF: true 29 | 30 | TEST: 31 | ENABLE: false 32 | DATASET: Ssv2_few_shot 33 | BATCH_SIZE: 4 # Same with NUM_GPUS 34 | TEST_SET: val 35 | NUM_ENSEMBLE_VIEWS: 1 36 | SPATIAL_CROPS: cctltr 37 | AUTOMATIC_MULTI_SCALE_TEST: false 38 | UPLOAD_CLASSIFIER_RESULTS: true 39 | 40 | DATA: 41 | DATA_ROOT_DIR: /mnt/workspace/SSv2/ 42 | ANNO_DIR: ./configs/projects/hyrsm/ssv2_full/ 43 | NUM_INPUT_FRAMES: 8 44 | SAMPLING_RATE: 50 45 | SAMPLING_UNIFORM: false 46 | TRAIN_JITTER_SCALES: [256, 256] 47 | TRAIN_CROP_SIZE: 224 # 84 # 224 48 | TEST_CROP_SIZE: 224 49 | TEST_SCALE: 256 50 | TEST_CENTER_CROP: true 51 | CLIP_INTERVAL: 8 52 | FPS: 12 53 | TARGET_FPS: 12 54 | 55 | LOG_PERIOD: 50 56 | SOLVER: 57 | BASE_LR: 0.0001 # 0.0001 # 0.0005 58 | LR_POLICY: steps_with_relative_lrs 59 | STEPS_ITER: 3000 60 | STEPS: [0, 6, 9] 61 | LRS: [1, 0.1, 0.01] 62 | MAX_EPOCH: 10 63 | MOMENTUM: 0.9 64 | WEIGHT_DECAY: 5e-5 65 | WARMUP_EPOCHS: 1 66 | WARMUP_START_LR: 0.00002 67 | OPTIM_METHOD: adam 68 | DAMPENING: 0.0 69 | NESTEROV: true 70 | # add bn, use same lr settings as server 71 | 72 | VIDEO: 73 | HEAD: 74 | NAME: CNN_HyRSM_5shot 75 | BACKBONE_NAME: "resnet50" 76 | 77 | BACKBONE: 78 | META_ARCH: Identity 79 | 80 | NUM_GPUS: 4 81 | OUTPUT_DIR: output/HyRSM_SSv2_Full_4shot 82 | 83 | 84 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/ssv2_full/HyRSM_SSv2_Full_5shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | TRAIN: 4 | ENABLE: true 5 | USE_CLASSIFICATION: true 6 | USE_LOCAL: true 7 | USE_CLASSIFICATION_VALUE: 0.6 8 | NUM_CLASS: 64 9 | DATASET: Ssv2_few_shot 10 | META_BATCH: true # meta or not 11 | NUM_SAMPLES: 1000000 12 | WAY: 5 13 | SHOT: 5 14 | QUERY_PER_CLASS: 3 15 | 
QUERY_PER_CLASS_TEST: 1 16 | NUM_TRAIN_TASKS: 40000 17 | NUM_TEST_TASKS: 10000 18 | VAL_FRE_ITER: 1000 19 | BATCH_SIZE: 4 # Same with NUM_GPUS 20 | BATCH_SIZE_PER_TASK: 4 21 | CHECKPOINT_FILE_PATH: "" 22 | EVAL_PERIOD: 2 23 | NUM_FOLDS: 1 24 | 25 | AUGMENTATION: 26 | COLOR_AUG: false 27 | NO_RANDOM_ERASE: true 28 | SUPPORT_QUERY_DIFF: true 29 | 30 | TEST: 31 | ENABLE: false 32 | DATASET: Ssv2_few_shot 33 | BATCH_SIZE: 4 # Same with NUM_GPUS 34 | TEST_SET: val 35 | NUM_ENSEMBLE_VIEWS: 1 36 | SPATIAL_CROPS: cctltr 37 | AUTOMATIC_MULTI_SCALE_TEST: false 38 | UPLOAD_CLASSIFIER_RESULTS: true 39 | 40 | DATA: 41 | DATA_ROOT_DIR: /mnt/workspace/SSv2/ 42 | ANNO_DIR: ./configs/projects/hyrsm/ssv2_full/ 43 | NUM_INPUT_FRAMES: 8 44 | SAMPLING_RATE: 50 45 | SAMPLING_UNIFORM: false 46 | TRAIN_JITTER_SCALES: [256, 256] 47 | TRAIN_CROP_SIZE: 224 # 84 # 224 48 | TEST_CROP_SIZE: 224 49 | TEST_SCALE: 256 50 | TEST_CENTER_CROP: true 51 | CLIP_INTERVAL: 8 52 | FPS: 12 53 | TARGET_FPS: 12 54 | 55 | LOG_PERIOD: 50 56 | SOLVER: 57 | BASE_LR: 0.0001 # 0.0001 # 0.0005 58 | LR_POLICY: steps_with_relative_lrs 59 | STEPS_ITER: 3000 60 | STEPS: [0, 6, 9] 61 | LRS: [1, 0.1, 0.01] 62 | MAX_EPOCH: 10 63 | MOMENTUM: 0.9 64 | WEIGHT_DECAY: 5e-5 65 | WARMUP_EPOCHS: 1 66 | WARMUP_START_LR: 0.00002 67 | OPTIM_METHOD: adam 68 | DAMPENING: 0.0 69 | NESTEROV: true 70 | # add bn, use same lr settings as server 71 | 72 | VIDEO: 73 | HEAD: 74 | NAME: CNN_HyRSM_5shot 75 | BACKBONE_NAME: "resnet50" 76 | 77 | BACKBONE: 78 | META_ARCH: Identity 79 | 80 | NUM_GPUS: 4 81 | OUTPUT_DIR: output/HyRSM_SSv2_Full_5shot 82 | 83 | 84 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/ssv2_full/HyRSM_SSv2_Full_1shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | TRAIN: 4 | ENABLE: true 5 | USE_CLASSIFICATION: true 6 | USE_LOCAL: true 7 | USE_CLASSIFICATION_VALUE: 0.6 8 | NUM_CLASS: 64 9 | DATASET: Ssv2_few_shot 10 | META_BATCH: true # meta or not 11 | NUM_SAMPLES: 1000000 12 | WAY: 5 13 | SHOT: 1 14 | QUERY_PER_CLASS: 7 15 | QUERY_PER_CLASS_TEST: 1 16 | NUM_TRAIN_TASKS: 40000 17 | NUM_TEST_TASKS: 10000 18 | VAL_FRE_ITER: 1000 19 | BATCH_SIZE: 4 # Same with NUM_GPUS 20 | BATCH_SIZE_PER_TASK: 4 21 | CHECKPOINT_FILE_PATH: "" 22 | EVAL_PERIOD: 2 23 | NUM_FOLDS: 1 24 | 25 | AUGMENTATION: 26 | COLOR_AUG: false 27 | NO_RANDOM_ERASE: true 28 | SUPPORT_QUERY_DIFF: true 29 | 30 | TEST: 31 | ENABLE: false 32 | DATASET: Ssv2_few_shot 33 | BATCH_SIZE: 4 # Same with NUM_GPUS 34 | TEST_SET: val 35 | NUM_ENSEMBLE_VIEWS: 1 36 | SPATIAL_CROPS: cctltr 37 | AUTOMATIC_MULTI_SCALE_TEST: false 38 | UPLOAD_CLASSIFIER_RESULTS: true 39 | 40 | DATA: 41 | DATA_ROOT_DIR: /mnt/workspace/SSv2/ 42 | ANNO_DIR: ./configs/projects/hyrsm/ssv2_full/ 43 | NUM_INPUT_FRAMES: 8 44 | SAMPLING_RATE: 50 45 | SAMPLING_UNIFORM: false 46 | TRAIN_JITTER_SCALES: [256, 256] 47 | TRAIN_CROP_SIZE: 224 # 84 # 224 48 | TEST_CROP_SIZE: 224 49 | TEST_SCALE: 256 50 | TEST_CENTER_CROP: true 51 | CLIP_INTERVAL: 8 52 | FPS: 12 53 | TARGET_FPS: 12 54 | 55 | LOG_PERIOD: 50 56 | SOLVER: 57 | BASE_LR: 0.0001 # 0.0001 # 0.0005 58 | LR_POLICY: steps_with_relative_lrs 59 | STEPS_ITER: 3000 60 | STEPS: [0, 6, 9] 61 | LRS: [1, 0.1, 0.01] 62 | MAX_EPOCH: 10 63 | MOMENTUM: 0.9 64 | WEIGHT_DECAY: 5e-5 65 | WARMUP_EPOCHS: 1 66 | WARMUP_START_LR: 0.00002 67 | OPTIM_METHOD: adam 68 | DAMPENING: 0.0 69 | NESTEROV: true 70 | # add bn, use same lr settings as 
server 71 | 72 | VIDEO: 73 | HEAD: 74 | NAME: CNN_HyRSM_1shot # or CNN_HyRSM_5shot 75 | BACKBONE_NAME: "resnet50" 76 | 77 | BACKBONE: 78 | META_ARCH: Identity 79 | 80 | NUM_GPUS: 4 81 | OUTPUT_DIR: output/HyRSM_SSv2_Full_1shot 82 | 83 | 84 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/ssv2_small/HyRSM_SSv2_Small_2shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.15 9 | NUM_CLASS: 64 10 | POSITION_A: 10 11 | POSITION_B: 1.0 12 | DATASET: Ssv2_few_shot 13 | META_BATCH: true # meta or not 14 | NUM_SAMPLES: 1000000 15 | WAY: 5 16 | SHOT: 2 17 | QUERY_PER_CLASS: 6 18 | QUERY_PER_CLASS_TEST: 1 19 | NUM_TRAIN_TASKS: 20000 20 | NUM_TEST_TASKS: 10000 21 | VAL_FRE_ITER: 1000 22 | BATCH_SIZE: 4 # Same with NUM_GPUS 23 | BATCH_SIZE_PER_TASK: 4 24 | CHECKPOINT_FILE_PATH: "" 25 | EVAL_PERIOD: 2 26 | NUM_FOLDS: 1 27 | 28 | AUGMENTATION: 29 | COLOR_AUG: false 30 | NO_RANDOM_ERASE: true 31 | SUPPORT_QUERY_DIFF: true 32 | 33 | TEST: 34 | ENABLE: false 35 | DATASET: Ssv2_few_shot 36 | BATCH_SIZE: 4 # Same with NUM_GPUS 37 | TEST_SET: val 38 | NUM_ENSEMBLE_VIEWS: 1 39 | SPATIAL_CROPS: cctltr 40 | AUTOMATIC_MULTI_SCALE_TEST: false 41 | UPLOAD_CLASSIFIER_RESULTS: true 42 | 43 | DATA: 44 | DATA_ROOT_DIR: /mnt/workspace/SSv2/ 45 | ANNO_DIR: ./configs/projects/hyrsm/ssv2_small/ 46 | NUM_INPUT_FRAMES: 8 47 | SAMPLING_RATE: 50 48 | SAMPLING_UNIFORM: false 49 | TRAIN_JITTER_SCALES: [256, 256] 50 | TRAIN_CROP_SIZE: 224 # 84 # 224 51 | TEST_CROP_SIZE: 224 52 | TEST_SCALE: 256 53 | TEST_CENTER_CROP: true 54 | CLIP_INTERVAL: 8 55 | FPS: 12 56 | TARGET_FPS: 12 57 | 58 | LOG_PERIOD: 50 59 | SOLVER: 60 | BASE_LR: 0.0001 # 0.0001 # 0.0005 61 | LR_POLICY: steps_with_relative_lrs 62 | STEPS_ITER: 1500 63 | STEPS: [0, 4, 8] 64 | LRS: [1, 0.1, 0.01] 65 | MAX_EPOCH: 10 66 | MOMENTUM: 0.9 67 | WEIGHT_DECAY: 5e-5 68 | WARMUP_EPOCHS: 1 69 | WARMUP_START_LR: 0.00002 70 | OPTIM_METHOD: adam 71 | DAMPENING: 0.0 72 | NESTEROV: true 73 | # add bn, use same lr settings as server 74 | 75 | VIDEO: 76 | HEAD: 77 | NAME: CNN_HyRSM_5shot 78 | BACKBONE_NAME: "resnet50" 79 | 80 | BACKBONE: 81 | META_ARCH: Identity 82 | 83 | NUM_GPUS: 4 84 | OUTPUT_DIR: output/HyRSM_SSv2_Small_2shot 85 | 86 | 87 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/ssv2_small/HyRSM_SSv2_Small_3shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.15 9 | NUM_CLASS: 64 10 | POSITION_A: 10 11 | POSITION_B: 1.0 12 | DATASET: Ssv2_few_shot 13 | META_BATCH: true # meta or not 14 | NUM_SAMPLES: 1000000 15 | WAY: 5 16 | SHOT: 3 17 | QUERY_PER_CLASS: 5 18 | QUERY_PER_CLASS_TEST: 1 19 | NUM_TRAIN_TASKS: 20000 20 | NUM_TEST_TASKS: 10000 21 | VAL_FRE_ITER: 1000 22 | BATCH_SIZE: 4 # Same with NUM_GPUS 23 | BATCH_SIZE_PER_TASK: 4 24 | CHECKPOINT_FILE_PATH: "" 25 | EVAL_PERIOD: 2 26 | NUM_FOLDS: 1 27 | 28 | AUGMENTATION: 29 | COLOR_AUG: false 30 | NO_RANDOM_ERASE: true 31 | SUPPORT_QUERY_DIFF: true 32 | 33 | TEST: 34 | ENABLE: false 35 | DATASET: Ssv2_few_shot 36 | BATCH_SIZE: 4 # Same with NUM_GPUS 37 | 
TEST_SET: val 38 | NUM_ENSEMBLE_VIEWS: 1 39 | SPATIAL_CROPS: cctltr 40 | AUTOMATIC_MULTI_SCALE_TEST: false 41 | UPLOAD_CLASSIFIER_RESULTS: true 42 | 43 | DATA: 44 | DATA_ROOT_DIR: /mnt/workspace/SSv2/ 45 | ANNO_DIR: ./configs/projects/hyrsm/ssv2_small/ 46 | NUM_INPUT_FRAMES: 8 47 | SAMPLING_RATE: 50 48 | SAMPLING_UNIFORM: false 49 | TRAIN_JITTER_SCALES: [256, 256] 50 | TRAIN_CROP_SIZE: 224 # 84 # 224 51 | TEST_CROP_SIZE: 224 52 | TEST_SCALE: 256 53 | TEST_CENTER_CROP: true 54 | CLIP_INTERVAL: 8 55 | FPS: 12 56 | TARGET_FPS: 12 57 | 58 | LOG_PERIOD: 50 59 | SOLVER: 60 | BASE_LR: 0.0001 # 0.0001 # 0.0005 61 | LR_POLICY: steps_with_relative_lrs 62 | STEPS_ITER: 1500 63 | STEPS: [0, 4, 8] 64 | LRS: [1, 0.1, 0.01] 65 | MAX_EPOCH: 10 66 | MOMENTUM: 0.9 67 | WEIGHT_DECAY: 5e-5 68 | WARMUP_EPOCHS: 1 69 | WARMUP_START_LR: 0.00002 70 | OPTIM_METHOD: adam 71 | DAMPENING: 0.0 72 | NESTEROV: true 73 | # add bn, use same lr settings as server 74 | 75 | VIDEO: 76 | HEAD: 77 | NAME: CNN_HyRSM_5shot 78 | BACKBONE_NAME: "resnet50" 79 | 80 | BACKBONE: 81 | META_ARCH: Identity 82 | 83 | NUM_GPUS: 4 84 | OUTPUT_DIR: output/HyRSM_SSv2_Small_3shot 85 | 86 | 87 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/ssv2_small/HyRSM_SSv2_Small_4shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.15 9 | NUM_CLASS: 64 10 | POSITION_A: 10 11 | POSITION_B: 1.0 12 | DATASET: Ssv2_few_shot 13 | META_BATCH: true # meta or not 14 | NUM_SAMPLES: 1000000 15 | WAY: 5 16 | SHOT: 4 17 | QUERY_PER_CLASS: 4 18 | QUERY_PER_CLASS_TEST: 1 19 | NUM_TRAIN_TASKS: 20000 20 | NUM_TEST_TASKS: 10000 21 | VAL_FRE_ITER: 1000 22 | BATCH_SIZE: 4 # Same with NUM_GPUS 23 | BATCH_SIZE_PER_TASK: 4 24 | CHECKPOINT_FILE_PATH: "" 25 | EVAL_PERIOD: 2 26 | NUM_FOLDS: 1 27 | 28 | AUGMENTATION: 29 | COLOR_AUG: false 30 | NO_RANDOM_ERASE: true 31 | SUPPORT_QUERY_DIFF: true 32 | 33 | TEST: 34 | ENABLE: false 35 | DATASET: Ssv2_few_shot 36 | BATCH_SIZE: 4 # Same with NUM_GPUS 37 | TEST_SET: val 38 | NUM_ENSEMBLE_VIEWS: 1 39 | SPATIAL_CROPS: cctltr 40 | AUTOMATIC_MULTI_SCALE_TEST: false 41 | UPLOAD_CLASSIFIER_RESULTS: true 42 | 43 | DATA: 44 | DATA_ROOT_DIR: /mnt/workspace/SSv2/ 45 | ANNO_DIR: ./configs/projects/hyrsm/ssv2_small/ 46 | NUM_INPUT_FRAMES: 8 47 | SAMPLING_RATE: 50 48 | SAMPLING_UNIFORM: false 49 | TRAIN_JITTER_SCALES: [256, 256] 50 | TRAIN_CROP_SIZE: 224 # 84 # 224 51 | TEST_CROP_SIZE: 224 52 | TEST_SCALE: 256 53 | TEST_CENTER_CROP: true 54 | CLIP_INTERVAL: 8 55 | FPS: 12 56 | TARGET_FPS: 12 57 | 58 | LOG_PERIOD: 50 59 | SOLVER: 60 | BASE_LR: 0.0001 # 0.0001 # 0.0005 61 | LR_POLICY: steps_with_relative_lrs 62 | STEPS_ITER: 1500 63 | STEPS: [0, 4, 8] 64 | LRS: [1, 0.1, 0.01] 65 | MAX_EPOCH: 10 66 | MOMENTUM: 0.9 67 | WEIGHT_DECAY: 5e-5 68 | WARMUP_EPOCHS: 1 69 | WARMUP_START_LR: 0.00002 70 | OPTIM_METHOD: adam 71 | DAMPENING: 0.0 72 | NESTEROV: true 73 | # add bn, use same lr settings as server 74 | 75 | VIDEO: 76 | HEAD: 77 | NAME: CNN_HyRSM_5shot 78 | BACKBONE_NAME: "resnet50" 79 | 80 | BACKBONE: 81 | META_ARCH: Identity 82 | 83 | NUM_GPUS: 4 84 | OUTPUT_DIR: output/HyRSM_SSv2_Small_4shot 85 | 86 | 87 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/ssv2_small/HyRSM_SSv2_Small_5shot_v1.yaml: 
-------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.15 9 | NUM_CLASS: 64 10 | POSITION_A: 10 11 | POSITION_B: 1.0 12 | DATASET: Ssv2_few_shot 13 | META_BATCH: true # meta or not 14 | NUM_SAMPLES: 1000000 15 | WAY: 5 16 | SHOT: 5 17 | QUERY_PER_CLASS: 3 18 | QUERY_PER_CLASS_TEST: 1 19 | NUM_TRAIN_TASKS: 20000 20 | NUM_TEST_TASKS: 10000 21 | VAL_FRE_ITER: 1000 22 | BATCH_SIZE: 4 # Same with NUM_GPUS 23 | BATCH_SIZE_PER_TASK: 4 24 | CHECKPOINT_FILE_PATH: "" 25 | EVAL_PERIOD: 2 26 | NUM_FOLDS: 1 27 | 28 | AUGMENTATION: 29 | COLOR_AUG: false 30 | NO_RANDOM_ERASE: true 31 | SUPPORT_QUERY_DIFF: true 32 | 33 | TEST: 34 | ENABLE: false 35 | DATASET: Ssv2_few_shot 36 | BATCH_SIZE: 4 # Same with NUM_GPUS 37 | TEST_SET: val 38 | NUM_ENSEMBLE_VIEWS: 1 39 | SPATIAL_CROPS: cctltr 40 | AUTOMATIC_MULTI_SCALE_TEST: false 41 | UPLOAD_CLASSIFIER_RESULTS: true 42 | 43 | DATA: 44 | DATA_ROOT_DIR: /mnt/workspace/SSv2/ 45 | ANNO_DIR: ./configs/projects/hyrsm/ssv2_small/ 46 | NUM_INPUT_FRAMES: 8 47 | SAMPLING_RATE: 50 48 | SAMPLING_UNIFORM: false 49 | TRAIN_JITTER_SCALES: [256, 256] 50 | TRAIN_CROP_SIZE: 224 # 84 # 224 51 | TEST_CROP_SIZE: 224 52 | TEST_SCALE: 256 53 | TEST_CENTER_CROP: true 54 | CLIP_INTERVAL: 8 55 | FPS: 12 56 | TARGET_FPS: 12 57 | 58 | LOG_PERIOD: 50 59 | SOLVER: 60 | BASE_LR: 0.0001 # 0.0001 # 0.0005 61 | LR_POLICY: steps_with_relative_lrs 62 | STEPS_ITER: 1500 63 | STEPS: [0, 4, 8] 64 | LRS: [1, 0.1, 0.01] 65 | MAX_EPOCH: 10 66 | MOMENTUM: 0.9 67 | WEIGHT_DECAY: 5e-5 68 | WARMUP_EPOCHS: 1 69 | WARMUP_START_LR: 0.00002 70 | OPTIM_METHOD: adam 71 | DAMPENING: 0.0 72 | NESTEROV: true 73 | # add bn, use same lr settings as server 74 | 75 | VIDEO: 76 | HEAD: 77 | NAME: CNN_HyRSM_5shot 78 | BACKBONE_NAME: "resnet50" 79 | 80 | BACKBONE: 81 | META_ARCH: Identity 82 | 83 | NUM_GPUS: 4 84 | OUTPUT_DIR: output/HyRSM_SSv2_Small_5shot 85 | 86 | 87 | -------------------------------------------------------------------------------- /configs/pool/run/training/from_scratch.yaml: -------------------------------------------------------------------------------- 1 | PRETRAIN: 2 | ENABLE: false 3 | TRAIN: 4 | ENABLE: true 5 | DATASET: # !!@1 6 | BATCH_SIZE: 128 7 | LOG_FILE: training_log.log 8 | LOSS_FUNC: cross_entropy 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 30 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: false # !!@3 20 | DATASET: # !!@3 21 | BATCH_SIZE: 128 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: true 29 | DATA: 30 | DATA_ROOT_DIR: 31 | ANNO_DIR: 32 | NUM_INPUT_FRAMES: 16 33 | NUM_INPUT_CHANNELS: 3 34 | SAMPLING_MODE: interval_based 35 | SAMPLING_RATE: 4 36 | TRAIN_JITTER_SCALES: [168, 224] 37 | TRAIN_CROP_SIZE: 112 38 | TEST_SCALE: 224 39 | TEST_CROP_SIZE: 112 40 | MEAN: [0.45, 0.45, 0.45] 41 | STD: [0.225, 0.225, 0.225] 42 | MULTI_LABEL: false 43 | ENSEMBLE_METHOD: sum 44 | FPS: 30 45 | TARGET_FPS: 30 46 | OPTIMIZER: 47 | BASE_LR: 0.002 48 | LR_POLICY: cosine 49 | MAX_EPOCH: 300 50 | MOMENTUM: 0.9 51 | WEIGHT_DECAY: 1e-3 52 | WARMUP_EPOCHS: 10 53 | WARMUP_START_LR: 0.0002 54 | 
OPTIM_METHOD: adam 55 | DAMPENING: 0.0 56 | NESTEROV: true 57 | BN: 58 | WEIGHT_DECAY: 0.0 59 | EPS: 1e-3 60 | DATA_LOADER: 61 | NUM_WORKERS: 4 62 | PIN_MEMORY: false 63 | ENABLE_MULTI_THREAD_DECODE: true 64 | NUM_GPUS: 8 65 | SHARD_ID: 0 66 | NUM_SHARDS: 1 67 | RANDOM_SEED: 0 68 | OUTPUT_DIR: 69 | OUTPUT_CFG_FILE: configuration.log 70 | LOG_PERIOD: 10 71 | DIST_BACKEND: nccl 72 | LOG_MODEL_INFO: true 73 | LOG_CONFIG_INFO: true 74 | AUGMENTATION: 75 | COLOR_AUG: true 76 | BRIGHTNESS: 0.5 77 | CONTRAST: 0.5 78 | SATURATION: 0.5 79 | HUE: 0.25 80 | GRAYSCALE: 0.3 81 | CONSISTENT: true 82 | SHUFFLE: true 83 | GRAY_FIRST: true 84 | RATIO: [0.857142857142857, 1.1666666666666667] 85 | USE_GPU: true 86 | PAI: false -------------------------------------------------------------------------------- /configs/projects/hyrsm/ucf101/HyRSM_UCF101_2shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.15 9 | NUM_CLASS: 70 10 | DATASET_FEW: HMDB_few_shot 11 | DATASET: Ssv2_few_shot 12 | META_BATCH: true # meta or not 13 | NUM_SAMPLES: 1000000 14 | WAY: 5 15 | SHOT: 2 16 | QUERY_PER_CLASS: 6 # 7 17 | QUERY_PER_CLASS_TEST: 1 18 | NUM_TRAIN_TASKS: 2500 19 | NUM_TEST_TASKS: 10000 20 | VAL_FRE_ITER: 100 21 | BATCH_SIZE: 4 # Same with NUM_GPUS 22 | BATCH_SIZE_PER_TASK: 4 23 | CHECKPOINT_FILE_PATH: "" 24 | EVAL_PERIOD: 2 25 | NUM_FOLDS: 1 26 | 27 | AUGMENTATION: 28 | COLOR_AUG: false 29 | NO_RANDOM_ERASE: false 30 | SUPPORT_QUERY_DIFF: true 31 | 32 | TEST: 33 | ENABLE: false 34 | DATASET: Ssv2_few_shot 35 | BATCH_SIZE: 4 # Same with NUM_GPUS 36 | TEST_SET: val 37 | NUM_ENSEMBLE_VIEWS: 1 38 | SPATIAL_CROPS: cctltr 39 | AUTOMATIC_MULTI_SCALE_TEST: false 40 | UPLOAD_CLASSIFIER_RESULTS: true 41 | 42 | DATA: 43 | DATA_ROOT_DIR: /mnt/workspace/ucf101/ 44 | ANNO_DIR: ./configs/projects/hyrsm/ucf101/ 45 | NUM_INPUT_FRAMES: 8 46 | SAMPLING_RATE: 50 # 50 47 | # SAMPLING_RATE_TRAIN: 3 48 | SAMPLING_UNIFORM: false 49 | TRAIN_JITTER_SCALES: [256, 256] 50 | TRAIN_CROP_SIZE: 224 # 84 # 224 51 | TEST_CROP_SIZE: 224 52 | TEST_SCALE: 256 53 | TEST_CENTER_CROP: true 54 | CLIP_INTERVAL: 8 55 | FPS: 12 56 | TARGET_FPS: 12 57 | 58 | LOG_PERIOD: 50 59 | SOLVER: 60 | BASE_LR: 0.00005 61 | LR_POLICY: steps_with_relative_lrs 62 | STEPS_ITER: 700 63 | STEPS: [0, 3, 5, 7] 64 | LRS: [1, 0.5, 0.1, 0.01] 65 | MAX_EPOCH: 10 66 | MOMENTUM: 0.9 67 | WEIGHT_DECAY: 5e-4 68 | WARMUP_EPOCHS: 1 69 | WARMUP_START_LR: 0.00001 70 | OPTIM_METHOD: adam 71 | DAMPENING: 0.0 72 | NESTEROV: true 73 | # add bn, use same lr settings as server 74 | 75 | VIDEO: 76 | HEAD: 77 | NAME: CNN_HyRSM_5shot 78 | BACKBONE_NAME: "resnet50" 79 | 80 | BACKBONE: 81 | META_ARCH: Identity 82 | 83 | 84 | NUM_GPUS: 4 85 | OUTPUT_DIR: output/HyRSM_UCF101_2shot 86 | 87 | 88 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/ucf101/HyRSM_UCF101_3shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.15 9 | NUM_CLASS: 70 10 | DATASET_FEW: HMDB_few_shot 11 | DATASET: Ssv2_few_shot 12 | META_BATCH: true # meta or not 13 | NUM_SAMPLES: 1000000 14 | WAY: 5 15 | SHOT: 3 16 | 
QUERY_PER_CLASS: 5 # 7 17 | QUERY_PER_CLASS_TEST: 1 18 | NUM_TRAIN_TASKS: 2500 19 | NUM_TEST_TASKS: 10000 20 | VAL_FRE_ITER: 100 21 | BATCH_SIZE: 4 # Same with NUM_GPUS 22 | BATCH_SIZE_PER_TASK: 4 23 | CHECKPOINT_FILE_PATH: "" 24 | EVAL_PERIOD: 2 25 | NUM_FOLDS: 1 26 | 27 | AUGMENTATION: 28 | COLOR_AUG: false 29 | NO_RANDOM_ERASE: false 30 | SUPPORT_QUERY_DIFF: true 31 | 32 | TEST: 33 | ENABLE: false 34 | DATASET: Ssv2_few_shot 35 | BATCH_SIZE: 4 # Same with NUM_GPUS 36 | TEST_SET: val 37 | NUM_ENSEMBLE_VIEWS: 1 38 | SPATIAL_CROPS: cctltr 39 | AUTOMATIC_MULTI_SCALE_TEST: false 40 | UPLOAD_CLASSIFIER_RESULTS: true 41 | 42 | DATA: 43 | DATA_ROOT_DIR: /mnt/workspace/ucf101/ 44 | ANNO_DIR: ./configs/projects/hyrsm/ucf101/ 45 | NUM_INPUT_FRAMES: 8 46 | SAMPLING_RATE: 50 # 50 47 | # SAMPLING_RATE_TRAIN: 3 48 | SAMPLING_UNIFORM: false 49 | TRAIN_JITTER_SCALES: [256, 256] 50 | TRAIN_CROP_SIZE: 224 # 84 # 224 51 | TEST_CROP_SIZE: 224 52 | TEST_SCALE: 256 53 | TEST_CENTER_CROP: true 54 | CLIP_INTERVAL: 8 55 | FPS: 12 56 | TARGET_FPS: 12 57 | 58 | LOG_PERIOD: 50 59 | SOLVER: 60 | BASE_LR: 0.00005 61 | LR_POLICY: steps_with_relative_lrs 62 | STEPS_ITER: 700 63 | STEPS: [0, 3, 5, 7] 64 | LRS: [1, 0.5, 0.1, 0.01] 65 | MAX_EPOCH: 10 66 | MOMENTUM: 0.9 67 | WEIGHT_DECAY: 5e-4 68 | WARMUP_EPOCHS: 1 69 | WARMUP_START_LR: 0.00001 70 | OPTIM_METHOD: adam 71 | DAMPENING: 0.0 72 | NESTEROV: true 73 | # add bn, use same lr settings as server 74 | 75 | VIDEO: 76 | HEAD: 77 | NAME: CNN_HyRSM_5shot 78 | BACKBONE_NAME: "resnet50" 79 | 80 | BACKBONE: 81 | META_ARCH: Identity 82 | 83 | 84 | NUM_GPUS: 4 85 | OUTPUT_DIR: output/HyRSM_UCF101_3shot 86 | 87 | 88 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/ucf101/HyRSM_UCF101_4shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.15 9 | NUM_CLASS: 70 10 | DATASET_FEW: HMDB_few_shot 11 | DATASET: Ssv2_few_shot 12 | META_BATCH: true # meta or not 13 | NUM_SAMPLES: 1000000 14 | WAY: 5 15 | SHOT: 4 16 | QUERY_PER_CLASS: 4 # 7 17 | QUERY_PER_CLASS_TEST: 1 18 | NUM_TRAIN_TASKS: 2500 19 | NUM_TEST_TASKS: 10000 20 | VAL_FRE_ITER: 100 21 | BATCH_SIZE: 4 # Same with NUM_GPUS 22 | BATCH_SIZE_PER_TASK: 4 23 | CHECKPOINT_FILE_PATH: "" 24 | EVAL_PERIOD: 2 25 | NUM_FOLDS: 1 26 | 27 | AUGMENTATION: 28 | COLOR_AUG: false 29 | NO_RANDOM_ERASE: false 30 | SUPPORT_QUERY_DIFF: true 31 | 32 | TEST: 33 | ENABLE: false 34 | DATASET: Ssv2_few_shot 35 | BATCH_SIZE: 4 # Same with NUM_GPUS 36 | TEST_SET: val 37 | NUM_ENSEMBLE_VIEWS: 1 38 | SPATIAL_CROPS: cctltr 39 | AUTOMATIC_MULTI_SCALE_TEST: false 40 | UPLOAD_CLASSIFIER_RESULTS: true 41 | 42 | DATA: 43 | DATA_ROOT_DIR: /mnt/workspace/ucf101/ 44 | ANNO_DIR: ./configs/projects/hyrsm/ucf101/ 45 | NUM_INPUT_FRAMES: 8 46 | SAMPLING_RATE: 50 # 50 47 | # SAMPLING_RATE_TRAIN: 3 48 | SAMPLING_UNIFORM: false 49 | TRAIN_JITTER_SCALES: [256, 256] 50 | TRAIN_CROP_SIZE: 224 # 84 # 224 51 | TEST_CROP_SIZE: 224 52 | TEST_SCALE: 256 53 | TEST_CENTER_CROP: true 54 | CLIP_INTERVAL: 8 55 | FPS: 12 56 | TARGET_FPS: 12 57 | 58 | LOG_PERIOD: 50 59 | SOLVER: 60 | BASE_LR: 0.00005 61 | LR_POLICY: steps_with_relative_lrs 62 | STEPS_ITER: 700 63 | STEPS: [0, 3, 5, 7] 64 | LRS: [1, 0.5, 0.1, 0.01] 65 | MAX_EPOCH: 10 66 | MOMENTUM: 0.9 67 | WEIGHT_DECAY: 5e-4 68 | WARMUP_EPOCHS: 1 69 | 
WARMUP_START_LR: 0.00001 70 | OPTIM_METHOD: adam 71 | DAMPENING: 0.0 72 | NESTEROV: true 73 | # add bn, use same lr settings as server 74 | 75 | VIDEO: 76 | HEAD: 77 | NAME: CNN_HyRSM_5shot 78 | BACKBONE_NAME: "resnet50" 79 | 80 | BACKBONE: 81 | META_ARCH: Identity 82 | 83 | 84 | NUM_GPUS: 4 85 | OUTPUT_DIR: output/HyRSM_UCF101_4shot 86 | 87 | 88 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/ucf101/HyRSM_UCF101_5shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.15 9 | NUM_CLASS: 70 10 | DATASET_FEW: HMDB_few_shot 11 | DATASET: Ssv2_few_shot 12 | META_BATCH: true # meta or not 13 | NUM_SAMPLES: 1000000 14 | WAY: 5 15 | SHOT: 5 16 | QUERY_PER_CLASS: 3 # 7 17 | QUERY_PER_CLASS_TEST: 1 18 | NUM_TRAIN_TASKS: 2500 19 | NUM_TEST_TASKS: 10000 20 | VAL_FRE_ITER: 100 21 | BATCH_SIZE: 4 # Same with NUM_GPUS 22 | BATCH_SIZE_PER_TASK: 4 23 | CHECKPOINT_FILE_PATH: "" 24 | EVAL_PERIOD: 2 25 | NUM_FOLDS: 1 26 | 27 | AUGMENTATION: 28 | COLOR_AUG: false 29 | NO_RANDOM_ERASE: false 30 | SUPPORT_QUERY_DIFF: true 31 | 32 | TEST: 33 | ENABLE: false 34 | DATASET: Ssv2_few_shot 35 | BATCH_SIZE: 4 # Same with NUM_GPUS 36 | TEST_SET: val 37 | NUM_ENSEMBLE_VIEWS: 1 38 | SPATIAL_CROPS: cctltr 39 | AUTOMATIC_MULTI_SCALE_TEST: false 40 | UPLOAD_CLASSIFIER_RESULTS: true 41 | 42 | DATA: 43 | DATA_ROOT_DIR: /mnt/workspace/ucf101/ 44 | ANNO_DIR: ./configs/projects/hyrsm/ucf101/ 45 | NUM_INPUT_FRAMES: 8 46 | SAMPLING_RATE: 50 # 50 47 | # SAMPLING_RATE_TRAIN: 3 48 | SAMPLING_UNIFORM: false 49 | TRAIN_JITTER_SCALES: [256, 256] 50 | TRAIN_CROP_SIZE: 224 # 84 # 224 51 | TEST_CROP_SIZE: 224 52 | TEST_SCALE: 256 53 | TEST_CENTER_CROP: true 54 | CLIP_INTERVAL: 8 55 | FPS: 12 56 | TARGET_FPS: 12 57 | 58 | LOG_PERIOD: 50 59 | SOLVER: 60 | BASE_LR: 0.00005 61 | LR_POLICY: steps_with_relative_lrs 62 | STEPS_ITER: 700 63 | STEPS: [0, 3, 5, 7] 64 | LRS: [1, 0.5, 0.1, 0.01] 65 | MAX_EPOCH: 10 66 | MOMENTUM: 0.9 67 | WEIGHT_DECAY: 5e-4 68 | WARMUP_EPOCHS: 1 69 | WARMUP_START_LR: 0.00001 70 | OPTIM_METHOD: adam 71 | DAMPENING: 0.0 72 | NESTEROV: true 73 | # add bn, use same lr settings as server 74 | 75 | VIDEO: 76 | HEAD: 77 | NAME: CNN_HyRSM_5shot 78 | BACKBONE_NAME: "resnet50" 79 | 80 | BACKBONE: 81 | META_ARCH: Identity 82 | 83 | 84 | NUM_GPUS: 4 85 | OUTPUT_DIR: output/HyRSM_UCF101_5shot 86 | 87 | 88 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/ssv2_small/HyRSM_SSv2_Small_1shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.15 9 | NUM_CLASS: 64 10 | POSITION_A: 10 11 | POSITION_B: 1.0 12 | DATASET: Ssv2_few_shot 13 | META_BATCH: true # meta or not 14 | NUM_SAMPLES: 1000000 15 | WAY: 5 16 | SHOT: 1 17 | QUERY_PER_CLASS: 7 18 | QUERY_PER_CLASS_TEST: 1 19 | NUM_TRAIN_TASKS: 20000 20 | NUM_TEST_TASKS: 10000 21 | VAL_FRE_ITER: 1000 22 | BATCH_SIZE: 4 # Same with NUM_GPUS 23 | BATCH_SIZE_PER_TASK: 4 24 | CHECKPOINT_FILE_PATH: "" 25 | EVAL_PERIOD: 2 26 | NUM_FOLDS: 1 27 | 28 | AUGMENTATION: 29 | COLOR_AUG: false 30 | NO_RANDOM_ERASE: true 31 | SUPPORT_QUERY_DIFF: 
true 32 | 33 | TEST: 34 | ENABLE: false 35 | DATASET: Ssv2_few_shot 36 | BATCH_SIZE: 4 # Same with NUM_GPUS 37 | TEST_SET: val 38 | NUM_ENSEMBLE_VIEWS: 1 39 | SPATIAL_CROPS: cctltr 40 | AUTOMATIC_MULTI_SCALE_TEST: false 41 | UPLOAD_CLASSIFIER_RESULTS: true 42 | 43 | DATA: 44 | DATA_ROOT_DIR: /mnt/workspace/SSv2/ 45 | ANNO_DIR: ./configs/projects/hyrsm/ssv2_small/ 46 | NUM_INPUT_FRAMES: 8 47 | SAMPLING_RATE: 50 48 | SAMPLING_UNIFORM: false 49 | TRAIN_JITTER_SCALES: [256, 256] 50 | TRAIN_CROP_SIZE: 224 # 84 # 224 51 | TEST_CROP_SIZE: 224 52 | TEST_SCALE: 256 53 | TEST_CENTER_CROP: true 54 | CLIP_INTERVAL: 8 55 | FPS: 12 56 | TARGET_FPS: 12 57 | 58 | LOG_PERIOD: 50 59 | SOLVER: 60 | BASE_LR: 0.0001 # 0.0001 # 0.0005 61 | LR_POLICY: steps_with_relative_lrs 62 | STEPS_ITER: 1500 63 | STEPS: [0, 4, 8] 64 | LRS: [1, 0.1, 0.01] 65 | MAX_EPOCH: 10 66 | MOMENTUM: 0.9 67 | WEIGHT_DECAY: 5e-5 68 | WARMUP_EPOCHS: 1 69 | WARMUP_START_LR: 0.00002 70 | OPTIM_METHOD: adam 71 | DAMPENING: 0.0 72 | NESTEROV: true 73 | # add bn, use same lr settings as server 74 | 75 | VIDEO: 76 | HEAD: 77 | NAME: CNN_HyRSM_1shot # or CNN_HyRSM_5shot 78 | BACKBONE_NAME: "resnet50" 79 | 80 | BACKBONE: 81 | META_ARCH: Identity 82 | 83 | NUM_GPUS: 4 84 | OUTPUT_DIR: output/HyRSM_SSv2_Small_1shot 85 | 86 | 87 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/hmdb51/HyRSM_HMDB51_3shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.15 9 | NUM_CLASS: 31 10 | DATASET_FEW: HMDB_few_shot 11 | DATASET: Ssv2_few_shot 12 | META_BATCH: true # meta or not 13 | NUM_SAMPLES: 1000000 14 | WAY: 5 15 | SHOT: 3 16 | QUERY_PER_CLASS: 5 17 | QUERY_PER_CLASS_TEST: 1 18 | NUM_TRAIN_TASKS: 2000 19 | NUM_TEST_TASKS: 10000 20 | VAL_FRE_ITER: 50 21 | BATCH_SIZE: 4 # Same with NUM_GPUS 22 | BATCH_SIZE_PER_TASK: 2 23 | CHECKPOINT_FILE_PATH: "" 24 | EVAL_PERIOD: 2 25 | NUM_FOLDS: 1 26 | 27 | AUGMENTATION: 28 | COLOR_AUG: false 29 | NO_RANDOM_ERASE: false 30 | SUPPORT_QUERY_DIFF: true 31 | 32 | TEST: 33 | ENABLE: false 34 | DATASET: Ssv2_few_shot 35 | BATCH_SIZE: 4 # Same with NUM_GPUS 36 | TEST_SET: val 37 | NUM_ENSEMBLE_VIEWS: 1 38 | SPATIAL_CROPS: cctltr 39 | AUTOMATIC_MULTI_SCALE_TEST: false 40 | UPLOAD_CLASSIFIER_RESULTS: true 41 | 42 | DATA: 43 | DATA_ROOT_DIR: /mnt/workspace/hmdb51/ 44 | ANNO_DIR: ./configs/projects/hyrsm/hmdb51/ 45 | NUM_INPUT_FRAMES: 8 46 | SAMPLING_RATE: 50 # 50 47 | # SAMPLING_RATE_TRAIN: 3 48 | SAMPLING_UNIFORM: false 49 | TRAIN_JITTER_SCALES: [256, 256] 50 | TRAIN_CROP_SIZE: 224 # 84 # 224 51 | TEST_CROP_SIZE: 224 52 | TEST_SCALE: 256 53 | TEST_CENTER_CROP: true 54 | CLIP_INTERVAL: 8 55 | FPS: 12 56 | TARGET_FPS: 12 57 | 58 | 59 | LOG_PERIOD: 50 60 | SOLVER: 61 | BASE_LR: 0.00005 # 0.0001 # 0.0005 62 | LR_POLICY: steps_with_relative_lrs 63 | STEPS_ITER: 300 64 | STEPS: [0, 2, 3, 7] 65 | LRS: [1, 0.5, 0.1, 0.01] 66 | MAX_EPOCH: 10 67 | MOMENTUM: 0.9 68 | WEIGHT_DECAY: 5e-4 69 | WARMUP_EPOCHS: 1 70 | WARMUP_START_LR: 0.00001 71 | OPTIM_METHOD: adam 72 | DAMPENING: 0.0 73 | NESTEROV: true 74 | # add bn, use same lr settings as server 75 | 76 | VIDEO: 77 | HEAD: 78 | NAME: CNN_HyRSM_5shot 79 | BACKBONE_NAME: "resnet50" 80 | 81 | BACKBONE: 82 | META_ARCH: Identity 83 | 84 | NUM_GPUS: 4 85 | OUTPUT_DIR: output/HyRSM_HMDB51_3shot 86 | 87 | 88 | 89 | 
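# Hypothetical launch command for the config above, assumed from the run pattern
# documented in this repo's project READMEs (the config path is from the repo tree):
#   python runs/run.py --cfg configs/projects/hyrsm/hmdb51/HyRSM_HMDB51_3shot_v1.yaml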
-------------------------------------------------------------------------------- /configs/projects/hyrsm/hmdb51/HyRSM_HMDB51_4shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.15 9 | NUM_CLASS: 31 10 | DATASET_FEW: HMDB_few_shot 11 | DATASET: Ssv2_few_shot 12 | META_BATCH: true # meta or not 13 | NUM_SAMPLES: 1000000 14 | WAY: 5 15 | SHOT: 4 16 | QUERY_PER_CLASS: 4 17 | QUERY_PER_CLASS_TEST: 1 18 | NUM_TRAIN_TASKS: 2000 19 | NUM_TEST_TASKS: 10000 20 | VAL_FRE_ITER: 50 21 | BATCH_SIZE: 4 # Same with NUM_GPUS 22 | BATCH_SIZE_PER_TASK: 2 23 | CHECKPOINT_FILE_PATH: "" 24 | EVAL_PERIOD: 2 25 | NUM_FOLDS: 1 26 | 27 | AUGMENTATION: 28 | COLOR_AUG: false 29 | NO_RANDOM_ERASE: false 30 | SUPPORT_QUERY_DIFF: true 31 | 32 | TEST: 33 | ENABLE: false 34 | DATASET: Ssv2_few_shot 35 | BATCH_SIZE: 4 # Same with NUM_GPUS 36 | TEST_SET: val 37 | NUM_ENSEMBLE_VIEWS: 1 38 | SPATIAL_CROPS: cctltr 39 | AUTOMATIC_MULTI_SCALE_TEST: false 40 | UPLOAD_CLASSIFIER_RESULTS: true 41 | 42 | DATA: 43 | DATA_ROOT_DIR: /mnt/workspace/hmdb51/ 44 | ANNO_DIR: ./configs/projects/hyrsm/hmdb51/ 45 | NUM_INPUT_FRAMES: 8 46 | SAMPLING_RATE: 50 # 50 47 | # SAMPLING_RATE_TRAIN: 3 48 | SAMPLING_UNIFORM: false 49 | TRAIN_JITTER_SCALES: [256, 256] 50 | TRAIN_CROP_SIZE: 224 # 84 # 224 51 | TEST_CROP_SIZE: 224 52 | TEST_SCALE: 256 53 | TEST_CENTER_CROP: true 54 | CLIP_INTERVAL: 8 55 | FPS: 12 56 | TARGET_FPS: 12 57 | 58 | 59 | LOG_PERIOD: 50 60 | SOLVER: 61 | BASE_LR: 0.00005 # 0.0001 # 0.0005 62 | LR_POLICY: steps_with_relative_lrs 63 | STEPS_ITER: 300 64 | STEPS: [0, 2, 3, 7] 65 | LRS: [1, 0.5, 0.1, 0.01] 66 | MAX_EPOCH: 10 67 | MOMENTUM: 0.9 68 | WEIGHT_DECAY: 5e-4 69 | WARMUP_EPOCHS: 1 70 | WARMUP_START_LR: 0.00001 71 | OPTIM_METHOD: adam 72 | DAMPENING: 0.0 73 | NESTEROV: true 74 | # add bn, use same lr settings as server 75 | 76 | VIDEO: 77 | HEAD: 78 | NAME: CNN_HyRSM_5shot 79 | BACKBONE_NAME: "resnet50" 80 | 81 | BACKBONE: 82 | META_ARCH: Identity 83 | 84 | NUM_GPUS: 4 85 | OUTPUT_DIR: output/HyRSM_HMDB51_4shot 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/hmdb51/HyRSM_HMDB51_5shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.15 9 | NUM_CLASS: 31 10 | DATASET_FEW: HMDB_few_shot 11 | DATASET: Ssv2_few_shot 12 | META_BATCH: true # meta or not 13 | NUM_SAMPLES: 1000000 14 | WAY: 5 15 | SHOT: 5 16 | QUERY_PER_CLASS: 3 17 | QUERY_PER_CLASS_TEST: 1 18 | NUM_TRAIN_TASKS: 2000 19 | NUM_TEST_TASKS: 10000 20 | VAL_FRE_ITER: 50 21 | BATCH_SIZE: 4 # Same with NUM_GPUS 22 | BATCH_SIZE_PER_TASK: 2 23 | CHECKPOINT_FILE_PATH: "" 24 | EVAL_PERIOD: 2 25 | NUM_FOLDS: 1 26 | 27 | AUGMENTATION: 28 | COLOR_AUG: false 29 | NO_RANDOM_ERASE: false 30 | SUPPORT_QUERY_DIFF: true 31 | 32 | TEST: 33 | ENABLE: false 34 | DATASET: Ssv2_few_shot 35 | BATCH_SIZE: 4 # Same with NUM_GPUS 36 | TEST_SET: val 37 | NUM_ENSEMBLE_VIEWS: 1 38 | SPATIAL_CROPS: cctltr 39 | AUTOMATIC_MULTI_SCALE_TEST: false 40 | UPLOAD_CLASSIFIER_RESULTS: true 41 | 42 | DATA: 43 | DATA_ROOT_DIR: /mnt/workspace/hmdb51/ 44 | ANNO_DIR: 
./configs/projects/hyrsm/hmdb51/ 45 | NUM_INPUT_FRAMES: 8 46 | SAMPLING_RATE: 50 # 50 47 | # SAMPLING_RATE_TRAIN: 3 48 | SAMPLING_UNIFORM: false 49 | TRAIN_JITTER_SCALES: [256, 256] 50 | TRAIN_CROP_SIZE: 224 # 84 # 224 51 | TEST_CROP_SIZE: 224 52 | TEST_SCALE: 256 53 | TEST_CENTER_CROP: true 54 | CLIP_INTERVAL: 8 55 | FPS: 12 56 | TARGET_FPS: 12 57 | 58 | 59 | LOG_PERIOD: 50 60 | SOLVER: 61 | BASE_LR: 0.00005 # 0.0001 # 0.0005 62 | LR_POLICY: steps_with_relative_lrs 63 | STEPS_ITER: 300 64 | STEPS: [0, 2, 3, 7] 65 | LRS: [1, 0.5, 0.1, 0.01] 66 | MAX_EPOCH: 10 67 | MOMENTUM: 0.9 68 | WEIGHT_DECAY: 5e-4 69 | WARMUP_EPOCHS: 1 70 | WARMUP_START_LR: 0.00001 71 | OPTIM_METHOD: adam 72 | DAMPENING: 0.0 73 | NESTEROV: true 74 | # add bn, use same lr settings as server 75 | 76 | VIDEO: 77 | HEAD: 78 | NAME: CNN_HyRSM_5shot 79 | BACKBONE_NAME: "resnet50" 80 | 81 | BACKBONE: 82 | META_ARCH: Identity 83 | 84 | NUM_GPUS: 4 85 | OUTPUT_DIR: output/HyRSM_HMDB51_5shot 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/hmdb51/HyRSM_HMDB51_2shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.15 9 | NUM_CLASS: 31 10 | DATASET_FEW: HMDB_few_shot 11 | DATASET: Ssv2_few_shot 12 | META_BATCH: true # meta or not 13 | NUM_SAMPLES: 1000000 14 | WAY: 5 15 | SHOT: 2 16 | QUERY_PER_CLASS: 6 17 | QUERY_PER_CLASS_TEST: 1 18 | NUM_TRAIN_TASKS: 2000 19 | NUM_TEST_TASKS: 10000 20 | VAL_FRE_ITER: 50 21 | BATCH_SIZE: 4 # Same with NUM_GPUS 22 | BATCH_SIZE_PER_TASK: 2 23 | CHECKPOINT_FILE_PATH: "" 24 | EVAL_PERIOD: 2 25 | NUM_FOLDS: 1 26 | 27 | AUGMENTATION: 28 | COLOR_AUG: false 29 | NO_RANDOM_ERASE: false 30 | SUPPORT_QUERY_DIFF: true 31 | 32 | TEST: 33 | ENABLE: false 34 | DATASET: Ssv2_few_shot 35 | BATCH_SIZE: 4 # Same with NUM_GPUS 36 | TEST_SET: val 37 | NUM_ENSEMBLE_VIEWS: 1 38 | SPATIAL_CROPS: cctltr 39 | AUTOMATIC_MULTI_SCALE_TEST: false 40 | UPLOAD_CLASSIFIER_RESULTS: true 41 | 42 | DATA: 43 | DATA_ROOT_DIR: /mnt/workspace/hmdb51/ 44 | ANNO_DIR: ./configs/projects/hyrsm/hmdb51/ 45 | NUM_INPUT_FRAMES: 8 46 | SAMPLING_RATE: 50 # 50 47 | # SAMPLING_RATE_TRAIN: 3 48 | SAMPLING_UNIFORM: false 49 | TRAIN_JITTER_SCALES: [256, 256] 50 | TRAIN_CROP_SIZE: 224 # 84 # 224 51 | TEST_CROP_SIZE: 224 52 | TEST_SCALE: 256 53 | TEST_CENTER_CROP: true 54 | CLIP_INTERVAL: 8 55 | FPS: 12 56 | TARGET_FPS: 12 57 | 58 | 59 | LOG_PERIOD: 50 60 | SOLVER: 61 | BASE_LR: 0.00005 # 0.0001 # 0.0005 62 | LR_POLICY: steps_with_relative_lrs 63 | STEPS_ITER: 300 64 | STEPS: [0, 2, 3, 7] 65 | LRS: [1, 0.5, 0.1, 0.01] 66 | MAX_EPOCH: 10 67 | MOMENTUM: 0.9 68 | WEIGHT_DECAY: 5e-4 69 | WARMUP_EPOCHS: 1 70 | WARMUP_START_LR: 0.00001 71 | OPTIM_METHOD: adam 72 | DAMPENING: 0.0 73 | NESTEROV: true 74 | # add bn, use same lr settings as server 75 | 76 | VIDEO: 77 | HEAD: 78 | NAME: CNN_HyRSM_5shot 79 | BACKBONE_NAME: "resnet50" 80 | 81 | BACKBONE: 82 | META_ARCH: Identity 83 | 84 | NUM_GPUS: 4 85 | OUTPUT_DIR: output/HyRSM_HMDB51_2shot 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/ucf101/HyRSM_UCF101_1shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | 
TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.15 9 | NUM_CLASS: 70 10 | DATASET_FEW: HMDB_few_shot 11 | DATASET: Ssv2_few_shot 12 | META_BATCH: true # meta or not 13 | NUM_SAMPLES: 1000000 14 | WAY: 5 15 | SHOT: 1 16 | QUERY_PER_CLASS: 7 # 7 17 | QUERY_PER_CLASS_TEST: 1 18 | NUM_TRAIN_TASKS: 2500 19 | NUM_TEST_TASKS: 10000 20 | VAL_FRE_ITER: 100 21 | BATCH_SIZE: 4 # Same with NUM_GPUS 22 | BATCH_SIZE_PER_TASK: 4 23 | CHECKPOINT_FILE_PATH: "" 24 | EVAL_PERIOD: 2 25 | NUM_FOLDS: 1 26 | 27 | AUGMENTATION: 28 | COLOR_AUG: false 29 | NO_RANDOM_ERASE: false 30 | SUPPORT_QUERY_DIFF: true 31 | 32 | TEST: 33 | ENABLE: false 34 | DATASET: Ssv2_few_shot 35 | BATCH_SIZE: 2 # Same with NUM_GPUS 36 | TEST_SET: val 37 | NUM_ENSEMBLE_VIEWS: 1 38 | SPATIAL_CROPS: cctltr 39 | AUTOMATIC_MULTI_SCALE_TEST: false 40 | UPLOAD_CLASSIFIER_RESULTS: true 41 | 42 | DATA: 43 | DATA_ROOT_DIR: /mnt/workspace/ucf101/ 44 | ANNO_DIR: ./configs/projects/hyrsm/ucf101/ 45 | NUM_INPUT_FRAMES: 8 46 | SAMPLING_RATE: 50 # 50 47 | # SAMPLING_RATE_TRAIN: 3 48 | SAMPLING_UNIFORM: false 49 | TRAIN_JITTER_SCALES: [256, 256] 50 | TRAIN_CROP_SIZE: 224 # 84 # 224 51 | TEST_CROP_SIZE: 224 52 | TEST_SCALE: 256 53 | TEST_CENTER_CROP: true 54 | CLIP_INTERVAL: 8 55 | FPS: 12 56 | TARGET_FPS: 12 57 | 58 | LOG_PERIOD: 50 59 | SOLVER: 60 | BASE_LR: 0.00005 61 | LR_POLICY: steps_with_relative_lrs 62 | STEPS_ITER: 700 63 | STEPS: [0, 3, 5, 7] 64 | LRS: [1, 0.5, 0.1, 0.01] 65 | MAX_EPOCH: 10 66 | MOMENTUM: 0.9 67 | WEIGHT_DECAY: 5e-4 68 | WARMUP_EPOCHS: 1 69 | WARMUP_START_LR: 0.00001 70 | OPTIM_METHOD: adam 71 | DAMPENING: 0.0 72 | NESTEROV: true 73 | # add bn, use same lr settings as server 74 | 75 | VIDEO: 76 | HEAD: 77 | NAME: CNN_HyRSM_1shot # or CNN_HyRSM_5shot 78 | BACKBONE_NAME: "resnet50" 79 | 80 | BACKBONE: 81 | META_ARCH: Identity 82 | 83 | 84 | NUM_GPUS: 4 85 | OUTPUT_DIR: output/HyRSM_UCF101_1shot 86 | 87 | 88 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/kinetics100/HyRSM_K100_2shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.4 9 | NUM_CLASS: 64 10 | POSITION_A: 2048 11 | POSITION_B: 0.01 12 | DATASET: Ssv2_few_shot 13 | DATASET_FEW: Kinetics_few_shot 14 | META_BATCH: true # meta or not 15 | NUM_SAMPLES: 1000000 16 | WAY: 5 17 | SHOT: 2 18 | QUERY_PER_CLASS: 6 # 7 19 | QUERY_PER_CLASS_TEST: 1 20 | NUM_TRAIN_TASKS: 1200 21 | NUM_TEST_TASKS: 10000 22 | VAL_FRE_ITER: 50 23 | BATCH_SIZE: 4 # Same with NUM_GPUS 24 | BATCH_SIZE_PER_TASK: 4 25 | CHECKPOINT_FILE_PATH: "" 26 | EVAL_PERIOD: 2 27 | NUM_FOLDS: 1 28 | 29 | AUGMENTATION: 30 | COLOR_AUG: false 31 | NO_RANDOM_ERASE: true 32 | SUPPORT_QUERY_DIFF: true 33 | 34 | TEST: 35 | ENABLE: false 36 | DATASET: Ssv2_few_shot 37 | BATCH_SIZE: 4 # Same with NUM_GPUS 38 | TEST_SET: val 39 | NUM_ENSEMBLE_VIEWS: 1 40 | SPATIAL_CROPS: cctltr 41 | AUTOMATIC_MULTI_SCALE_TEST: false 42 | UPLOAD_CLASSIFIER_RESULTS: true 43 | 44 | DATA: 45 | DATA_ROOT_DIR: /mnt/workspace/kinetics400_fb/ 46 | ANNO_DIR: ./configs/projects/hyrsm/kinetics100/ 47 | NUM_INPUT_FRAMES: 8 48 | SAMPLING_RATE: 35 49 | SAMPLING_UNIFORM: false 50 | TRAIN_JITTER_SCALES: [256, 256] 51 | TRAIN_CROP_SIZE: 224 # 84 # 224 52 | TEST_CROP_SIZE: 224 53 | TEST_SCALE: 256 54 | CLIP_INTERVAL: 8 55 | 
TEST_CENTER_CROP: true 56 | FPS: 12 57 | TARGET_FPS: 12 58 | 59 | LOG_PERIOD: 50 60 | SOLVER: 61 | BASE_LR: 0.000022 62 | LR_POLICY: steps_with_relative_lrs 63 | STEPS_ITER: 100 64 | STEPS: [0, 4, 6, 9] 65 | LRS: [1, 0.5, 0.1, 0.05] 66 | MAX_EPOCH: 10 67 | MOMENTUM: 0.9 68 | WEIGHT_DECAY: 5e-4 69 | WARMUP_EPOCHS: 1 70 | WARMUP_START_LR: 0.000005 71 | OPTIM_METHOD: adam 72 | DAMPENING: 0.0 73 | NESTEROV: true 74 | # add bn, use same lr settings as server 75 | 76 | VIDEO: 77 | HEAD: 78 | NAME: CNN_HyRSM_5shot 79 | BACKBONE_NAME: "resnet50" 80 | 81 | BACKBONE: 82 | META_ARCH: Identity 83 | 84 | 85 | NUM_GPUS: 4 86 | OUTPUT_DIR: output/HyRSM_K100_2shot 87 | 88 | 89 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/kinetics100/HyRSM_K100_3shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.4 9 | NUM_CLASS: 64 10 | POSITION_A: 2048 11 | POSITION_B: 0.01 12 | DATASET: Ssv2_few_shot 13 | DATASET_FEW: Kinetics_few_shot 14 | META_BATCH: true # meta or not 15 | NUM_SAMPLES: 1000000 16 | WAY: 5 17 | SHOT: 3 18 | QUERY_PER_CLASS: 5 # 7 19 | QUERY_PER_CLASS_TEST: 1 20 | NUM_TRAIN_TASKS: 1200 21 | NUM_TEST_TASKS: 10000 22 | VAL_FRE_ITER: 50 23 | BATCH_SIZE: 4 # Same with NUM_GPUS 24 | BATCH_SIZE_PER_TASK: 4 25 | CHECKPOINT_FILE_PATH: "" 26 | EVAL_PERIOD: 2 27 | NUM_FOLDS: 1 28 | 29 | AUGMENTATION: 30 | COLOR_AUG: false 31 | NO_RANDOM_ERASE: true 32 | SUPPORT_QUERY_DIFF: true 33 | 34 | TEST: 35 | ENABLE: false 36 | DATASET: Ssv2_few_shot 37 | BATCH_SIZE: 4 # Same with NUM_GPUS 38 | TEST_SET: val 39 | NUM_ENSEMBLE_VIEWS: 1 40 | SPATIAL_CROPS: cctltr 41 | AUTOMATIC_MULTI_SCALE_TEST: false 42 | UPLOAD_CLASSIFIER_RESULTS: true 43 | 44 | DATA: 45 | DATA_ROOT_DIR: /mnt/workspace/kinetics400_fb/ 46 | ANNO_DIR: ./configs/projects/hyrsm/kinetics100/ 47 | NUM_INPUT_FRAMES: 8 48 | SAMPLING_RATE: 35 49 | SAMPLING_UNIFORM: false 50 | TRAIN_JITTER_SCALES: [256, 256] 51 | TRAIN_CROP_SIZE: 224 # 84 # 224 52 | TEST_CROP_SIZE: 224 53 | TEST_SCALE: 256 54 | CLIP_INTERVAL: 8 55 | TEST_CENTER_CROP: true 56 | FPS: 12 57 | TARGET_FPS: 12 58 | 59 | LOG_PERIOD: 50 60 | SOLVER: 61 | BASE_LR: 0.000022 62 | LR_POLICY: steps_with_relative_lrs 63 | STEPS_ITER: 100 64 | STEPS: [0, 4, 6, 9] 65 | LRS: [1, 0.5, 0.1, 0.05] 66 | MAX_EPOCH: 10 67 | MOMENTUM: 0.9 68 | WEIGHT_DECAY: 5e-4 69 | WARMUP_EPOCHS: 1 70 | WARMUP_START_LR: 0.000005 71 | OPTIM_METHOD: adam 72 | DAMPENING: 0.0 73 | NESTEROV: true 74 | # add bn, use same lr settings as server 75 | 76 | VIDEO: 77 | HEAD: 78 | NAME: CNN_HyRSM_5shot 79 | BACKBONE_NAME: "resnet50" 80 | 81 | BACKBONE: 82 | META_ARCH: Identity 83 | 84 | 85 | NUM_GPUS: 4 86 | OUTPUT_DIR: output/HyRSM_K100_3shot 87 | 88 | 89 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/kinetics100/HyRSM_K100_4shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.4 9 | NUM_CLASS: 64 10 | POSITION_A: 2048 11 | POSITION_B: 0.01 12 | DATASET: Ssv2_few_shot 13 | DATASET_FEW: Kinetics_few_shot 14 | META_BATCH: true # meta or not 15 | NUM_SAMPLES: 1000000 16 | WAY: 5 
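  # One training episode draws WAY * SHOT support clips plus
  # WAY * QUERY_PER_CLASS query clips, the standard few-shot episode layout.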
17 | SHOT: 4 18 | QUERY_PER_CLASS: 4 # 7 19 | QUERY_PER_CLASS_TEST: 1 20 | NUM_TRAIN_TASKS: 1200 21 | NUM_TEST_TASKS: 10000 22 | VAL_FRE_ITER: 50 23 | BATCH_SIZE: 4 # Same with NUM_GPUS 24 | BATCH_SIZE_PER_TASK: 4 25 | CHECKPOINT_FILE_PATH: "" 26 | EVAL_PERIOD: 2 27 | NUM_FOLDS: 1 28 | 29 | AUGMENTATION: 30 | COLOR_AUG: false 31 | NO_RANDOM_ERASE: true 32 | SUPPORT_QUERY_DIFF: true 33 | 34 | TEST: 35 | ENABLE: false 36 | DATASET: Ssv2_few_shot 37 | BATCH_SIZE: 4 # Same with NUM_GPUS 38 | TEST_SET: val 39 | NUM_ENSEMBLE_VIEWS: 1 40 | SPATIAL_CROPS: cctltr 41 | AUTOMATIC_MULTI_SCALE_TEST: false 42 | UPLOAD_CLASSIFIER_RESULTS: true 43 | 44 | DATA: 45 | DATA_ROOT_DIR: /mnt/workspace/kinetics400_fb/ 46 | ANNO_DIR: ./configs/projects/hyrsm/kinetics100/ 47 | NUM_INPUT_FRAMES: 8 48 | SAMPLING_RATE: 35 49 | SAMPLING_UNIFORM: false 50 | TRAIN_JITTER_SCALES: [256, 256] 51 | TRAIN_CROP_SIZE: 224 # 84 # 224 52 | TEST_CROP_SIZE: 224 53 | TEST_SCALE: 256 54 | CLIP_INTERVAL: 8 55 | TEST_CENTER_CROP: true 56 | FPS: 12 57 | TARGET_FPS: 12 58 | 59 | LOG_PERIOD: 50 60 | SOLVER: 61 | BASE_LR: 0.000022 62 | LR_POLICY: steps_with_relative_lrs 63 | STEPS_ITER: 100 64 | STEPS: [0, 4, 6, 9] 65 | LRS: [1, 0.5, 0.1, 0.05] 66 | MAX_EPOCH: 10 67 | MOMENTUM: 0.9 68 | WEIGHT_DECAY: 5e-4 69 | WARMUP_EPOCHS: 1 70 | WARMUP_START_LR: 0.000005 71 | OPTIM_METHOD: adam 72 | DAMPENING: 0.0 73 | NESTEROV: true 74 | # add bn, use same lr settings as server 75 | 76 | VIDEO: 77 | HEAD: 78 | NAME: CNN_HyRSM_5shot 79 | BACKBONE_NAME: "resnet50" 80 | 81 | BACKBONE: 82 | META_ARCH: Identity 83 | 84 | 85 | NUM_GPUS: 4 86 | OUTPUT_DIR: output/HyRSM_K100_4shot 87 | 88 | 89 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/kinetics100/HyRSM_K100_5shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.4 9 | NUM_CLASS: 64 10 | POSITION_A: 2048 11 | POSITION_B: 0.01 12 | DATASET: Ssv2_few_shot 13 | DATASET_FEW: Kinetics_few_shot 14 | META_BATCH: true # meta or not 15 | NUM_SAMPLES: 1000000 16 | WAY: 5 17 | SHOT: 5 18 | QUERY_PER_CLASS: 3 # 7 19 | QUERY_PER_CLASS_TEST: 1 20 | NUM_TRAIN_TASKS: 1200 21 | NUM_TEST_TASKS: 10000 22 | VAL_FRE_ITER: 50 23 | BATCH_SIZE: 4 # Same with NUM_GPUS 24 | BATCH_SIZE_PER_TASK: 4 25 | CHECKPOINT_FILE_PATH: "" 26 | EVAL_PERIOD: 2 27 | NUM_FOLDS: 1 28 | 29 | AUGMENTATION: 30 | COLOR_AUG: false 31 | NO_RANDOM_ERASE: true 32 | SUPPORT_QUERY_DIFF: true 33 | 34 | TEST: 35 | ENABLE: false 36 | DATASET: Ssv2_few_shot 37 | BATCH_SIZE: 4 # Same with NUM_GPUS 38 | TEST_SET: val 39 | NUM_ENSEMBLE_VIEWS: 1 40 | SPATIAL_CROPS: cctltr 41 | AUTOMATIC_MULTI_SCALE_TEST: false 42 | UPLOAD_CLASSIFIER_RESULTS: true 43 | 44 | DATA: 45 | DATA_ROOT_DIR: /mnt/workspace/kinetics400_fb/ 46 | ANNO_DIR: ./configs/projects/hyrsm/kinetics100/ 47 | NUM_INPUT_FRAMES: 8 48 | SAMPLING_RATE: 35 49 | SAMPLING_UNIFORM: false 50 | TRAIN_JITTER_SCALES: [256, 256] 51 | TRAIN_CROP_SIZE: 224 # 84 # 224 52 | TEST_CROP_SIZE: 224 53 | TEST_SCALE: 256 54 | CLIP_INTERVAL: 8 55 | TEST_CENTER_CROP: true 56 | FPS: 12 57 | TARGET_FPS: 12 58 | 59 | LOG_PERIOD: 50 60 | SOLVER: 61 | BASE_LR: 0.000022 62 | LR_POLICY: steps_with_relative_lrs 63 | STEPS_ITER: 100 64 | STEPS: [0, 4, 6, 9] 65 | LRS: [1, 0.5, 0.1, 0.05] 66 | MAX_EPOCH: 10 67 | MOMENTUM: 0.9 68 | WEIGHT_DECAY: 5e-4 69 | 
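  # steps_with_relative_lrs: the lr at epoch e is BASE_LR * LRS[i] for the last
  # STEPS[i] <= e (see models/utils/lr_policy.py); after the one-epoch warmup
  # this gives 2.2e-5, then 1.1e-5 from epoch 4, and so on.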
WARMUP_EPOCHS: 1 70 | WARMUP_START_LR: 0.000005 71 | OPTIM_METHOD: adam 72 | DAMPENING: 0.0 73 | NESTEROV: true 74 | # add bn, use same lr settings as server 75 | 76 | VIDEO: 77 | HEAD: 78 | NAME: CNN_HyRSM_5shot 79 | BACKBONE_NAME: "resnet50" 80 | 81 | BACKBONE: 82 | META_ARCH: Identity 83 | 84 | 85 | NUM_GPUS: 4 86 | OUTPUT_DIR: output/HyRSM_K100_5shot 87 | 88 | 89 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/epic_kitchens/HyRSM_Epic_2shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_CLASSIFICATION_VALUE: 0.6 8 | POSITION_A: 2048 9 | POSITION_B: 0.05 10 | USE_LOCAL: true 11 | NUM_CLASS: 60 12 | DATASET: Ssv2_few_shot 13 | DATASET_FEW: Kinetics_few_shot 14 | META_BATCH: true # meta or not 15 | NUM_SAMPLES: 1000000 16 | WAY: 5 17 | SHOT: 2 18 | QUERY_PER_CLASS: 6 # 7 19 | QUERY_PER_CLASS_TEST: 1 20 | NUM_TRAIN_TASKS: 2000 21 | NUM_TEST_TASKS: 10000 22 | VAL_FRE_ITER: 400 23 | BATCH_SIZE: 4 # Same with NUM_GPUS 24 | BATCH_SIZE_PER_TASK: 4 25 | CHECKPOINT_FILE_PATH: "" 26 | EVAL_PERIOD: 2 27 | NUM_FOLDS: 1 28 | 29 | AUGMENTATION: 30 | COLOR_AUG: false 31 | NO_RANDOM_ERASE: true 32 | SUPPORT_QUERY_DIFF: true 33 | 34 | TEST: 35 | ENABLE: false 36 | DATASET: Ssv2_few_shot 37 | BATCH_SIZE: 4 # Same with NUM_GPUS 38 | TEST_SET: val 39 | NUM_ENSEMBLE_VIEWS: 1 40 | SPATIAL_CROPS: cctltr 41 | AUTOMATIC_MULTI_SCALE_TEST: false 42 | UPLOAD_CLASSIFIER_RESULTS: true 43 | 44 | DATA: 45 | DATA_ROOT_DIR: /mnt/workspace/epic_kitchens/ 46 | ANNO_DIR: ./configs/projects/hyrsm/epic_kitchens/ 47 | NUM_INPUT_FRAMES: 8 48 | SAMPLING_RATE: 50 49 | SAMPLING_UNIFORM: false 50 | TRAIN_JITTER_SCALES: [256, 320] 51 | TRAIN_CROP_SIZE: 224 # 84 # 224 52 | TEST_CROP_SIZE: 224 53 | TEST_SCALE: [256, 320] 54 | TEST_CENTER_CROP: true 55 | CLIP_INTERVAL: 8 56 | FPS: 12 57 | TARGET_FPS: 12 58 | 59 | LOG_PERIOD: 50 60 | SOLVER: 61 | BASE_LR: 0.00005 # 0.00002--> 67.4 62 | LR_POLICY: steps_with_relative_lrs 63 | STEPS_ITER: 400 64 | STEPS: [0, 4, 7] 65 | LRS: [1, 0.1, 0.01] 66 | MAX_EPOCH: 10 67 | MOMENTUM: 0.9 68 | WEIGHT_DECAY: 5e-5 # 3e-5 --> 67.4 69 | WARMUP_EPOCHS: 1 70 | WARMUP_START_LR: 0.00001 71 | OPTIM_METHOD: adam 72 | DAMPENING: 0.0 73 | NESTEROV: true 74 | # add bn, use same lr settings as server 75 | 76 | VIDEO: 77 | HEAD: 78 | NAME: CNN_HyRSM_5shot 79 | BACKBONE_NAME: "resnet50" 80 | 81 | BACKBONE: 82 | META_ARCH: Identity 83 | 84 | NUM_GPUS: 4 85 | OUTPUT_DIR: output/HyRSM_Epic_2shot -------------------------------------------------------------------------------- /configs/projects/hyrsm/epic_kitchens/HyRSM_Epic_3shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_CLASSIFICATION_VALUE: 0.6 8 | POSITION_A: 2048 9 | POSITION_B: 0.05 10 | USE_LOCAL: true 11 | NUM_CLASS: 60 12 | DATASET: Ssv2_few_shot 13 | DATASET_FEW: Kinetics_few_shot 14 | META_BATCH: true # meta or not 15 | NUM_SAMPLES: 1000000 16 | WAY: 5 17 | SHOT: 3 18 | QUERY_PER_CLASS: 5 # 7 19 | QUERY_PER_CLASS_TEST: 1 20 | NUM_TRAIN_TASKS: 2000 21 | NUM_TEST_TASKS: 10000 22 | VAL_FRE_ITER: 400 23 | BATCH_SIZE: 4 # Same with NUM_GPUS 24 | BATCH_SIZE_PER_TASK: 4 25 | CHECKPOINT_FILE_PATH: "" 26 | EVAL_PERIOD: 2 27 | NUM_FOLDS: 1 
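  # BATCH_SIZE is kept equal to NUM_GPUS, so with META_BATCH each GPU
  # presumably processes one few-shot task per iteration.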
28 | 29 | AUGMENTATION: 30 | COLOR_AUG: false 31 | NO_RANDOM_ERASE: true 32 | SUPPORT_QUERY_DIFF: true 33 | 34 | TEST: 35 | ENABLE: false 36 | DATASET: Ssv2_few_shot 37 | BATCH_SIZE: 4 # Same with NUM_GPUS 38 | TEST_SET: val 39 | NUM_ENSEMBLE_VIEWS: 1 40 | SPATIAL_CROPS: cctltr 41 | AUTOMATIC_MULTI_SCALE_TEST: false 42 | UPLOAD_CLASSIFIER_RESULTS: true 43 | 44 | DATA: 45 | DATA_ROOT_DIR: /mnt/workspace/epic_kitchens/ 46 | ANNO_DIR: ./configs/projects/hyrsm/epic_kitchens/ 47 | NUM_INPUT_FRAMES: 8 48 | SAMPLING_RATE: 50 49 | SAMPLING_UNIFORM: false 50 | TRAIN_JITTER_SCALES: [256, 320] 51 | TRAIN_CROP_SIZE: 224 # 84 # 224 52 | TEST_CROP_SIZE: 224 53 | TEST_SCALE: [256, 320] 54 | TEST_CENTER_CROP: true 55 | CLIP_INTERVAL: 8 56 | FPS: 12 57 | TARGET_FPS: 12 58 | 59 | LOG_PERIOD: 50 60 | SOLVER: 61 | BASE_LR: 0.00005 # 0.00002--> 67.4 62 | LR_POLICY: steps_with_relative_lrs 63 | STEPS_ITER: 400 64 | STEPS: [0, 4, 7] 65 | LRS: [1, 0.1, 0.01] 66 | MAX_EPOCH: 10 67 | MOMENTUM: 0.9 68 | WEIGHT_DECAY: 5e-5 # 3e-5 --> 67.4 69 | WARMUP_EPOCHS: 1 70 | WARMUP_START_LR: 0.00001 71 | OPTIM_METHOD: adam 72 | DAMPENING: 0.0 73 | NESTEROV: true 74 | # add bn, use same lr settings as server 75 | 76 | VIDEO: 77 | HEAD: 78 | NAME: CNN_HyRSM_5shot 79 | BACKBONE_NAME: "resnet50" 80 | 81 | BACKBONE: 82 | META_ARCH: Identity 83 | 84 | NUM_GPUS: 4 85 | OUTPUT_DIR: output/HyRSM_Epic_3shot -------------------------------------------------------------------------------- /configs/projects/hyrsm/epic_kitchens/HyRSM_Epic_4shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_CLASSIFICATION_VALUE: 0.6 8 | POSITION_A: 2048 9 | POSITION_B: 0.05 10 | USE_LOCAL: true 11 | NUM_CLASS: 60 12 | DATASET: Ssv2_few_shot 13 | DATASET_FEW: Kinetics_few_shot 14 | META_BATCH: true # meta or not 15 | NUM_SAMPLES: 1000000 16 | WAY: 5 17 | SHOT: 4 18 | QUERY_PER_CLASS: 4 # 7 19 | QUERY_PER_CLASS_TEST: 1 20 | NUM_TRAIN_TASKS: 2000 21 | NUM_TEST_TASKS: 10000 22 | VAL_FRE_ITER: 400 23 | BATCH_SIZE: 4 # Same with NUM_GPUS 24 | BATCH_SIZE_PER_TASK: 4 25 | CHECKPOINT_FILE_PATH: "" 26 | EVAL_PERIOD: 2 27 | NUM_FOLDS: 1 28 | 29 | AUGMENTATION: 30 | COLOR_AUG: false 31 | NO_RANDOM_ERASE: true 32 | SUPPORT_QUERY_DIFF: true 33 | 34 | TEST: 35 | ENABLE: false 36 | DATASET: Ssv2_few_shot 37 | BATCH_SIZE: 4 # Same with NUM_GPUS 38 | TEST_SET: val 39 | NUM_ENSEMBLE_VIEWS: 1 40 | SPATIAL_CROPS: cctltr 41 | AUTOMATIC_MULTI_SCALE_TEST: false 42 | UPLOAD_CLASSIFIER_RESULTS: true 43 | 44 | DATA: 45 | DATA_ROOT_DIR: /mnt/workspace/epic_kitchens/ 46 | ANNO_DIR: ./configs/projects/hyrsm/epic_kitchens/ 47 | NUM_INPUT_FRAMES: 8 48 | SAMPLING_RATE: 50 49 | SAMPLING_UNIFORM: false 50 | TRAIN_JITTER_SCALES: [256, 320] 51 | TRAIN_CROP_SIZE: 224 # 84 # 224 52 | TEST_CROP_SIZE: 224 53 | TEST_SCALE: [256, 320] 54 | TEST_CENTER_CROP: true 55 | CLIP_INTERVAL: 8 56 | FPS: 12 57 | TARGET_FPS: 12 58 | 59 | LOG_PERIOD: 50 60 | SOLVER: 61 | BASE_LR: 0.00005 # 0.00002--> 67.4 62 | LR_POLICY: steps_with_relative_lrs 63 | STEPS_ITER: 400 64 | STEPS: [0, 4, 7] 65 | LRS: [1, 0.1, 0.01] 66 | MAX_EPOCH: 10 67 | MOMENTUM: 0.9 68 | WEIGHT_DECAY: 5e-5 # 3e-5 --> 67.4 69 | WARMUP_EPOCHS: 1 70 | WARMUP_START_LR: 0.00001 71 | OPTIM_METHOD: adam 72 | DAMPENING: 0.0 73 | NESTEROV: true 74 | # add bn, use same lr settings as server 75 | 76 | VIDEO: 77 | HEAD: 78 | NAME: CNN_HyRSM_5shot 79 | BACKBONE_NAME: 
"resnet50" 80 | 81 | BACKBONE: 82 | META_ARCH: Identity 83 | 84 | NUM_GPUS: 4 85 | OUTPUT_DIR: output/HyRSM_Epic_4shot -------------------------------------------------------------------------------- /configs/projects/hyrsm/epic_kitchens/HyRSM_Epic_5shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_CLASSIFICATION_VALUE: 0.6 8 | POSITION_A: 2048 9 | POSITION_B: 0.05 10 | USE_LOCAL: true 11 | NUM_CLASS: 60 12 | DATASET: Ssv2_few_shot 13 | DATASET_FEW: Kinetics_few_shot 14 | META_BATCH: true # meta or not 15 | NUM_SAMPLES: 1000000 16 | WAY: 5 17 | SHOT: 5 18 | QUERY_PER_CLASS: 3 # 7 19 | QUERY_PER_CLASS_TEST: 1 20 | NUM_TRAIN_TASKS: 2000 21 | NUM_TEST_TASKS: 10000 22 | VAL_FRE_ITER: 400 23 | BATCH_SIZE: 4 # Same with NUM_GPUS 24 | BATCH_SIZE_PER_TASK: 4 25 | CHECKPOINT_FILE_PATH: "" 26 | EVAL_PERIOD: 2 27 | NUM_FOLDS: 1 28 | 29 | AUGMENTATION: 30 | COLOR_AUG: false 31 | NO_RANDOM_ERASE: true 32 | SUPPORT_QUERY_DIFF: true 33 | 34 | TEST: 35 | ENABLE: false 36 | DATASET: Ssv2_few_shot 37 | BATCH_SIZE: 4 # Same with NUM_GPUS 38 | TEST_SET: val 39 | NUM_ENSEMBLE_VIEWS: 1 40 | SPATIAL_CROPS: cctltr 41 | AUTOMATIC_MULTI_SCALE_TEST: false 42 | UPLOAD_CLASSIFIER_RESULTS: true 43 | 44 | DATA: 45 | DATA_ROOT_DIR: /mnt/workspace/epic_kitchens/ 46 | ANNO_DIR: ./configs/projects/hyrsm/epic_kitchens/ 47 | NUM_INPUT_FRAMES: 8 48 | SAMPLING_RATE: 50 49 | SAMPLING_UNIFORM: false 50 | TRAIN_JITTER_SCALES: [256, 320] 51 | TRAIN_CROP_SIZE: 224 # 84 # 224 52 | TEST_CROP_SIZE: 224 53 | TEST_SCALE: [256, 320] 54 | TEST_CENTER_CROP: true 55 | CLIP_INTERVAL: 8 56 | FPS: 12 57 | TARGET_FPS: 12 58 | 59 | LOG_PERIOD: 50 60 | SOLVER: 61 | BASE_LR: 0.00005 # 0.00002--> 67.4 62 | LR_POLICY: steps_with_relative_lrs 63 | STEPS_ITER: 400 64 | STEPS: [0, 4, 7] 65 | LRS: [1, 0.1, 0.01] 66 | MAX_EPOCH: 10 67 | MOMENTUM: 0.9 68 | WEIGHT_DECAY: 5e-5 # 3e-5 --> 67.4 69 | WARMUP_EPOCHS: 1 70 | WARMUP_START_LR: 0.00001 71 | OPTIM_METHOD: adam 72 | DAMPENING: 0.0 73 | NESTEROV: true 74 | # add bn, use same lr settings as server 75 | 76 | VIDEO: 77 | HEAD: 78 | NAME: CNN_HyRSM_5shot 79 | BACKBONE_NAME: "resnet50" 80 | 81 | BACKBONE: 82 | META_ARCH: Identity 83 | 84 | NUM_GPUS: 4 85 | OUTPUT_DIR: output/HyRSM_Epic_5shot -------------------------------------------------------------------------------- /configs/projects/hyrsm/hmdb51/HyRSM_HMDB51_1shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.15 9 | NUM_CLASS: 31 10 | DATASET_FEW: HMDB_few_shot 11 | DATASET: Ssv2_few_shot 12 | META_BATCH: true # meta or not 13 | NUM_SAMPLES: 1000000 14 | WAY: 5 15 | SHOT: 1 16 | QUERY_PER_CLASS: 7 17 | QUERY_PER_CLASS_TEST: 1 18 | NUM_TRAIN_TASKS: 2000 19 | NUM_TEST_TASKS: 10000 20 | VAL_FRE_ITER: 50 21 | BATCH_SIZE: 4 # Same with NUM_GPUS 22 | BATCH_SIZE_PER_TASK: 2 23 | CHECKPOINT_FILE_PATH: "" 24 | EVAL_PERIOD: 2 25 | NUM_FOLDS: 1 26 | 27 | AUGMENTATION: 28 | COLOR_AUG: false 29 | NO_RANDOM_ERASE: false 30 | SUPPORT_QUERY_DIFF: true 31 | 32 | TEST: 33 | ENABLE: false 34 | DATASET: Ssv2_few_shot 35 | BATCH_SIZE: 4 # Same with NUM_GPUS 36 | TEST_SET: val 37 | NUM_ENSEMBLE_VIEWS: 1 38 | SPATIAL_CROPS: cctltr 39 | 
AUTOMATIC_MULTI_SCALE_TEST: false 40 | UPLOAD_CLASSIFIER_RESULTS: true 41 | 42 | DATA: 43 | DATA_ROOT_DIR: /mnt/workspace/hmdb51/ 44 | ANNO_DIR: ./configs/projects/hyrsm/hmdb51/ 45 | NUM_INPUT_FRAMES: 8 46 | SAMPLING_RATE: 50 # 50 47 | # SAMPLING_RATE_TRAIN: 3 48 | SAMPLING_UNIFORM: false 49 | TRAIN_JITTER_SCALES: [256, 256] 50 | TRAIN_CROP_SIZE: 224 # 84 # 224 51 | TEST_CROP_SIZE: 224 52 | TEST_SCALE: 256 53 | TEST_CENTER_CROP: true 54 | CLIP_INTERVAL: 8 55 | FPS: 12 56 | TARGET_FPS: 12 57 | 58 | 59 | LOG_PERIOD: 50 60 | SOLVER: 61 | BASE_LR: 0.00005 # 0.0001 # 0.0005 62 | LR_POLICY: steps_with_relative_lrs 63 | STEPS_ITER: 300 64 | STEPS: [0, 2, 3, 7] 65 | LRS: [1, 0.5, 0.1, 0.01] 66 | MAX_EPOCH: 10 67 | MOMENTUM: 0.9 68 | WEIGHT_DECAY: 5e-4 69 | WARMUP_EPOCHS: 1 70 | WARMUP_START_LR: 0.00001 71 | OPTIM_METHOD: adam 72 | DAMPENING: 0.0 73 | NESTEROV: true 74 | # add bn, use same lr settings as server 75 | 76 | VIDEO: 77 | HEAD: 78 | NAME: CNN_HyRSM_1shot # or CNN_HyRSM_5shot 79 | BACKBONE_NAME: "resnet50" 80 | 81 | BACKBONE: 82 | META_ARCH: Identity 83 | 84 | NUM_GPUS: 4 85 | OUTPUT_DIR: output/HyRSM_HMDB51_1shot 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /models/module_zoo/branches/non_local.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ NonLocal block. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from models.base.base_blocks import BaseBranch, BRANCH_REGISTRY 10 | 11 | @BRANCH_REGISTRY.register() 12 | class NonLocal(BaseBranch): 13 | """ 14 | Non-local block. 15 | 16 | See Xiaolong Wang et al. 17 | Non-local Neural Networks. 
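    Given x of shape (n, c, t, h, w), the forward pass below flattens the
    t*h*w positions and computes scaled dot-product attention,
    softmax(q^T k / sqrt(c/2)) v, followed by a 1x1x1 conv, batch norm and a
    residual connection back onto x.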
18 | """ 19 | 20 | def __init__(self, cfg, block_idx): 21 | super(NonLocal, self).__init__(cfg, block_idx) 22 | 23 | self.dim_middle = self.dim_in // 2 24 | 25 | self.qconv = nn.Conv3d( 26 | self.dim_in, 27 | self.dim_middle, 28 | kernel_size=1, 29 | stride=1, 30 | padding=0 31 | ) 32 | 33 | self.kconv = nn.Conv3d( 34 | self.dim_in, 35 | self.dim_middle, 36 | kernel_size=1, 37 | stride=1, 38 | padding=0 39 | ) 40 | 41 | self.vconv = nn.Conv3d( 42 | self.dim_in, 43 | self.dim_middle, 44 | kernel_size=1, 45 | stride=1, 46 | padding=0 47 | ) 48 | 49 | self.out_conv = nn.Conv3d( 50 | self.dim_middle, 51 | self.num_filters, 52 | kernel_size=1, 53 | stride=1, 54 | padding=0, 55 | ) 56 | self.out_bn = nn.BatchNorm3d(self.num_filters, eps=1e-5, momentum=self.bn_mmt) 57 | 58 | def forward(self, x): 59 | n,c,t,h,w = x.shape 60 | 61 | query = self.qconv(x).view(n, self.dim_middle, -1) 62 | key = self.kconv(x).view(n, self.dim_middle, -1) 63 | value = self.vconv(x).view(n, self.dim_middle, -1) 64 | 65 | attn = torch.einsum("nct,ncp->ntp", (query, key)) 66 | attn = attn * (self.dim_middle ** -0.5) 67 | attn = F.softmax(attn, dim=2) 68 | 69 | out = torch.einsum("ntg,ncg->nct", (attn, value)) 70 | out = out.view(n, self.dim_middle, t, h, w) 71 | out = self.out_conv(out) 72 | out = self.out_bn(out) 73 | return x + out 74 | 75 | 76 | -------------------------------------------------------------------------------- /configs/projects/hyrsm/kinetics100/HyRSM_K100_1shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_LOCAL: true 8 | USE_CLASSIFICATION_VALUE: 0.4 9 | NUM_CLASS: 64 10 | POSITION_A: 2048 11 | POSITION_B: 0.01 12 | DATASET: Ssv2_few_shot 13 | DATASET_FEW: Kinetics_few_shot 14 | META_BATCH: true # meta or not 15 | NUM_SAMPLES: 1000000 16 | WAY: 5 17 | SHOT: 1 18 | QUERY_PER_CLASS: 7 # 7 19 | QUERY_PER_CLASS_TEST: 1 20 | NUM_TRAIN_TASKS: 1200 21 | NUM_TEST_TASKS: 10000 22 | VAL_FRE_ITER: 50 23 | BATCH_SIZE: 4 # Same with NUM_GPUS 24 | BATCH_SIZE_PER_TASK: 4 25 | CHECKPOINT_FILE_PATH: "" 26 | EVAL_PERIOD: 2 27 | NUM_FOLDS: 1 28 | 29 | AUGMENTATION: 30 | COLOR_AUG: false 31 | NO_RANDOM_ERASE: true 32 | SUPPORT_QUERY_DIFF: true 33 | 34 | TEST: 35 | ENABLE: false 36 | DATASET: Ssv2_few_shot 37 | BATCH_SIZE: 4 # Same with NUM_GPUS 38 | TEST_SET: val 39 | NUM_ENSEMBLE_VIEWS: 1 40 | SPATIAL_CROPS: cctltr 41 | AUTOMATIC_MULTI_SCALE_TEST: false 42 | UPLOAD_CLASSIFIER_RESULTS: true 43 | 44 | DATA: 45 | DATA_ROOT_DIR: /mnt/workspace/kinetics400_fb/ 46 | ANNO_DIR: ./configs/projects/hyrsm/kinetics100/ 47 | NUM_INPUT_FRAMES: 8 48 | SAMPLING_RATE: 35 49 | SAMPLING_UNIFORM: false 50 | TRAIN_JITTER_SCALES: [256, 256] 51 | TRAIN_CROP_SIZE: 224 # 84 # 224 52 | TEST_CROP_SIZE: 224 53 | TEST_SCALE: 256 54 | CLIP_INTERVAL: 8 55 | TEST_CENTER_CROP: true 56 | FPS: 12 57 | TARGET_FPS: 12 58 | 59 | LOG_PERIOD: 50 60 | SOLVER: 61 | BASE_LR: 0.000022 62 | LR_POLICY: steps_with_relative_lrs 63 | STEPS_ITER: 100 64 | STEPS: [0, 4, 6, 9] 65 | LRS: [1, 0.5, 0.1, 0.05] 66 | MAX_EPOCH: 10 67 | MOMENTUM: 0.9 68 | WEIGHT_DECAY: 5e-4 69 | WARMUP_EPOCHS: 1 70 | WARMUP_START_LR: 0.000005 71 | OPTIM_METHOD: adam 72 | DAMPENING: 0.0 73 | NESTEROV: true 74 | # add bn, use same lr settings as server 75 | 76 | VIDEO: 77 | HEAD: 78 | NAME: CNN_HyRSM_1shot # or CNN_HyRSM_5shot 79 | BACKBONE_NAME: "resnet50" 80 | 81 | BACKBONE: 82 | META_ARCH: Identity 83 | 84 | 
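# CNN_HyRSM_1shot matches SHOT: 1 above; the sibling configs in this directory
# switch to CNN_HyRSM_5shot for 2- to 5-shot training.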
85 | NUM_GPUS: 4 86 | OUTPUT_DIR: output/HyRSM_K100_1shot 87 | 88 | 89 | -------------------------------------------------------------------------------- /configs/pool/run/training/from_scratch_large.yaml: -------------------------------------------------------------------------------- 1 | PRETRAIN: 2 | ENABLE: false 3 | TRAIN: 4 | ENABLE: true 5 | DATASET: # !!@1 6 | BATCH_SIZE: 8 7 | LOG_FILE: training_log.log 8 | LOSS_FUNC: cross_entropy 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 1 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 5 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: false 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: true # !!@3 20 | DATASET: # !!@3 21 | BATCH_SIZE: 8 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: true 29 | AUTOMATIC_MULTI_SCALE_TEST_SPATIAL: true 30 | DATA: 31 | DATA_ROOT_DIR: 32 | ANNO_DIR: 33 | NUM_INPUT_FRAMES: 16 34 | NUM_INPUT_CHANNELS: 3 35 | SAMPLING_MODE: interval_based 36 | SAMPLING_RATE: 4 37 | TRAIN_JITTER_SCALES: [256, 320] 38 | TRAIN_CROP_SIZE: 224 39 | TEST_SCALE: 224 40 | TEST_CROP_SIZE: 224 41 | MEAN: [0.45, 0.45, 0.45] 42 | STD: [0.225, 0.225, 0.225] 43 | MULTI_LABEL: false 44 | ENSEMBLE_METHOD: sum 45 | FPS: 30 46 | TARGET_FPS: 30 47 | OPTIMIZER: 48 | BASE_LR: 0.001 49 | ADJUST_LR: false 50 | LR_POLICY: cosine 51 | MAX_EPOCH: 100 52 | MOMENTUM: 0.9 53 | WEIGHT_DECAY: 1e-4 54 | WARMUP_EPOCHS: 10 55 | WARMUP_START_LR: 0.0001 56 | OPTIM_METHOD: adam 57 | DAMPENING: 0.0 58 | NESTEROV: true 59 | BN: 60 | WEIGHT_DECAY: 0.0 61 | DATA_LOADER: 62 | NUM_WORKERS: 8 63 | PIN_MEMORY: false 64 | ENABLE_MULTI_THREAD_DECODE: true 65 | NUM_GPUS: 8 66 | SHARD_ID: 0 67 | NUM_SHARDS: 1 68 | RANDOM_SEED: 0 69 | OUTPUT_DIR: 70 | OUTPUT_CFG_FILE: configuration.log 71 | LOG_PERIOD: 10 72 | DIST_BACKEND: nccl 73 | LOG_MODEL_INFO: true 74 | LOG_CONFIG_INFO: true 75 | AUGMENTATION: 76 | COLOR_AUG: false 77 | BRIGHTNESS: 0.5 78 | CONTRAST: 0.5 79 | SATURATION: 0.5 80 | HUE: 0.25 81 | GRAYSCALE: 0.3 82 | CONSISTENT: true 83 | SHUFFLE: true 84 | GRAY_FIRST: true 85 | RATIO: [0.857142857142857, 1.1666666666666667] 86 | USE_GPU: false 87 | PAI: false -------------------------------------------------------------------------------- /configs/projects/hyrsm/epic_kitchens/HyRSM_Epic_1shot_v1.yaml: -------------------------------------------------------------------------------- 1 | _BASE: ../OTAM_base.yaml 2 | TASK_TYPE: few_shot_action 3 | RANDOM_SEED: 18 4 | TRAIN: 5 | ENABLE: true 6 | USE_CLASSIFICATION: true 7 | USE_CLASSIFICATION_VALUE: 0.6 8 | POSITION_A: 2048 9 | POSITION_B: 0.05 10 | USE_LOCAL: true 11 | NUM_CLASS: 60 12 | DATASET: Ssv2_few_shot 13 | DATASET_FEW: Kinetics_few_shot 14 | META_BATCH: true # meta or not 15 | NUM_SAMPLES: 1000000 16 | WAY: 5 17 | SHOT: 1 18 | QUERY_PER_CLASS: 7 # 7 19 | QUERY_PER_CLASS_TEST: 1 20 | NUM_TRAIN_TASKS: 2000 21 | NUM_TEST_TASKS: 10000 22 | VAL_FRE_ITER: 400 23 | BATCH_SIZE: 4 # Same with NUM_GPUS 24 | BATCH_SIZE_PER_TASK: 4 25 | CHECKPOINT_FILE_PATH: "" 26 | EVAL_PERIOD: 2 27 | NUM_FOLDS: 1 28 | 29 | AUGMENTATION: 30 | COLOR_AUG: false 31 | NO_RANDOM_ERASE: true 32 | SUPPORT_QUERY_DIFF: true 33 | 34 | TEST: 35 | ENABLE: false 36 | DATASET: Ssv2_few_shot 37 | BATCH_SIZE: 4 # Same with NUM_GPUS 38 | TEST_SET: val 39 | NUM_ENSEMBLE_VIEWS: 1 40 | SPATIAL_CROPS: cctltr 41 | AUTOMATIC_MULTI_SCALE_TEST: false 42 | 
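  # TEST.ENABLE stays false in these few-shot configs; evaluation appears to be
  # driven by the NUM_TEST_TASKS episodes configured under TRAIN instead.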
UPLOAD_CLASSIFIER_RESULTS: true 43 | 44 | DATA: 45 | DATA_ROOT_DIR: /mnt/workspace/epic_kitchens/ 46 | ANNO_DIR: ./configs/projects/hyrsm/epic_kitchens/ 47 | NUM_INPUT_FRAMES: 8 48 | SAMPLING_RATE: 50 49 | SAMPLING_UNIFORM: false 50 | TRAIN_JITTER_SCALES: [256, 320] 51 | TRAIN_CROP_SIZE: 224 # 84 # 224 52 | TEST_CROP_SIZE: 224 53 | TEST_SCALE: [256, 320] 54 | TEST_CENTER_CROP: true 55 | CLIP_INTERVAL: 8 56 | FPS: 12 57 | TARGET_FPS: 12 58 | 59 | LOG_PERIOD: 50 60 | SOLVER: 61 | BASE_LR: 0.00005 # 0.00002--> 67.4 62 | LR_POLICY: steps_with_relative_lrs 63 | STEPS_ITER: 400 64 | STEPS: [0, 4, 7] 65 | LRS: [1, 0.1, 0.01] 66 | MAX_EPOCH: 10 67 | MOMENTUM: 0.9 68 | WEIGHT_DECAY: 5e-5 # 3e-5 --> 67.4 69 | WARMUP_EPOCHS: 1 70 | WARMUP_START_LR: 0.00001 71 | OPTIM_METHOD: adam 72 | DAMPENING: 0.0 73 | NESTEROV: true 74 | # add bn, use same lr settings as server 75 | 76 | VIDEO: 77 | HEAD: 78 | NAME: CNN_HyRSM_1shot # or CNN_HyRSM_5shot 79 | BACKBONE_NAME: "resnet50" 80 | 81 | BACKBONE: 82 | META_ARCH: Identity 83 | 84 | NUM_GPUS: 4 85 | OUTPUT_DIR: output/HyRSM_Epic_1shot -------------------------------------------------------------------------------- /models/module_zoo/stems/r2plus1d_stem.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ R2Plus1D stem. """ 5 | 6 | import math 7 | import torch 8 | import torch.nn as nn 9 | 10 | from models.base.base_blocks import Base3DStem 11 | from models.base.base_blocks import STEM_REGISTRY 12 | 13 | @STEM_REGISTRY.register() 14 | class R2Plus1DStem(Base3DStem): 15 | """ 16 | R(2+1)D Stem. 17 | """ 18 | def __init__( 19 | self, 20 | cfg 21 | ): 22 | super(R2Plus1DStem, self).__init__(cfg) 23 | 24 | def _construct_block( 25 | self, 26 | cfg, 27 | dim_in, 28 | num_filters, 29 | kernel_sz, 30 | stride, 31 | bn_eps=1e-5, 32 | bn_mmt=0.1 33 | ): 34 | 35 | mid_dim = int( 36 | math.floor((kernel_sz[0] * kernel_sz[1] * kernel_sz[2] * dim_in * num_filters) / \ 37 | (kernel_sz[1] * kernel_sz[2] * dim_in + kernel_sz[0] * num_filters))) 38 | 39 | self.a1 = nn.Conv3d( 40 | in_channels = dim_in, 41 | out_channels = mid_dim, 42 | kernel_size = [1, kernel_sz[1], kernel_sz[2]], 43 | stride = [1, stride[1], stride[2]], 44 | padding = [0, kernel_sz[1]//2, kernel_sz[2]//2], 45 | bias = False 46 | ) 47 | self.a1_bn = nn.BatchNorm3d(mid_dim, eps=bn_eps, momentum=bn_mmt) 48 | self.a1_relu = nn.ReLU(inplace=True) 49 | 50 | self.a2 = nn.Conv3d( 51 | in_channels = mid_dim, 52 | out_channels = num_filters, 53 | kernel_size = [kernel_sz[0], 1, 1], 54 | stride = [stride[0], 1, 1], 55 | padding = [kernel_sz[0]//2, 0, 0], 56 | bias = False 57 | ) 58 | self.a2_bn = nn.BatchNorm3d(num_filters, eps=bn_eps, momentum=bn_mmt) 59 | self.a2_relu = nn.ReLU(inplace=True) 60 | 61 | def forward(self, x): 62 | x = self.a1(x) 63 | x = self.a1_bn(x) 64 | x = self.a1_relu(x) 65 | 66 | x = self.a2(x) 67 | x = self.a2_bn(x) 68 | x = self.a2_relu(x) 69 | return x -------------------------------------------------------------------------------- /configs/projects/epic-kitchen-tal/bmn_epic.yaml: -------------------------------------------------------------------------------- 1 | _BASE_RUN: ../../pool/run/training/localization.yaml 2 | _BASE_MODEL: ../../pool/backbone/localization-conv.yaml 3 | 4 | TRAIN: 5 | ENABLE: true 6 | BATCH_SIZE: 16 7 | DATASET: Epickitchen100Localization 8 | CHECKPOINT_FILE_PATH: # !!@2 9 | TEST: 10 | ENABLE: true 11 | BATCH_SIZE: 16 12 | DATASET: 
Epickitchen100Localization 13 | 14 | LOCALIZATION: 15 | ENABLE: true 16 | LOSS: Tem+PemReg+PemCls 17 | LOSS_WEIGHTS: [1,10,1,1] 18 | TEST_OUTPUT_DIR: ./output/ 19 | PROPS_DIR: prop_results 20 | RESULT_FILE: tal_detection_res 21 | CLASSIFIER_FILE: 22 | POST_PROCESS: 23 | # PROP_NUM_RATIO: 2 # duplicate key; the PROP_NUM_RATIO below takes effect 24 | THREAD: 32 25 | SOFT_NMS_ALPHA: 0.4 26 | SOFT_NMS_LOW_THRES: 0.25 27 | SOFT_NMS_HIGH_THRES: 0.9 28 | PROP_NUM_RATIO: 1.0 29 | SELECT_SCORE: 0.0 30 | SCORE_TYPE: 'cr' 31 | CLR_POWER: 1.2 32 | REG_POWER: 1.0 33 | IOU_POWER: 2.0 34 | ACTION_SCORE_POWER: 1.0 35 | VIDEO_SCORES_WEIGHT: 1.0 36 | 37 | DATA: 38 | DATA_ROOT_DIR: [/mnt/data-nas/qingzhiwu/dataset/epic-tal/features/features_s8_fps60_320_-1_train/] 39 | ANNO_DIR: /mnt/data-nas/qingzhiwu/dataset/epic-tal/annotations/ 40 | VIDEO_LENGTH_FILE: epic_videos_len.txt 41 | ANNO_NAME: "EPIC_100_validation.json" 42 | TEMPORAL_SCALE: 200 43 | DURATION_SCALE: 100 44 | NUM_INPUT_CHANNELS: 6912 45 | NORM_FEATURE: false 46 | LABELS_TYPE: bmn 47 | LOAD_TYPE: torch 48 | CLIPS_LIST_FILE: 5s_clips.txt 49 | TARGET_FPS: 60 50 | NUM_INPUT_FRAMES: 32 51 | SAMPLING_RATE: 2 52 | CLIP_INTERVAL: 8 53 | MULTI_LABEL: true 54 | CLASSIFIER_ROOT_DIR: /mnt/data-nas/qingzhiwu/dataset/epic-tal/features/cls_res_s8_fps60_320_-1_train/ 55 | LOAD_CLASSIFIER_RES: true 56 | 57 | OPTIMIZER: 58 | BASE_LR: 0.002 59 | ADJUST_LR: true 60 | LR_POLICY: cosine 61 | MAX_EPOCH: 10 62 | MOMENTUM: 0.9 63 | WEIGHT_DECAY: 1e-4 64 | WARMUP_EPOCHS: 1 65 | WARMUP_START_LR: 0.00001 66 | OPTIM_METHOD: adamw 67 | DAMPENING: 0.0 68 | NESTEROV: true 69 | 70 | VIDEO: 71 | HEAD: 72 | NAME: BaseBMN 73 | ACTIVATION: sigmoid 74 | DROPOUT_RATE: 0 75 | NUM_SAMPLE: 32 76 | NUM_SAMPLE_PERBIN: 3 77 | BOUNDARY_RATIO: 0.5 78 | USE_BMN_REGRESSION: false 79 | 80 | LOG_PERIOD: 50 81 | USE_MULTISEG_VAL_DIST: true -------------------------------------------------------------------------------- /utils/timer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Timer class. """ 5 | 6 | from time import perf_counter 7 | from typing import Optional 8 | 9 | 10 | class Timer: 11 | """ 12 | A timer which computes the time elapsed since the start/reset of the timer. 13 | """ 14 | 15 | def __init__(self) -> None: 16 | self.reset() 17 | 18 | def reset(self) -> None: 19 | """ 20 | Reset the timer. 21 | """ 22 | self._start = perf_counter() 23 | self._paused: Optional[float] = None 24 | self._total_paused = 0 25 | self._count_start = 1 26 | 27 | def pause(self) -> None: 28 | """ 29 | Pause the timer. 30 | """ 31 | if self._paused is not None: 32 | raise ValueError("Trying to pause a Timer that is already paused!") 33 | self._paused = perf_counter() 34 | 35 | def is_paused(self) -> bool: 36 | """ 37 | Returns: 38 | bool: whether the timer is currently paused 39 | """ 40 | return self._paused is not None 41 | 42 | def resume(self) -> None: 43 | """ 44 | Resume the timer. 45 | """ 46 | if self._paused is None: 47 | raise ValueError("Trying to resume a Timer that is not paused!") 48 | self._total_paused += perf_counter() - self._paused # pyre-ignore 49 | self._paused = None 50 | self._count_start += 1 51 | 52 | def seconds(self) -> float: 53 | """ 54 | Returns: 55 | (float): the total number of seconds since the start/reset of the 56 | timer, excluding the time when the timer is paused. 
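        Illustrative usage:
            timer = Timer()
            # ... timed work ...
            timer.pause()
            elapsed = timer.seconds()  # excludes any paused spans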
57 | """ 58 | if self._paused is not None: 59 | end_time: float = self._paused # type: ignore 60 | else: 61 | end_time = perf_counter() 62 | return end_time - self._start - self._total_paused 63 | 64 | def avg_seconds(self) -> float: 65 | """ 66 | Returns: 67 | (float): the average number of seconds between every start/reset and 68 | pause. 69 | """ 70 | return self.seconds() / self._count_start -------------------------------------------------------------------------------- /configs/pool/run/training/finetune.yaml: -------------------------------------------------------------------------------- 1 | PRETRAIN: 2 | ENABLE: false 3 | TRAIN: 4 | ENABLE: true 5 | DATASET: # !!@1 6 | BATCH_SIZE: 128 7 | LOG_FILE: training_log.log 8 | LOSS_FUNC: cross_entropy 9 | EVAL_PERIOD: 5 10 | NUM_FOLDS: 30 11 | AUTO_RESUME: true 12 | CHECKPOINT_PERIOD: 10 13 | CHECKPOINT_FILE_PATH: "" # !!@2 14 | CHECKPOINT_TYPE: pytorch 15 | CHECKPOINT_INFLATE: false 16 | FINE_TUNE: true 17 | ONLY_LINEAR: false 18 | TEST: 19 | ENABLE: true # !!@3 20 | DATASET: # !!@3 21 | BATCH_SIZE: 128 22 | NUM_SPATIAL_CROPS: 1 23 | SPATIAL_CROPS: cc 24 | NUM_ENSEMBLE_VIEWS: 1 25 | LOG_FILE: val.log 26 | CHECKPOINT_FILE_PATH: "" 27 | CHECKPOINT_TYPE: pytorch 28 | AUTOMATIC_MULTI_SCALE_TEST: true 29 | DATA: 30 | DATA_ROOT_DIR: 31 | ANNO_DIR: 32 | NUM_INPUT_FRAMES: 16 33 | NUM_INPUT_CHANNELS: 3 34 | SAMPLING_MODE: interval_based 35 | SAMPLING_RATE: 4 36 | TRAIN_JITTER_SCALES: [168, 224] 37 | TRAIN_CROP_SIZE: 112 38 | TEST_SCALE: 224 39 | TEST_CROP_SIZE: 112 40 | MEAN: [0.45, 0.45, 0.45] 41 | STD: [0.225, 0.225, 0.225] 42 | MULTI_LABEL: false 43 | ENSEMBLE_METHOD: sum 44 | FPS: 30 45 | TARGET_FPS: 30 46 | OPTIMIZER: 47 | BASE_LR: 0.002 48 | LR_POLICY: cosine 49 | MAX_EPOCH: 300 50 | MOMENTUM: 0.9 51 | WEIGHT_DECAY: 1e-3 52 | WARMUP_EPOCHS: 10 53 | WARMUP_START_LR: 0.0002 54 | OPTIM_METHOD: adam 55 | DAMPENING: 0.0 56 | NESTEROV: true 57 | BN: 58 | WEIGHT_DECAY: 0.0 59 | EPS: 1e-3 60 | DATA_LOADER: 61 | NUM_WORKERS: 4 62 | PIN_MEMORY: false 63 | ENABLE_MULTI_THREAD_DECODE: true 64 | NUM_GPUS: 8 65 | SHARD_ID: 0 66 | NUM_SHARDS: 1 67 | RANDOM_SEED: 0 68 | OUTPUT_DIR: 69 | OUTPUT_CFG_FILE: configuration.log 70 | LOG_PERIOD: 10 71 | DIST_BACKEND: nccl 72 | LOG_MODEL_INFO: true 73 | LOG_CONFIG_INFO: true 74 | AUGMENTATION: 75 | COLOR_AUG: true 76 | BRIGHTNESS: 0.5 77 | CONTRAST: 0.5 78 | SATURATION: 0.5 79 | HUE: 0.25 80 | GRAYSCALE: 0.3 81 | CONSISTENT: true 82 | SHUFFLE: true 83 | GRAY_FIRST: true 84 | IS_SPLIT: false 85 | RATIO: [0.857142857142857, 1.1666666666666667] 86 | USE_GPU: true 87 | PAI: false 88 | USE_MULTISEG_VAL_DIST: false 89 | DETECTION: 90 | ENABLE: false 91 | TENSORBOARD: 92 | ENABLE: false 93 | -------------------------------------------------------------------------------- /models/utils/model_ema.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/utils/model_ema.py 3 | # thanks for the nice implementation 4 | 5 | import torch 6 | import torch.nn as nn 7 | from copy import deepcopy 8 | 9 | class ModelEmaV2(nn.Module): 10 | """ Model Exponential Moving Average V2 11 | Keep a moving average of everything in the model state_dict (parameters and buffers). 12 | V2 of this module is simpler, it does not match params/buffers based on name but simply 13 | iterates in order. It works with torchscript (JIT of full model). 
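    Typical usage (sketch): construct ModelEmaV2(model) once the model is built,
    call .update(model) after every optimizer step, and evaluate with .module.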
14 | This is intended to allow functionality like 15 | https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage 16 | A smoothed version of the weights is necessary for some training schemes to perform well. 17 | E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use 18 | RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA 19 | smoothing of weights to match results. Pay attention to the decay constant you are using 20 | relative to your update count per epoch. 21 | To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but 22 | disable validation of the EMA weights. Validation will have to be done manually in a separate 23 | process, or after the training stops converging. 24 | This class is sensitive where it is initialized in the sequence of model init, 25 | GPU assignment and distributed training wrappers. 26 | """ 27 | def __init__(self, model, decay=0.9999, device=None): 28 | super(ModelEmaV2, self).__init__() 29 | # make a copy of the model for accumulating moving average of weights 30 | self.module = deepcopy(model) 31 | self.module.eval() 32 | self.decay = decay 33 | self.device = device # perform ema on different device from model if set 34 | if self.device is not None: 35 | self.module.to(device=device) 36 | 37 | def _update(self, model, update_fn): 38 | with torch.no_grad(): 39 | for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): 40 | if self.device is not None: 41 | model_v = model_v.to(device=self.device) 42 | ema_v.copy_(update_fn(ema_v, model_v)) 43 | 44 | def update(self, model): 45 | self._update(model, update_fn=lambda e, m: self.decay * e + (1. - self.decay) * m) 46 | 47 | def set(self, model): 48 | self._update(model, update_fn=lambda e, m: m) -------------------------------------------------------------------------------- /configs/pool/run/training/localization.yaml: -------------------------------------------------------------------------------- 1 | TASK_TYPE: localization 2 | LOCALIZATION: 3 | ENABLE: true 4 | LOSS: Tem+PemReg+PemCls 5 | LOSS_WEIGHTS: [1,10,1] 6 | POS_CLS_THRES: 0.9 7 | POS_REG_THRES: 0.7 8 | NEG_REG_THRES: 0.3 9 | 10 | TEST_OUTPUT_DIR: ./output/ 11 | PROPS_DIR: prop_results 12 | PROPS_REGRESSION_LOSS: smoothl1 13 | RESULT_FILE: localization_detection_res 14 | CLASSIFIER_FILE: "" 15 | POST_PROCESS: 16 | THREAD: 32 17 | SOFT_NMS_ALPHA: 0.4 18 | SOFT_NMS_LOW_THRES: 0.0 19 | SOFT_NMS_HIGH_THRES: 0.0 20 | PROP_NUM: 100 21 | SELECT_SCORE: 0.0001 22 | SCORE_TYPE: 'cr' 23 | CLR_POWER: 1.2 24 | REG_POWER: 1.2 25 | IOU_POWER: 2.0 26 | TCA_POWER: 1.0 27 | ACTION_SCORE_POWER: 1.0 28 | VIDEO_SCORES_WEIGHT: 1.0 29 | 30 | TRAIN: 31 | ENABLE: true 32 | DATASET: Epickitchen100Localization # !!@1 33 | BATCH_SIZE: 64 34 | LOG_FILE: training_log.log 35 | EVAL_PERIOD: 1 36 | NUM_FOLDS: 1 37 | AUTO_RESUME: true 38 | CHECKPOINT_PERIOD: 1 39 | CHECKPOINT_FILE_PATH: "" # !!@2 40 | CHECKPOINT_TYPE: pytorch 41 | CHECKPOINT_INFLATE: false 42 | FINE_TUNE: false 43 | LR_REDUCE: false 44 | TEST: 45 | ENABLE: false # !!@3 46 | OUTPUT_TEST: false 47 | FORCE_FORWARD: false 48 | DATASET: Epickitchen100Localization # !!@3 49 | BATCH_SIZE: 128 50 | LOG_FILE: val.log 51 | TEST_SET: val 52 | CHECKPOINT_FILE_PATH: "" 53 | SAVE_RESULTS_PATH: "preds.log" 54 | CHECKPOINT_TYPE: pytorch 55 | AUTOMATIC_MULTI_SCALE_TEST: false 56 | TEST_CHECKPOINT: [7,8,9,10] 57 | 58 | DATA: 59 | DATA_ROOT_DIR: 60 | ANNO_DIR: 61 | TEMPORAL_SCALE: 
200 62 | DURATION_SCALE: -1 63 | TEMPORAL_MODE: resize 64 | NUM_INPUT_CHANNELS: 2304 65 | TEMPORAL_INTERVAL: 0.53333333 66 | NORM_FEATURE: true 67 | ANNO_NAME: "" 68 | LABELS_TYPE: bmn 69 | 70 | SOLVER: 71 | BASE_LR: 0.001 72 | ADJUST_LR: true 73 | LR_POLICY: cosine 74 | MAX_EPOCH: 10 75 | MOMENTUM: 0.9 76 | WEIGHT_DECAY: 1e-4 77 | WARMUP_EPOCHS: 1 78 | WARMUP_START_LR: 0.0001 79 | OPTIM_METHOD: adam 80 | DAMPENING: 0.0 81 | NESTEROV: true 82 | BN: 83 | USE_BN: false 84 | WEIGHT_DECAY: 0.0 85 | DATA_LOADER: 86 | NUM_WORKERS: 8 87 | PIN_MEMORY: true 88 | 89 | NUM_GPUS: 8 90 | SHARD_ID: 0 91 | NUM_SHARDS: 1 92 | RANDOM_SEED: 0 93 | OUTPUT_DIR: output/test 94 | OUTPUT_CFG_FILE: configuration.log 95 | LOG_PERIOD: 10 96 | DIST_BACKEND: nccl 97 | DEBUG_MODE: false 98 | LOG_MODEL_INFO: true 99 | LOG_CONFIG_INFO: true 100 | OSS: 101 | ENABLE: false 102 | PAI: true 103 | -------------------------------------------------------------------------------- /utils/logging.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ 5 | Logging. 6 | Modified from https://github.com/facebookresearch/SlowFast/blob/master/slowfast/utils/logging.py. 7 | Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 8 | """ 9 | 10 | import builtins 11 | import decimal 12 | import functools 13 | import logging 14 | import os 15 | import sys 16 | import simplejson 17 | 18 | import utils.distributed as du 19 | 20 | 21 | def _suppress_print(): 22 | """ 23 | Suppresses printing from the current process. 24 | """ 25 | 26 | def print_pass(*objects, sep=" ", end="\n", file=sys.stdout, flush=False): 27 | pass 28 | 29 | builtins.print = print_pass 30 | 31 | 32 | def setup_logging(cfg, log_file): 33 | """ 34 | Sets up the logging for multiple processes. Only enable the logging for the 35 | master process, and suppress logging for the non-master processes. 36 | """ 37 | if du.is_master_proc(): 38 | # Enable logging for the master process. 39 | logging.root.handlers = [] 40 | else: 41 | # Suppress logging for non-master processes. 42 | _suppress_print() 43 | 44 | logger = logging.getLogger() 45 | logger.setLevel(logging.INFO) 46 | logger.propagate = False 47 | plain_formatter = logging.Formatter( 48 | "[%(asctime)s][%(levelname)s] %(name)s: %(lineno)4d: %(message)s", 49 | datefmt="%m/%d %H:%M:%S", 50 | ) 51 | 52 | if du.is_master_proc(): 53 | ch = logging.StreamHandler(stream=sys.stdout) 54 | ch.setLevel(logging.DEBUG) 55 | ch.setFormatter(plain_formatter) 56 | logger.addHandler(ch) 57 | 58 | if log_file is not None and du.is_master_proc(du.get_world_size()): 59 | filename = os.path.join(cfg.OUTPUT_DIR, log_file) 60 | fh = logging.FileHandler(filename) 61 | fh.setLevel(logging.DEBUG) 62 | fh.setFormatter(plain_formatter) 63 | logger.addHandler(fh) 64 | 65 | 66 | def get_logger(name): 67 | """ 68 | Retrieve the logger with the specified name or, if name is None, return a 69 | logger which is the root logger of the hierarchy. 70 | Args: 71 | name (string): name of the logger. 72 | """ 73 | return logging.getLogger(name) 74 | 75 | 76 | def log_json_stats(stats): 77 | """ 78 | Logs json stats. 79 | Args: 80 | stats (dict): a dictionary of statistical information to log. 
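    Example (illustrative):
        log_json_stats({"epoch": 1, "loss": 0.4213})
        # logs: {"epoch": 1, "loss": 0.421300}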
81 | """ 82 | stats = { 83 | k: decimal.Decimal("{:.6f}".format(v)) if isinstance(v, float) else v 84 | for k, v in stats.items() 85 | } 86 | json_stats = simplejson.dumps(stats, sort_keys=True, use_decimal=True) 87 | logger = get_logger(__name__) 88 | logger.info("{:s}".format(json_stats)) 89 | -------------------------------------------------------------------------------- /configs/pool/run/training/mosi.yaml: -------------------------------------------------------------------------------- 1 | PRETRAIN: 2 | ENABLE: true 3 | GENERATOR: MoSIGenerator 4 | LOSS: MoSIJoint 5 | LOSS_WEIGHTS: [1] 6 | DISTANCE_JITTER: [1, 1] 7 | SCALE_JITTER: false 8 | NUM_FRAMES: 16 9 | DATA_MODE: xy 10 | DECOUPLE: true 11 | FRAME_SIZE_STANDARDIZE_ENABLE: true 12 | STANDARD_SIZE: 320 13 | LABEL_MODE: joint # separate / joint 14 | ZERO_OUT: false 15 | STATIC_MASK: true 16 | ASPECT_RATIO: [1, 1] 17 | MASK_SIZE_RATIO: [0.3, 0.5] 18 | NUM_CLIPS_PER_VIDEO: 1 19 | TRAIN: 20 | ENABLE: true 21 | DATASET: # !!@1 22 | BATCH_SIZE: 10 23 | LOG_FILE: training_log.log 24 | EVAL_PERIOD: 5 25 | NUM_FOLDS: 1 26 | AUTO_RESUME: true 27 | CHECKPOINT_PERIOD: 10 28 | CHECKPOINT_FILE_PATH: "" # !!@2 29 | CHECKPOINT_TYPE: pytorch 30 | CHECKPOINT_INFLATE: false 31 | FINE_TUNE: false 32 | ONLY_LINEAR: false 33 | TEST: 34 | ENABLE: false # !!@3 35 | DATASET: # !!@3 36 | BATCH_SIZE: 10 37 | NUM_SPATIAL_CROPS: 1 38 | SPATIAL_CROPS: cc 39 | NUM_ENSEMBLE_VIEWS: 1 40 | LOG_FILE: val.log 41 | CHECKPOINT_FILE_PATH: "" 42 | CHECKPOINT_TYPE: pytorch 43 | AUTOMATIC_MULTI_SCALE_TEST: false 44 | DATA: 45 | DATA_ROOT_DIR: 46 | ANNO_DIR: 47 | NUM_INPUT_FRAMES: 1 48 | NUM_INPUT_CHANNELS: 3 49 | SAMPLING_MODE: interval_based 50 | SAMPLING_RATE: 4 51 | TRAIN_JITTER_SCALES: [168, 224] 52 | TRAIN_CROP_SIZE: 112 53 | TEST_SCALE: 224 54 | TEST_CROP_SIZE: 112 55 | MEAN: [0.45, 0.45, 0.45] 56 | STD: [0.225, 0.225, 0.225] 57 | MULTI_LABEL: false 58 | ENSEMBLE_METHOD: sum 59 | FPS: 30 60 | TARGET_FPS: 30 61 | OPTIMIZER: 62 | BASE_LR: 0.001 63 | LR_POLICY: cosine 64 | MAX_EPOCH: 100 65 | MOMENTUM: 0.9 66 | WEIGHT_DECAY: 1e-4 67 | WARMUP_EPOCHS: 10 68 | WARMUP_START_LR: 0.0001 69 | OPTIM_METHOD: adam 70 | DAMPENING: 0.0 71 | NESTEROV: true 72 | BN: 73 | WEIGHT_DECAY: 0.0 74 | EPS: 1e-3 75 | DATA_LOADER: 76 | NUM_WORKERS: 4 77 | PIN_MEMORY: false 78 | ENABLE_MULTI_THREAD_DECODE: true 79 | NUM_GPUS: 8 80 | SHARD_ID: 0 81 | NUM_SHARDS: 1 82 | RANDOM_SEED: 0 83 | OUTPUT_DIR: 84 | OUTPUT_CFG_FILE: configuration.log 85 | LOG_PERIOD: 10 86 | DIST_BACKEND: nccl 87 | LOG_MODEL_INFO: true 88 | LOG_CONFIG_INFO: true 89 | AUGMENTATION: 90 | COLOR_AUG: true 91 | BRIGHTNESS: 0.5 92 | CONTRAST: 0.5 93 | SATURATION: 0.5 94 | HUE: 0.25 95 | GRAYSCALE: 0.3 96 | CONSISTENT: false 97 | SHUFFLE: true 98 | GRAY_FIRST: true 99 | RATIO: [0.857142857142857, 1.1666666666666667] 100 | USE_GPU: true 101 | PAI: false 102 | 103 | MODEL: 104 | NAME: MoSINet 105 | VIDEO: 106 | HEAD: 107 | NAME: MoSIHeadJoint 108 | NUM_CLASSES: 5 109 | DROPOUT_RATE: 0.5 -------------------------------------------------------------------------------- /models/base/builder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Builder for video models. 
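build_model(cfg) returns a (model, model_ema) pair; model_ema is None unless
cfg.MODEL.EMA.ENABLE is set.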
""" 5 | 6 | import sys 7 | import torch 8 | import torch.nn as nn 9 | 10 | import traceback 11 | 12 | import utils.logging as logging 13 | 14 | from models.base.models import BaseVideoModel, MODEL_REGISTRY 15 | from models.utils.model_ema import ModelEmaV2 16 | 17 | logger = logging.get_logger(__name__) 18 | 19 | def build_model(cfg, gpu_id=None): 20 | """ 21 | Builds the video model. 22 | Args: 23 | cfg (Config): global config object that provides specifics to construct the model. 24 | gpu_id (Optional[int]): specify the gpu index to build model. 25 | Returns: 26 | model: constructed model 27 | model_ema: copied model for ema 28 | """ 29 | # Construct the model 30 | if MODEL_REGISTRY.get(cfg.MODEL.NAME) == None: 31 | # attempt to find standard models 32 | model = BaseVideoModel(cfg) 33 | else: 34 | # if the model is explicitly defined, 35 | # it is directly constructed from the model pool 36 | model = MODEL_REGISTRY.get(cfg.MODEL.NAME)(cfg) 37 | 38 | if torch.cuda.is_available(): 39 | assert ( 40 | cfg.NUM_GPUS <= torch.cuda.device_count() 41 | ), "Cannot use more GPU devices than available" 42 | else: 43 | assert ( 44 | cfg.NUM_GPUS == 0 45 | ), "Cuda is not available. Please set `NUM_GPUS: 0 for running on CPUs." 46 | 47 | if cfg.NUM_GPUS: 48 | if gpu_id is None: 49 | # Determine the GPU used by the current process 50 | cur_device = torch.cuda.current_device() 51 | else: 52 | cur_device = gpu_id 53 | model = model.cuda(device=cur_device) 54 | 55 | model_ema = None 56 | if cfg.MODEL.EMA.ENABLE: 57 | model_ema = ModelEmaV2(model, decay=cfg.MODEL.EMA.DECAY) 58 | 59 | try: 60 | # convert batchnorm to be synchronized across 61 | # different GPUs if needed 62 | sync_bn = cfg.BN.SYNC_BN 63 | if sync_bn == True and cfg.NUM_GPUS * cfg.NUM_SHARDS > 1: 64 | model = nn.SyncBatchNorm.convert_sync_batchnorm(model) 65 | except: 66 | sync_bn = None 67 | 68 | # Use multi-process data parallel model in the multi-gpu setting 69 | if cfg.NUM_GPUS*cfg.NUM_SHARDS > 1: 70 | # Make model replica operate on the current device 71 | if cfg.PAI: 72 | # Support distributed training on the cluster 73 | model = torch.nn.parallel.DistributedDataParallel( 74 | module=model 75 | ) 76 | else: 77 | model = torch.nn.parallel.DistributedDataParallel( 78 | module=model, device_ids=[cur_device], output_device=cur_device 79 | ) 80 | 81 | return model, model_ema -------------------------------------------------------------------------------- /models/module_zoo/branches/csn_branch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ CSN Branch. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from models.base.base_blocks import BaseBranch, Base3DStem, BaseHead 10 | from models.base.base_blocks import BRANCH_REGISTRY 11 | 12 | @BRANCH_REGISTRY.register() 13 | class CSNBranch(BaseBranch): 14 | """ 15 | The ir-CSN branch. 16 | 17 | See Du Tran et al. 18 | Video Classification with Channel-Separated Convolutional Networks. 19 | """ 20 | def __init__(self, cfg, block_idx): 21 | """ 22 | Args: 23 | cfg (Config): global config object. 24 | block_idx (list): list of [stage_id, block_id], both starting from 0. 
25 | """ 26 | super(CSNBranch, self).__init__(cfg, block_idx) 27 | 28 | def _construct_bottleneck(self): 29 | self.a = nn.Conv3d( 30 | in_channels = self.dim_in, 31 | out_channels = self.num_filters//self.expansion_ratio, 32 | kernel_size = 1, 33 | stride = 1, 34 | padding = 0, 35 | bias = False 36 | ) 37 | self.a_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 38 | self.a_relu = nn.ReLU(inplace=True) 39 | 40 | self.b = nn.Conv3d( 41 | in_channels = self.num_filters//self.expansion_ratio, 42 | out_channels = self.num_filters//self.expansion_ratio, 43 | kernel_size = self.kernel_size, 44 | stride = self.stride, 45 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 46 | bias = False, 47 | groups = self.num_filters//self.expansion_ratio, 48 | ) 49 | self.b_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 50 | self.b_relu = nn.ReLU(inplace=True) 51 | 52 | self.c = nn.Conv3d( 53 | in_channels = self.num_filters//self.expansion_ratio, 54 | out_channels = self.num_filters, 55 | kernel_size = 1, 56 | stride = 1, 57 | padding = 0, 58 | bias = False 59 | ) 60 | self.c_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 61 | 62 | def forward(self, x): 63 | if self.transformation == 'bottleneck': 64 | x = self.a(x) 65 | x = self.a_bn(x) 66 | x = self.a_relu(x) 67 | 68 | x = self.b(x) 69 | x = self.b_bn(x) 70 | x = self.b_relu(x) 71 | 72 | x = self.c(x) 73 | x = self.c_bn(x) 74 | return x 75 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: hyrsm 2 | channels: 3 | - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud//pytorch 4 | - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - blas=1.0=mkl 9 | - ca-certificates=2020.6.20=hecda079_0 10 | - certifi=2020.6.20=py36h9f0ad1d_0 11 | - cudatoolkit=10.1.243=h6bb024c_0 12 | - freetype=2.10.2=he06d7ca_0 13 | - intel-openmp=2020.1=217 14 | - jpeg=9d=h516909a_0 15 | - lcms2=2.11=hbd6801e_0 16 | - ld_impl_linux-64=2.33.1=h53a641e_7 17 | - libedit=3.1.20191231=h14c3975_1 18 | - libffi=3.3=he6710b0_2 19 | - libgcc-ng=9.1.0=hdf63c60_0 20 | - libpng=1.6.37=hed695b0_2 21 | - libstdcxx-ng=9.1.0=hdf63c60_0 22 | - libtiff=4.1.0=hc7e4089_6 23 | - libwebp-base=1.1.0=h516909a_3 24 | - lz4-c=1.9.2=he1b5a44_3 25 | - mkl=2020.1=217 26 | - mkl-service=2.3.0=py36he904b0f_0 27 | - mkl_fft=1.1.0=py36h23d657b_0 28 | - mkl_random=1.1.1=py36h830a2c2_0 29 | - ncurses=6.2=he6710b0_1 30 | - ninja=1.10.1=hdb11119_0 31 | - numpy=1.19.1=py36hbc911f0_0 32 | - numpy-base=1.19.1=py36hfa32c7d_0 33 | - olefile=0.46=py_0 34 | - openssl=1.1.1g=h516909a_1 35 | - pillow=7.2.0=py36h8328e55_1 36 | - pip=20.2.2=py36_0 37 | - python=3.6.10=h7579374_2 38 | - python_abi=3.6=1_cp36m 39 | - pytorch=1.5.1=py3.6_cuda10.1.243_cudnn7.6.3_0 40 | - readline=8.0=h7b6447c_0 41 | - setuptools=49.6.0=py36_0 42 | - six=1.15.0=pyh9f0ad1d_0 43 | - sqlite=3.33.0=h62c20be_0 44 | - tk=8.6.10=hbc83047_0 45 | - torchvision=0.6.1=py36_cu101 46 | - wheel=0.34.2=py36_0 47 | - xz=5.2.5=h7b6447c_0 48 | - zlib=1.2.11=h7b6447c_3 49 | - zstd=1.4.5=h6597ccf_2 50 | - pip: 51 | - absl-py==0.10.0 52 | - aliyun-python-sdk-core==2.13.36 53 | - aliyun-python-sdk-kms==2.15.0 54 | - av==8.0.2 55 | - blessings==1.7 56 | - cachetools==4.1.1 57 | - cffi==1.15.0 58 
| - chardet==3.0.4 59 | - crcmod==1.7 60 | - cryptography==37.0.2 61 | - decord==0.4.0 62 | - einops==0.4.1 63 | - google-auth==1.20.1 64 | - google-auth-oauthlib==0.4.1 65 | - gpustat==0.6.0 66 | - grpcio==1.31.0 67 | - idna==2.10 68 | - importlib-metadata==4.8.3 69 | - importlib-resources==5.4.0 70 | - jmespath==0.10.0 71 | - joblib==1.1.0 72 | - markdown==3.2.2 73 | - nvidia-ml-py3==7.352.0 74 | - oauthlib==3.1.0 75 | - opencv-python==4.4.0.42 76 | - oss2==2.15.0 77 | - pandas==1.1.5 78 | - protobuf==3.13.0 79 | - psutil==5.7.2 80 | - pyasn1==0.4.8 81 | - pyasn1-modules==0.2.8 82 | - pycparser==2.21 83 | - pycryptodome==3.14.1 84 | - python-dateutil==2.8.2 85 | - pytz==2022.1 86 | - pyyaml==5.3.1 87 | - requests==2.24.0 88 | - requests-oauthlib==1.3.0 89 | - rsa==4.6 90 | - simplejson==3.17.2 91 | - tensorboard==2.3.0 92 | - tensorboard-plugin-wit==1.7.0 93 | - tqdm==4.64.0 94 | - typing-extensions==4.1.1 95 | - urllib3==1.25.10 96 | - werkzeug==1.0.1 97 | - zipp==3.6.0 98 | -------------------------------------------------------------------------------- /models/utils/lr_policy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # From https://github.com/facebookresearch/SlowFast/blob/master/slowfast/utils/lr_policy.py 4 | 5 | """Learning rate policy.""" 6 | 7 | import math 8 | 9 | 10 | def get_lr_at_epoch(cfg, cur_epoch): 11 | """ 12 | Retrieve the learning rate of the current epoch with the option to perform 13 | warm up in the beginning of the training stage. 14 | Args: 15 | cfg (Config): global config object. 16 | cur_epoch (float): the number of epoch of the current training stage. 17 | """ 18 | lr = get_lr_func(cfg.SOLVER.LR_POLICY)(cfg, cur_epoch) 19 | # Perform warm up. 20 | if cur_epoch < cfg.SOLVER.WARMUP_EPOCHS: 21 | lr_start = cfg.SOLVER.WARMUP_START_LR 22 | lr_end = get_lr_func(cfg.SOLVER.LR_POLICY)( 23 | cfg, cfg.SOLVER.WARMUP_EPOCHS 24 | ) 25 | alpha = (lr_end - lr_start) / cfg.SOLVER.WARMUP_EPOCHS 26 | lr = cur_epoch * alpha + lr_start 27 | return lr 28 | 29 | 30 | def lr_func_cosine(cfg, cur_epoch): 31 | """ 32 | Retrieve the learning rate to specified values at specified epoch with the 33 | cosine learning rate schedule. Details can be found in: 34 | Ilya Loshchilov, and Frank Hutter 35 | SGDR: Stochastic Gradient Descent With Warm Restarts. 36 | Args: 37 | cfg (Config): global config object. 38 | cur_epoch (float): the number of epoch of the current training stage. 39 | """ 40 | return ( 41 | cfg.SOLVER.BASE_LR 42 | * (math.cos(math.pi * cur_epoch / cfg.SOLVER.MAX_EPOCH) + 1.0) 43 | * 0.5 44 | ) 45 | 46 | 47 | def lr_func_steps_with_relative_lrs(cfg, cur_epoch): 48 | """ 49 | Retrieve the learning rate to specified values at specified epoch with the 50 | steps with relative learning rate schedule. 51 | Args: 52 | cfg (Config): global config object. 53 | cur_epoch (float): the number of epoch of the current training stage. 54 | """ 55 | ind = get_step_index(cfg, cur_epoch) 56 | return cfg.SOLVER.LRS[ind] * cfg.SOLVER.BASE_LR 57 | 58 | 59 | def get_step_index(cfg, cur_epoch): 60 | """ 61 | Retrieves the lr step index for the given epoch. 62 | Args: 63 | cfg (Config): global config object. 64 | cur_epoch (float): the number of epoch of the current training stage. 
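        Example: with STEPS [0, 4, 6, 9] and MAX_EPOCH 10, cur_epoch 5 scans the
        padded list [0, 4, 6, 9, 10], stops at 6, and returns index 1, so
        lr_func_steps_with_relative_lrs yields LRS[1] * BASE_LR.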
65 | """ 66 | steps = cfg.SOLVER.STEPS + [cfg.SOLVER.MAX_EPOCH] 67 | for ind, step in enumerate(steps): # NoQA 68 | if cur_epoch < step: 69 | break 70 | return ind - 1 71 | 72 | 73 | def get_lr_func(lr_policy): 74 | """ 75 | Given the configs, retrieve the specified lr policy function. 76 | Args: 77 | lr_policy (string): the learning rate policy to use for the job. 78 | """ 79 | policy = "lr_func_" + lr_policy 80 | if policy not in globals(): 81 | raise NotImplementedError("Unknown LR policy: {}".format(lr_policy)) 82 | else: 83 | return globals()[policy] 84 | -------------------------------------------------------------------------------- /models/utils/params.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Params. """ 5 | 6 | def update_3d_conv_params(cfg, conv, idx): 7 | """ 8 | Automatically decodes parameters for 3D convolution blocks according to the config and its index in the model. 9 | Args: 10 | cfg (Config): Config object that contains model parameters such as channel dimensions, whether to downsampling or not, etc. 11 | conv (BaseBranch): Branch whose parameters needs to be specified. 12 | idx (list): List containing the index of the current block. ([stage_id, block_id]) 13 | """ 14 | # extract current block location 15 | stage_id, block_id = idx 16 | conv.stage_id = stage_id 17 | conv.block_id = block_id 18 | 19 | # extract basic info 20 | if block_id == 0: 21 | conv.dim_in = cfg.VIDEO.BACKBONE.NUM_FILTERS[stage_id-1] 22 | if hasattr(cfg.VIDEO.BACKBONE, "ADD_FUSION_CHANNEL") and cfg.VIDEO.BACKBONE.ADD_FUSION_CHANNEL: 23 | conv.dim_in = conv.dim_in * cfg.VIDEO.BACKBONE.SLOWFAST.CONV_CHANNEL_RATIO // cfg.VIDEO.BACKBONE.SLOWFAST.BETA + conv.dim_in 24 | conv.downsampling = cfg.VIDEO.BACKBONE.DOWNSAMPLING[stage_id] 25 | conv.downsampling_temporal = cfg.VIDEO.BACKBONE.DOWNSAMPLING_TEMPORAL[stage_id] 26 | else: 27 | conv.downsampling = False 28 | conv.dim_in = cfg.VIDEO.BACKBONE.NUM_FILTERS[stage_id] 29 | conv.num_filters = cfg.VIDEO.BACKBONE.NUM_FILTERS[stage_id] 30 | conv.bn_mmt = cfg.BN.MOMENTUM 31 | conv.bn_eps = cfg.BN.EPS 32 | conv.kernel_size = cfg.VIDEO.BACKBONE.KERNEL_SIZE[stage_id] 33 | conv.expansion_ratio = cfg.VIDEO.BACKBONE.EXPANSION_RATIO if hasattr(cfg.VIDEO.BACKBONE, "EXPANSION_RATIO") else None 34 | 35 | # configure downsampling 36 | if conv.downsampling: 37 | if conv.downsampling_temporal: 38 | conv.stride = [2, 2, 2] 39 | else: 40 | conv.stride = [1, 2, 2] 41 | else: 42 | conv.stride = [1, 1, 1] 43 | 44 | # define transformation 45 | if isinstance(cfg.VIDEO.BACKBONE.DEPTH, str): 46 | conv.transformation = 'bottleneck' 47 | else: 48 | if cfg.VIDEO.BACKBONE.DEPTH <= 34: 49 | conv.transformation = 'simple_block' 50 | else: 51 | conv.transformation = 'bottleneck' 52 | 53 | # calculate the input size 54 | num_downsampling_spatial = sum( 55 | cfg.VIDEO.BACKBONE.DOWNSAMPLING[:stage_id+(block_id>0)] 56 | ) 57 | if 'DownSample' in cfg.VIDEO.BACKBONE.STEM.NAME: 58 | num_downsampling_spatial += 1 59 | num_downsampling_temporal = sum( 60 | cfg.VIDEO.BACKBONE.DOWNSAMPLING_TEMPORAL[:stage_id+(block_id>0)] 61 | ) 62 | conv.h = cfg.DATA.TRAIN_CROP_SIZE // 2**num_downsampling_spatial \ 63 | + (cfg.DATA.TRAIN_CROP_SIZE//2**(num_downsampling_spatial-1))%2 64 | conv.w = conv.h 65 | conv.t = cfg.DATA.NUM_INPUT_FRAMES // 2**num_downsampling_temporal -------------------------------------------------------------------------------- /utils/launcher.py: 
-------------------------------------------------------------------------------- /utils/launcher.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | """ Task launcher. """
4 | 
5 | import os
6 | import torch
7 | from utils.misc import get_num_gpus
8 | 
9 | def launch_task(cfg, init_method, func):
10 |     """
11 |     Launches the task "func" on one or multiple devices.
12 |     Args:
13 |         cfg (Config): global config object.
14 |         init_method (str): initialization method to launch the job with multiple
15 |             devices.
16 |         func (function): task to run.
17 |     """
18 |     torch.cuda.empty_cache()
19 |     if get_num_gpus(cfg) > 1:
20 |         if cfg.PAI:
21 |             # if using the PAI cluster, get info from the environment
22 |             cfg.SHARD_ID = int(os.environ['RANK'])
23 |             if "VISIBLE_DEVICE_LIST" in os.environ:
24 |                 cfg.NUM_GPUS = len(os.environ["VISIBLE_DEVICE_LIST"].split(","))
25 |             else:
26 |                 cfg.NUM_GPUS = torch.cuda.device_count()
27 |             cfg.NUM_SHARDS = int(os.environ['WORLD_SIZE'])
28 | 
29 |         torch.multiprocessing.spawn(
30 |             run,
31 |             nprocs=cfg.NUM_GPUS,
32 |             args=(func, init_method, cfg),
33 |             daemon=False,
34 |         )
35 |     else:
36 |         func(cfg=cfg)
37 | 
38 | def run(
39 |     local_rank, func, init_method, cfg
40 | ):
41 |     """
42 |     Runs a function from a child process.
43 |     Args:
44 |         local_rank (int): rank of the current process on the current machine.
45 |         func (function): function to execute on each of the processes.
46 |         init_method (string): method to initialize the distributed training.
47 |         cfg (Config): global config object.
48 |     """
49 | 
50 |     num_proc = cfg.NUM_GPUS  # number of processes (GPUs) per machine
51 |     shard_id = cfg.SHARD_ID
52 |     num_shards = cfg.NUM_SHARDS  # number of machines
53 |     backend = cfg.DIST_BACKEND  # distributed backend ('nccl', 'gloo' or 'mpi')
54 | 
55 |     world_size = num_proc * num_shards
56 |     rank = shard_id * num_proc + local_rank
57 |     cfg.LOCAL_RANK = rank
58 | 
59 |     # dump machine info
60 |     print("num_proc (NUM_GPU): {}".format(num_proc))
61 |     print("shard_id (os.environ['RANK']): {}".format(shard_id))
62 |     print("num_shards (os.environ['WORLD_SIZE']): {}".format(num_shards))
63 |     print("rank: {}".format(rank))
64 |     print("local_rank (GPU_ID): {}".format(local_rank))
65 | 
66 |     try:
67 |         if not cfg.PAI:
68 |             torch.distributed.init_process_group(
69 |                 backend=backend,
70 |                 init_method=init_method,
71 |                 world_size=world_size,
72 |                 rank=rank,
73 |             )
74 |         else:
75 |             torch.distributed.init_process_group(
76 |                 backend=backend,
77 |                 world_size=world_size,
78 |                 rank=rank,
79 |             )
80 |     except Exception as e:
81 |         raise e
82 | 
83 |     if "VISIBLE_DEVICE_LIST" in os.environ:
84 |         torch.cuda.set_device(int(os.environ["VISIBLE_DEVICE_LIST"]))
85 |     else:
86 |         torch.cuda.set_device(f'cuda:{local_rank}')
87 |         os.environ["CUDA_VISIBLE_DEVICES"] = str(local_rank)  # set in this process so spawned workers inherit it; an os.system call would only affect a subshell
88 |     func(cfg)
89 | 
-------------------------------------------------------------------------------- /models/base/models.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (C) Alibaba Group Holding Limited.
3 | 
4 | import torch
5 | import torch.nn as nn
6 | from utils.registry import Registry
7 | from models.base.backbone import BACKBONE_REGISTRY
8 | from models.base.base_blocks import HEAD_REGISTRY
9 | 
10 | MODEL_REGISTRY = Registry("Model")
11 | 
12 | class BaseVideoModel(nn.Module):
13 |     """
14 |     Standard video model.
15 |     The model is divided into the backbone and the head, where the backbone
16 |     extracts features and the head performs classification.
17 | 
18 |     The backbones can be defined in models/base/backbone.py or anywhere else,
19 |     as long as the backbone is registered by the BACKBONE_REGISTRY.
20 |     The heads can be defined in models/module_zoo/heads/ or anywhere else,
21 |     as long as the head is registered by the HEAD_REGISTRY.
22 | 
23 |     The registries automatically find the registered modules and construct
24 |     the base video model.
25 |     """
26 |     def __init__(self, cfg):
27 |         """
28 |         Args:
29 |             cfg (Config): global config object.
30 |         """
31 |         super(BaseVideoModel, self).__init__()
32 |         self.cfg = cfg
33 | 
34 |         # the backbone is created according to meta-architectures
35 |         # defined in models/base/backbone.py
36 |         self.backbone = BACKBONE_REGISTRY.get(cfg.VIDEO.BACKBONE.META_ARCH)(cfg=cfg)
37 | 
38 |         # the head is created according to the heads
39 |         # defined in models/module_zoo/heads
40 |         self.head = HEAD_REGISTRY.get(cfg.VIDEO.HEAD.NAME)(cfg=cfg)
41 | 
42 |     def forward(self, x):
43 |         x = self.backbone(x)
44 |         x = self.head(x)
45 |         return x
46 | 
47 |     def train(self, mode=True):
48 |         r"""Sets the module in training mode.
49 | 
50 |         This has any effect only on certain modules. See documentations of
51 |         particular modules for details of their behaviors in training/evaluation
52 |         mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`,
53 |         etc.
54 | 
55 |         Args:
56 |             mode (bool): whether to set training mode (``True``) or evaluation
57 |                 mode (``False``). Default: ``True``.
58 | 
59 |         Returns:
60 |             Module: self
61 |         """
62 |         self.training = mode
63 |         super(BaseVideoModel, self).train(mode)
64 |         for module in self.modules():
65 |             if isinstance(module, (nn.BatchNorm2d, nn.BatchNorm3d, nn.LayerNorm)) and self.cfg.BN.FREEZE:
66 |                 module.train(False)
67 |         return self
68 | 
69 | @MODEL_REGISTRY.register()
70 | class MoSINet(BaseVideoModel):
71 |     def __init__(self, cfg):
72 |         super(MoSINet, self).__init__(cfg)
73 | 
74 |     def forward(self, x):
75 |         if isinstance(x, dict):
76 |             x_data = x["video"]
77 |         else:
78 |             x_data = x
79 |         b, n, c, t, h, w = x_data.shape
80 |         x_data = x_data.reshape(b*n, c, t, h, w)
81 |         res, logits = super(MoSINet, self).forward(x_data)
82 |         pred = {}
83 |         if isinstance(res, dict):
84 |             for k, v in res.items():
85 |                 pred[k] = v
86 |         else:
87 |             pred["move_joint"] = res
88 |         return pred, logits
-------------------------------------------------------------------------------- /GUIDELINES.md: --------------------------------------------------------------------------------
1 | # Guidelines for pytorch-video-understanding
2 | 
3 | ## Installation
4 | 
5 | Requirements:
6 | - Python>=3.6
7 | - torch>=1.5
8 | - torchvision (version corresponding with torch)
9 | - simplejson==3.11.1
10 | - decord>=0.6.0
11 | - pyyaml
12 | - einops
13 | - oss2
14 | - psutil
15 | - tqdm
16 | - pandas
17 | 
18 | optional requirements
19 | - fvcore (for flops calculation)
20 | 
21 | ## Data preparation
22 | 
23 | For all datasets available in `datasets/base`, the name for each dataset list is specified in the `_get_dataset_list_name` function.
24 | Here we provide a table summarizing the names and formats of all the dataset lists.
25 | 
26 | | dataset | split | list file name | format |
27 | | ------- | ----- | -------------- | ------ |
28 | | epic-kitchens-100 | train | EPIC_100_train.csv | as downloaded |
29 | | epic-kitchens-100 | val | EPIC_100_validation.csv | as downloaded |
30 | | epic-kitchens-100 | test | EPIC_100_test_timestamps.csv | as downloaded |
31 | | hmdb51 | train/val | hmdb51_train_list.txt/hmdb51_val_list.txt | "video_path, supervised_label" |
32 | | imagenet | train/val | imagenet_train.txt/imagenet_val.txt | "image_path, supervised_label" |
33 | | kinetics 400 | train/val | kinetics400_train_list.txt/kinetics400_val_list.txt | "video_path, supervised_label" |
34 | | ssv2 | train | something-something-v2-train-with-label.json | json file with "label_idx" specifying the class and "id" specifying the name |
35 | | ssv2 | val | something-something-v2-val-with-label.json | json file with "label_idx" specifying the class and "id" specifying the name |
36 | | ucf101 | train/val | ucf101_train_list.txt/ucf101_val_list.txt | "video_path, supervised_label" |
37 | 
38 | For epic-kitchens-features, the file name is specified in the respective configs in `configs/projects/epic-kitchen-tal`.
39 | 
40 | ## Running
41 | 
42 | The entry file for all the runs is `runs/run.py`.
43 | 
44 | Before running, some settings need to be configured in the config file.
45 | The codebase is designed to be experiment-friendly for rapid development of new models and representation learning approaches, in that the config files are designed in a hierarchical way.
46 | 
47 | Take TAda2D as an example: each experiment (such as TAda2D_8x8 on Kinetics 400: `configs/projects/tada/k400/tada2d_8x8.yaml`) inherits its config from the following hierarchy.
48 | ```
49 | --- base config file [configs/pool/base.yaml]
50 | --- base run config [configs/pool/run/training/from_scratch_large.yaml]
51 | --- base backbone config [configs/pool/backbone/tada2d.yaml]
52 | --- base experiment config [configs/projects/tada/tada2d_k400.yaml]
53 | --- current experiment config [configs/projects/tada/k400/tada2d_8x8.yaml]
54 | ```
55 | Generally, the base config file `configs/pool/base.yaml` contains all the possible keys used in this codebase, and the bottom config overwrites its base config when the same key is encountered in both files.
56 | A good practice is to set the parameters shared by all the experiments in the base experiment config, and the parameters that differ between experiments in the current experiment config.
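As a minimal sketch of this overwriting behaviour (the `OPTIMIZER` keys below are real keys from `configs/pool/base.yaml`; the overriding value is made up for illustration):
```
# configs/pool/base.yaml (base config)
OPTIMIZER:
  BASE_LR: 0.002
  MAX_EPOCH: 300

# current experiment config (bottom of the hierarchy)
OPTIMIZER:
  BASE_LR: 0.0001   # overwrites the base value; MAX_EPOCH stays 300 (inherited)
```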
57 | 
58 | For an example run, open `configs/projects/tada/tada2d_k400.yaml`:
59 | A. Set `DATA.DATA_ROOT_DIR` and `DATA.ANNO_DIR` to point to the Kinetics 400 data and annotations;
60 | B. Set the number of GPUs to use in `NUM_GPUS`.
61 | Then the codebase can be run by:
62 | ```
63 | python runs/run.py --cfg configs/projects/tada/k400/tada2d_8x8.yaml
64 | ```
-------------------------------------------------------------------------------- /MODEL_ZOO.md: --------------------------------------------------------------------------------
1 | # MODEL ZOO
2 | 
3 | ## Kinetics
4 | 
5 | | Dataset | architecture | depth | init | clips x crops | #frames x sampling rate | acc@1 | acc@5 | checkpoint | config |
6 | | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ |
7 | | K400 | TAda2D | R50 | IN-1K | 10 x 3 | 8 x 8 | 76.3 | 92.4 | [`link`]() | configs/projects/tada/k400/tada2d_8x8.yaml |
8 | | K400 | TAda2D | R50 | IN-1K | 10 x 3 | 16 x 5 | 76.9 | 92.7 | [`link`]() | configs/projects/tada/k400/tada2d_16x5.yaml |
9 | | K400 | ViViT Fact. Enc. | B16x2 | IN-21K | 4 x 3 | 32 x 2 | 79.4 | 94.0 | [`link`]() | configs/projects/competition/k400/vivit_fac_enc_b16x2.yaml |
10 | 
11 | ## Something-Something
12 | | Dataset | architecture | depth | init | clips x crops | #frames | acc@1 | acc@5 | checkpoint | config |
13 | | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ |
14 | | SSV2 | TAda2D | R50 | IN-1K | 2 x 3 | 8 | 63.8 | 87.7 | [`link`]() | configs/projects/tada/ssv2/tada2d_8f.yaml |
15 | | SSV2 | TAda2D | R50 | IN-1K | 2 x 3 | 16 | 65.2 | 89.1 | [`link`]() | configs/projects/tada/ssv2/tada2d_16f.yaml |
16 | 
17 | ## Epic-Kitchens Action Recognition
18 | 
19 | | architecture | init | resolution | clips x crops | #frames x sampling rate | action acc@1 | verb acc@1 | noun acc@1 | checkpoint | config |
20 | | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ |
21 | | ViViT Fact. Enc.-B16x2 | K700 | 320 | 4 x 3 | 32 x 2 | 46.3 | 67.4 | 58.9 | [`link`]() | configs/projects/competition/ek100/vivit_fac_enc.yaml |
22 | | ir-CSN-R152 | K700 | 224 | 10 x 3 | 32 x 2 | 44.5 | 68.4 | 55.9 | [`link`]() | configs/projects/competition/ek100/csn.yaml |
23 | 
24 | ## Epic-Kitchens Temporal Action Localization
25 | 
26 | | feature | classification | type | IoU@0.1 | IoU@0.2 | IoU@0.3 | IoU@0.4 | IoU@0.5 | Avg | checkpoint | config |
27 | | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ |
28 | | ViViT | ViViT | Verb | 22.90 | 21.93 | 20.74 | 19.08 | 16.00 | 20.13 | [`link`]() | configs/projects/epic-kitchen-tal/bmn-epic/vivit-os-local.yaml |
29 | | ViViT | ViViT | Noun | 28.95 | 27.38 | 25.52 | 22.67 | 18.95 | 24.69 | [`link`]() | configs/projects/epic-kitchen-tal/bmn-epic/vivit-os-local.yaml |
30 | | ViViT | ViViT | Action | 20.82 | 19.93 | 18.67 | 17.02 | 15.06 | 18.30 | [`link`]() | configs/projects/epic-kitchen-tal/bmn-epic/vivit-os-local.yaml |
31 | 
32 | ## MoSI
33 | Note: for the following models, decord 0.4.1 is used rather than the default 0.6.0 for the codebase.
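To evaluate one of the released checkpoints, a sketch of the workflow (using the `TEST.CHECKPOINT_FILE_PATH` key defined in `configs/pool/base.yaml`) would be: download the checkpoint, point `TEST.CHECKPOINT_FILE_PATH` in the corresponding config to the downloaded file, and run that config, e.g.:
```
python runs/run.py --cfg configs/projects/mosi/ft-hmdb/r2d3ds.yaml
```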
34 | 35 | ### Pre-train (without finetuning) 36 | | dataset | backbone | checkpoint | config | 37 | | ------- | -------- | ---------- | ------ | 38 | | HMDB51 | R-2D3D-18 | [`link`]() | configs/projects/mosi/pt-hmdb/r2d3ds.yaml | 39 | | HMDB51 | R(2+1)D-10 | [`link`]() | configs/projects/mosi/pt-hmdb/r2p1d.yaml | 40 | | UCF101 | R-2D3D-18 | [`link`]() |configs/projects/mosi/pt-ucf/r2d3ds.yaml | 41 | | UCF101 | R(2+1)D-10 | [`link`]() | configs/projects/mosi/pt-ucf/r2p1d.yaml | 42 | 43 | ### Finetuned 44 | | dataset | backbone | acc@1 | acc@5 | checkpoint | config | 45 | | ------- | -------- | ----- | ----- | ---------- | ------ | 46 | | HMDB51 | R-2D3D-18 | 46.93 | 74.71 | [`link`]() | configs/projects/mosi/ft-hmdb/r2d3ds.yaml | 47 | | HMDB51 | R(2+1)D-10 | 51.83 | 78.63 | [`link`]() | configs/projects/mosi/ft-hmdb/r2p1d.yaml | 48 | | UCF101 | R-2D3D-18 | 71.75 | 89.14 | [`link`]() | configs/projects/mosi/ft-ucf/r2d3ds.yaml | 49 | | UCF101 | R(2+1)D-10 | 82.79 | 95.78 | [`link`]() | configs/projects/mosi/ft-ucf/r2p1d.yaml | -------------------------------------------------------------------------------- /runs/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """Entry file for training, evaluating and testing a video model.""" 5 | 6 | import os 7 | import sys 8 | import time 9 | sys.path.append(os.path.abspath(os.curdir)) 10 | 11 | from utils.launcher import launch_task 12 | 13 | from test import test 14 | from train import train 15 | from train_net_few_shot import train_few_shot 16 | from test_net_few_shot import test_few_shot 17 | from test_epic_localization import test_epic_localization 18 | from submission_test import submission_test 19 | 20 | from utils.config import Config 21 | 22 | 23 | def _prepare_data(cfg): 24 | if cfg.TASK_TYPE in ['classification']: 25 | train_func = train 26 | test_func = test 27 | elif cfg.TASK_TYPE in ['localization']: 28 | train_func = train 29 | test_func = test_epic_localization 30 | elif cfg.TASK_TYPE in ['few_shot_action']: 31 | train_func = train_few_shot 32 | test_func = test_few_shot 33 | elif cfg.TASK_TYPE in ["submission"]: 34 | cfg.TRAIN.ENABLE = False 35 | cfg.TEST.ENABLE = False 36 | train_func = None 37 | test_func = None 38 | submission_func = submission_test 39 | else: 40 | raise ValueError("unknown TASK_TYPE {}".format(cfg.TASK_TYPE)) 41 | 42 | run_list = [] 43 | if cfg.TRAIN.ENABLE: 44 | # Training process is performed by the entry function defined above. 45 | run_list.append([cfg.deep_copy(), train_func]) 46 | 47 | if cfg.TEST.ENABLE: 48 | # Test is performed by the entry function defined above. 49 | run_list.append([cfg.deep_copy(), test_func]) 50 | if cfg.TEST.AUTOMATIC_MULTI_SCALE_TEST: 51 | """ 52 | By default, test_func performs single view test. 53 | AUTOMATIC_MULTI_SCALE_TEST automatically performs multi-view test after the single view test. 
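        For instance, with the overrides below, Kinetics and EPIC-KITCHENS
        are re-tested with 10 ensemble views x 3 spatial crops.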
54 | """ 55 | cfg.LOG_MODEL_INFO = False 56 | cfg.LOG_CONFIG_INFO = False 57 | 58 | cfg.TEST.NUM_ENSEMBLE_VIEWS = 10 59 | cfg.TEST.NUM_SPATIAL_CROPS = 1 60 | 61 | if "kinetics" in cfg.TEST.DATASET or "epickitchen" in cfg.TEST.DATASET: 62 | cfg.TEST.NUM_SPATIAL_CROPS = 3 63 | if "imagenet" in cfg.TEST.DATASET and not cfg.PRETRAIN.ENABLE: 64 | cfg.TEST.NUM_ENSEMBLE_VIEWS = 1 65 | cfg.TEST.NUM_SPATIAL_CROPS = 3 66 | if "ssv2" in cfg.TEST.DATASET: 67 | cfg.TEST.NUM_ENSEMBLE_VIEWS = 1 68 | cfg.TEST.NUM_SPATIAL_CROPS = 3 69 | cfg.TEST.LOG_FILE = "val_{}clipsx{}crops.log".format( 70 | cfg.TEST.NUM_ENSEMBLE_VIEWS, cfg.TEST.NUM_SPATIAL_CROPS 71 | ) 72 | run_list.append([cfg.deep_copy(), test_func]) 73 | 74 | if cfg.SUBMISSION.ENABLE: 75 | # currently only supports epic kitchen submission 76 | cfg.LOG_MODEL_INFO = False 77 | cfg.TEST.NUM_ENSEMBLE_VIEWS = 10 78 | cfg.TEST.NUM_SPATIAL_CROPS = 3 79 | 80 | cfg.TEST.LOG_FILE = "test_{}clipsx{}crops.log".format( 81 | cfg.TEST.NUM_ENSEMBLE_VIEWS, cfg.TEST.NUM_SPATIAL_CROPS 82 | ) 83 | run_list.append([cfg.deep_copy(), submission_func]) 84 | 85 | return run_list 86 | 87 | def main(): 88 | """ 89 | Entry function for spawning all the function processes. 90 | """ 91 | cfg = Config(load=True) 92 | 93 | # get the list of configs and functions for running 94 | run_list = _prepare_data(cfg) 95 | 96 | for run in run_list: 97 | launch_task(cfg=run[0], init_method=run[0].get_args().init_method, func=run[1]) 98 | 99 | print("Finish running with config: {}".format(cfg.args.cfg_file)) 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /configs/pool/base.yaml: -------------------------------------------------------------------------------- 1 | TASK_TYPE: classification 2 | PRETRAIN: 3 | ENABLE: false 4 | LOCALIZATION: 5 | ENABLE: false 6 | TRAIN: 7 | ENABLE: false 8 | DATASET: 9 | BATCH_SIZE: 128 10 | LOG_FILE: training_log.log 11 | EVAL_PERIOD: 10 12 | NUM_FOLDS: 1 13 | AUTO_RESUME: true 14 | CHECKPOINT_PERIOD: 10 15 | CHECKPOINT_FILE_PATH: "" 16 | CHECKPOINT_TYPE: pytorch 17 | CHECKPOINT_INFLATE: false 18 | CHECKPOINT_PRE_PROCESS: 19 | ENABLE: false 20 | FINE_TUNE: false 21 | ONLY_LINEAR: false 22 | LR_REDUCE: false 23 | TRAIN_VAL_COMBINE: false 24 | TEST: 25 | ENABLE: false 26 | DATASET: 27 | BATCH_SIZE: 100 28 | NUM_SPATIAL_CROPS: 1 29 | SPATIAL_CROPS: cc 30 | NUM_ENSEMBLE_VIEWS: 1 31 | LOG_FILE: val.log 32 | CHECKPOINT_FILE_PATH: "" 33 | CHECKPOINT_TYPE: pytorch 34 | AUTOMATIC_MULTI_SCALE_TEST: true 35 | VISUALIZATION: 36 | ENABLE: false 37 | NAME: "" 38 | FEATURE_MAPS: 39 | ENABLE: false 40 | BASE_OUTPUT_DIR: "" 41 | SUBMISSION: 42 | ENABLE: false 43 | SAVE_RESULTS_PATH: "test.json" 44 | DATA: 45 | DATA_ROOT_DIR: /data_root/ 46 | ANNO_DIR: /anno_dir/ 47 | NUM_INPUT_FRAMES: 16 48 | NUM_INPUT_CHANNELS: 3 49 | SAMPLING_MODE: interval_based 50 | SAMPLING_RATE: 4 51 | TRAIN_JITTER_SCALES: [168, 224] 52 | TRAIN_CROP_SIZE: 112 53 | TEST_SCALE: 224 54 | TEST_CROP_SIZE: 112 55 | MEAN: [0.45, 0.45, 0.45] 56 | STD: [0.225, 0.225, 0.225] 57 | MULTI_LABEL: false 58 | ENSEMBLE_METHOD: sum 59 | TARGET_FPS: 30 60 | MINUS_INTERVAL: false 61 | MODEL: 62 | NAME: 63 | EMA: 64 | ENABLE: false 65 | DECAY: 0.99996 66 | VIDEO: 67 | BACKBONE: 68 | DEPTH: 69 | META_ARCH: 70 | NUM_FILTERS: 71 | NUM_INPUT_CHANNELS: 3 72 | NUM_OUT_FEATURES: 73 | KERNEL_SIZE: 74 | DOWNSAMPLING: 75 | DOWNSAMPLING_TEMPORAL: 76 | NUM_STREAMS: 1 77 | EXPANSION_RATIO: 2 78 | BRANCH: 79 | NAME: 80 | STEM: 81 | NAME: 82 | 
NONLOCAL: 83 | ENABLE: false 84 | STAGES: [5] 85 | MASK_ENABLE: false 86 | INITIALIZATION: 87 | HEAD: 88 | NAME: BaseHead 89 | ACTIVATION: softmax 90 | DROPOUT_RATE: 0 91 | NUM_CLASSES: 92 | OPTIMIZER: 93 | ADJUST_LR: false 94 | BASE_LR: 0.002 95 | LR_POLICY: cosine 96 | MAX_EPOCH: 300 97 | MOMENTUM: 0.9 98 | WEIGHT_DECAY: 1e-3 99 | WARMUP_EPOCHS: 10 100 | WARMUP_START_LR: 0.0002 101 | OPTIM_METHOD: adam 102 | DAMPENING: 0.0 103 | NESTEROV: true 104 | BN: 105 | WB_LOCK: false 106 | FREEZE: false 107 | WEIGHT_DECAY: 0.0 108 | MOMENTUM: 0.1 109 | EPS: 1e-5 110 | SYNC: false 111 | DATA_LOADER: 112 | NUM_WORKERS: 4 113 | PIN_MEMORY: false 114 | ENABLE_MULTI_THREAD_DECODE: true 115 | COLLATE_FN: 116 | NUM_GPUS: 8 117 | SHARD_ID: 0 118 | NUM_SHARDS: 1 119 | RANDOM_SEED: 0 120 | OUTPUT_DIR: output/ 121 | OUTPUT_CFG_FILE: configuration.log 122 | LOG_PERIOD: 10 123 | DIST_BACKEND: nccl 124 | LOG_MODEL_INFO: true 125 | LOG_CONFIG_INFO: true 126 | OSS: 127 | ENABLE: false 128 | KEY: 129 | SECRET: 130 | ENDPOINT: 131 | CHECKPOINT_OUTPUT_PATH: # !!@7 132 | SECONDARY_DATA_OSS: 133 | ENABLE: false 134 | KEY: 135 | SECRET: 136 | ENDPOINT: 137 | BUCKETS: [""] 138 | AUGMENTATION: 139 | COLOR_AUG: false 140 | BRIGHTNESS: 0.5 141 | CONTRAST: 0.5 142 | SATURATION: 0.5 143 | HUE: 0.25 144 | GRAYSCALE: 0.3 145 | CONSISTENT: true 146 | SHUFFLE: true 147 | GRAY_FIRST: true 148 | RATIO: [0.857142857142857, 1.1666666666666667] 149 | USE_GPU: false 150 | MIXUP: 151 | ENABLE: false 152 | ALPHA: 0.0 153 | PROB: 1.0 154 | MODE: batch 155 | SWITCH_PROB: 0.5 156 | CUTMIX: 157 | ENABLE: false 158 | ALPHA: 0.0 159 | MINMAX: 160 | RANDOM_ERASING: 161 | ENABLE: false 162 | PROB: 0.25 163 | MODE: const 164 | COUNT: [1, 1] 165 | NUM_SPLITS: 0 166 | AREA_RANGE: [0.02, 0.33] 167 | MIN_ASPECT: 0.3 168 | LABEL_SMOOTHING: 0.0 169 | SSV2_FLIP: false 170 | PAI: false -------------------------------------------------------------------------------- /models/module_zoo/stems/embedding_stem.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Embedding stems. """ 5 | 6 | import math 7 | import torch 8 | from torch import nn, einsum 9 | import torch.nn.functional as F 10 | from einops import rearrange, repeat 11 | from models.base.backbone import BACKBONE_REGISTRY 12 | from models.base.base_blocks import ( 13 | STEM_REGISTRY, BRANCH_REGISTRY, HEAD_REGISTRY, DropPath, BaseHead 14 | ) 15 | 16 | @STEM_REGISTRY.register() 17 | class PatchEmbedStem(nn.Module): 18 | """ 19 | Video to Patch Embedding. 20 | """ 21 | def __init__(self, cfg): 22 | """ 23 | Args: 24 | cfg (Config): global config object. 
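        Example shape flow with the commented defaults (224 crop, 3 channels,
        16 frames, patch size 16, dim 768): an input of shape
        (B, 3, 16, 224, 224) is projected by conv1 to (B, 768, 16, 14, 14),
        then flattened and transposed to (B, 16*14*14, 768) = (B, 3136, 768).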
25 | """ 26 | super().__init__() 27 | image_size = cfg.DATA.TRAIN_CROP_SIZE if cfg is not None else 224 # default 224 28 | channels = cfg.DATA.NUM_INPUT_CHANNELS if cfg is not None else 3 # default 3 29 | num_frames = cfg.DATA.NUM_INPUT_FRAMES if cfg is not None else 16 30 | patch_size = cfg.VIDEO.BACKBONE.PATCH_SIZE if cfg is not None else 16 # default 16 31 | dim = cfg.VIDEO.BACKBONE.NUM_FEATURES if cfg is not None else 768 # default 768 32 | 33 | num_patches_per_image = (image_size // patch_size) ** 2 34 | num_patches = num_patches_per_image * num_frames 35 | 36 | self.image_size = image_size 37 | self.patch_size = patch_size 38 | self.num_frames = num_frames 39 | self.num_patches = num_patches 40 | 41 | self.conv1 = nn.Conv3d( 42 | in_channels =channels, 43 | out_channels =dim, 44 | kernel_size =[1, patch_size, patch_size], 45 | stride =[1, patch_size, patch_size], 46 | ) 47 | 48 | def forward(self, x): 49 | b, c, t, h, w, p = *x.shape, self.patch_size 50 | assert h % p == 0 and w % p == 0, f'height {h} and width {w} of video must be divisible by the patch size {p}' 51 | x = self.conv1(x) 52 | # b, c, t, h, w -> b, c, p (p: num patches) 53 | x = x.reshape(x.shape[0], x.shape[1], -1) 54 | # b, c, p -> b, p, c 55 | x = x.permute(0, 2, 1) 56 | return x 57 | 58 | @STEM_REGISTRY.register() 59 | class TubeletEmbeddingStem(nn.Module): 60 | """ 61 | Video to Tubelet Embedding. 62 | """ 63 | def __init__(self, cfg): 64 | """ 65 | Args: 66 | cfg (Config): global config object. 67 | """ 68 | super().__init__() 69 | image_size = cfg.DATA.TRAIN_CROP_SIZE if cfg is not None else 224 # default 224 70 | channels = cfg.DATA.NUM_INPUT_CHANNELS if cfg is not None else 3 # default 3 71 | num_frames = cfg.DATA.NUM_INPUT_FRAMES if cfg is not None else 16 72 | patch_size = cfg.VIDEO.BACKBONE.PATCH_SIZE if cfg is not None else 16 # default 16 73 | dim = cfg.VIDEO.BACKBONE.NUM_FEATURES if cfg is not None else 768 # default 768 74 | tubelet_size = cfg.VIDEO.BACKBONE.TUBELET_SIZE if cfg is not None else 2 75 | 76 | num_patches_per_image = (image_size // patch_size) ** 2 77 | num_patches = num_patches_per_image * num_frames 78 | 79 | self.image_size = image_size 80 | self.patch_size = patch_size 81 | self.num_frames = num_frames 82 | self.num_patches = num_patches 83 | 84 | self.conv1 = nn.Conv3d( 85 | in_channels =channels, 86 | out_channels =dim, 87 | kernel_size =[tubelet_size, patch_size, patch_size], 88 | stride =[tubelet_size, patch_size, patch_size], 89 | ) 90 | 91 | def forward(self, x): 92 | b, c, t, h, w, p = *x.shape, self.patch_size 93 | assert h % p == 0 and w % p == 0, f'height {h} and width {w} of video must be divisible by the patch size {p}' 94 | x = self.conv1(x) 95 | # b, c, t, h, w -> b, c, p (p: num patches) 96 | x = x.reshape(x.shape[0], x.shape[1], -1) 97 | # b, c, p -> b, p, c 98 | x = x.permute(0, 2, 1) 99 | return x -------------------------------------------------------------------------------- /models/module_zoo/branches/slowfast_branch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ SlowFast architectures. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from models.base.base_blocks import BaseBranch 10 | from models.base.base_blocks import BRANCH_REGISTRY 11 | from models.utils.init_helper import _init_convnet_weights 12 | 13 | @BRANCH_REGISTRY.register() 14 | class SlowfastBranch(BaseBranch): 15 | """ 16 | Constructs SlowFast conv branch. 
17 | 18 | See Christoph Feichtenhofer et al. 19 | SlowFast Networks for Video Recognition. 20 | """ 21 | def __init__(self, cfg, block_idx): 22 | super(SlowfastBranch, self).__init__(cfg, block_idx) 23 | 24 | def _construct_simple_block(self): 25 | self.a = nn.Conv3d( 26 | in_channels = self.dim_in, 27 | out_channels = self.num_filters, 28 | kernel_size = self.kernel_size, 29 | stride = self.stride, 30 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 31 | bias = False 32 | ) 33 | self.a_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 34 | self.a_relu = nn.ReLU(inplace=True) 35 | 36 | self.b = nn.Conv3d( 37 | in_channels = self.num_filters, 38 | out_channels = self.num_filters, 39 | kernel_size = self.kernel_size, 40 | stride = 1, 41 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 42 | bias = False 43 | ) 44 | self.b_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 45 | self.b_bn.transform_final_bn = True 46 | 47 | def _construct_bottleneck(self): 48 | self.a = nn.Conv3d( 49 | in_channels = self.dim_in, 50 | out_channels = self.num_filters//self.expansion_ratio, 51 | kernel_size = [3, 1, 1] if self.cfg.VIDEO.BACKBONE.TEMPORAL_CONV_BOTTLENECK[self.stage_id] else 1, 52 | stride = 1, 53 | padding = [1, 0, 0] if self.cfg.VIDEO.BACKBONE.TEMPORAL_CONV_BOTTLENECK[self.stage_id] else 0, 54 | bias = False 55 | ) 56 | self.a_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 57 | self.a_relu = nn.ReLU(inplace=True) 58 | 59 | self.b = nn.Conv3d( 60 | in_channels = self.num_filters//self.expansion_ratio, 61 | out_channels = self.num_filters//self.expansion_ratio, 62 | kernel_size = self.kernel_size, 63 | stride = self.stride, 64 | padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2], 65 | bias = False 66 | ) 67 | self.b_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt) 68 | self.b_relu = nn.ReLU(inplace=True) 69 | 70 | self.c = nn.Conv3d( 71 | in_channels = self.num_filters//self.expansion_ratio, 72 | out_channels = self.num_filters, 73 | kernel_size = 1, 74 | stride = 1, 75 | padding = 0, 76 | bias = False 77 | ) 78 | self.c_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt) 79 | self.c_bn.transform_final_bn = True 80 | 81 | def forward(self, x): 82 | if self.transformation == 'simple_block': 83 | x = self.a(x) 84 | x = self.a_bn(x) 85 | x = self.a_relu(x) 86 | 87 | x = self.b(x) 88 | x = self.b_bn(x) 89 | return x 90 | elif self.transformation == 'bottleneck': 91 | x = self.a(x) 92 | x = self.a_bn(x) 93 | x = self.a_relu(x) 94 | 95 | x = self.b(x) 96 | x = self.b_bn(x) 97 | x = self.b_relu(x) 98 | 99 | x = self.c(x) 100 | x = self.c_bn(x) 101 | return x -------------------------------------------------------------------------------- /models/module_zoo/branches/r2d3d_branch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ R2D3D branch. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from models.base.base_blocks import BaseBranch, BaseHead 10 | from models.base.base_blocks import BRANCH_REGISTRY 11 | 12 | @BRANCH_REGISTRY.register() 13 | class R2D3DBranch(BaseBranch): 14 | """ 15 | The R2D3D Branch. 16 | 17 | Essentially the MCx model in 18 | Du Tran et al. 
19 |     A Closer Look at Spatiotemporal Convolutions for Action Recognition.
20 | 
21 |     The model is used in DPC, MemDPC for self-supervised video
22 |     representation learning.
23 |     """
24 |     def __init__(self, cfg, block_idx):
25 |         """
26 |         Args:
27 |             cfg (Config): global config object.
28 |             block_idx (list): list of [stage_id, block_id], both starting from 0.
29 |         """
30 |         super(R2D3DBranch, self).__init__(cfg, block_idx)
31 | 
32 |     def _construct_simple_block(self):
33 |         self.a = nn.Conv3d(
34 |             in_channels = self.dim_in,
35 |             out_channels = self.num_filters,
36 |             kernel_size = self.kernel_size,
37 |             stride = self.stride,
38 |             padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2],
39 |             bias = False
40 |         )
41 |         self.a_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt)
42 |         self.a_relu = nn.ReLU(inplace=True)
43 | 
44 |         self.b = nn.Conv3d(
45 |             in_channels = self.num_filters,
46 |             out_channels = self.num_filters,
47 |             kernel_size = self.kernel_size,
48 |             stride = 1,
49 |             padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2],
50 |             bias = False
51 |         )
52 |         self.b_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt)
53 | 
54 |     def _construct_bottleneck(self):
55 |         self.a = nn.Conv3d(
56 |             in_channels = self.dim_in,
57 |             out_channels = self.num_filters//self.expansion_ratio,
58 |             kernel_size = 1,
59 |             stride = 1,
60 |             padding = 0,
61 |             bias = False
62 |         )
63 |         self.a_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt)
64 |         self.a_relu = nn.ReLU(inplace=True)
65 | 
66 |         self.b = nn.Conv3d(
67 |             in_channels = self.num_filters//self.expansion_ratio,
68 |             out_channels = self.num_filters//self.expansion_ratio,
69 |             kernel_size = self.kernel_size,
70 |             stride = self.stride,
71 |             padding = [self.kernel_size[0]//2, self.kernel_size[1]//2, self.kernel_size[2]//2],
72 |             bias = False
73 |         )
74 |         self.b_bn = nn.BatchNorm3d(self.num_filters//self.expansion_ratio, eps=self.bn_eps, momentum=self.bn_mmt)
75 |         self.b_relu = nn.ReLU(inplace=True)
76 | 
77 |         self.c = nn.Conv3d(
78 |             in_channels = self.num_filters//self.expansion_ratio,
79 |             out_channels = self.num_filters,
80 |             kernel_size = 1,
81 |             stride = 1,
82 |             padding = 0,
83 |             bias = False
84 |         )
85 |         self.c_bn = nn.BatchNorm3d(self.num_filters, eps=self.bn_eps, momentum=self.bn_mmt)
86 | 
87 |     def forward(self, x):
88 |         if self.transformation == 'simple_block':
89 |             x = self.a(x)
90 |             x = self.a_bn(x)
91 |             x = self.a_relu(x)
92 | 
93 |             x = self.b(x)
94 |             x = self.b_bn(x)
95 |             return x
96 |         elif self.transformation == 'bottleneck':
97 |             x = self.a(x)
98 |             x = self.a_bn(x)
99 |             x = self.a_relu(x)
100 | 
101 |             x = self.b(x)
102 |             x = self.b_bn(x)
103 |             x = self.b_relu(x)
104 | 
105 |             x = self.c(x)
106 |             x = self.c_bn(x)
107 |             return x
108 | 
109 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Hybrid Relation Guided Set Matching for Few-shot Action Recognition (CVPR-2022)
2 | ### Official Pytorch Implementation of '[HyRSM](https://openaccess.thecvf.com/content/CVPR2022/papers/Wang_Hybrid_Relation_Guided_Set_Matching_for_Few-Shot_Action_Recognition_CVPR_2022_paper.pdf)'
3 | 
4 | 
5 | > **Hybrid Relation Guided Set Matching for Few-shot Action Recognition**
6 | > Xiang Wang, Shiwei Zhang, Zhiwu Qing, Mingqian Tang, Zhengrong Zuo, Changxin Gao, Rong Jin, and Nong Sang
7 | 
8 | > [Paper](https://openaccess.thecvf.com/content/CVPR2022/papers/Wang_Hybrid_Relation_Guided_Set_Matching_for_Few-Shot_Action_Recognition_CVPR_2022_paper.pdf), [Project](https://hyrsm-cvpr2022.github.io/)
9 | >
10 | > **Abstract:** *Current few-shot action recognition methods reach impressive performance by learning discriminative features for each video via episodic training and designing various temporal alignment strategies. Nevertheless, they are limited in that (a) learning individual features without considering the entire task may lose the most relevant information in the current episode, and (b) these alignment strategies may fail in misaligned instances. To overcome the two limitations, we propose a novel Hybrid Relation guided Set Matching (HyRSM) approach that incorporates two key components: hybrid relation module and set matching metric. The purpose of the hybrid relation module is to learn task-specific embeddings by fully exploiting associated relations within and cross videos in an episode. Built upon the task-specific features, we reformulate distance measure between query and support videos as a set matching problem and further design a bidirectional Mean Hausdorff Metric to improve the resilience to misaligned instances. By this means, the proposed HyRSM can be highly informative and flexible to predict query categories under the few-shot settings. We evaluate HyRSM on six challenging benchmarks, and the experimental results show its superiority over the state-of-the-art methods by a convincing margin.*
11 | 
12 | 
13 | This code is based on the [pytorch-video-understanding](https://github.com/alibaba-mmai-research/TAdaConv) codebase, which provides a comprehensive video understanding solution for video classification and temporal detection.
14 | 
15 | ## Installation
16 | 
17 | Requirements:
18 | - Python>=3.6
19 | - torch>=1.5
20 | - torchvision (version corresponding with torch)
21 | - simplejson==3.11.1
22 | - decord>=0.6.0
23 | - pyyaml
24 | - einops
25 | - oss2
26 | - psutil
27 | - tqdm
28 | - pandas
29 | 
30 | optional requirements
31 | - fvcore (for flops calculation)
32 | 
33 | Or you can create environments with the following command:
34 | ```
35 | conda env create -f environment.yaml
36 | ```
37 | 
38 | ## Data preparation
39 | 
40 | First, you need to download the datasets from their original sources
41 | (if you have already downloaded them, please skip this step):
42 | 
43 | - [SSV2](https://20bn.com/datasets/something-something#download)
44 | - [Kinetics](https://github.com/Showmax/kinetics-downloader)
45 | - [UCF101](https://www.crcv.ucf.edu/data/UCF101.php)
46 | - [HMDB51](https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/#Downloads)
47 | - [Epic-kitchens](https://epic-kitchens.github.io/2022)
48 | 
49 | Then, prepare data according to the [splits](configs/projects/hyrsm) we provide.
50 | 
51 | ## Running
52 | The entry file for all the runs is `runs/run.py`.
53 | 
54 | Before running, some settings need to be configured in the config file.
55 | The codebase is designed to be experiment-friendly for rapid development of new models and representation learning approaches, in that the config files are designed in a hierarchical way.
56 | 
57 | For an example run, open `configs/projects/hyrsm/kinetics100/HyRSM_K100_1shot_v1.yaml`:
58 | 
59 | A. Set `DATA.DATA_ROOT_DIR` and `DATA.ANNO_DIR` to point to the Kinetics dataset;
60 | 
61 | B. Set the number of GPUs to use in `NUM_GPUS`.
62 | 
63 | Then the codebase can be run by:
64 | ```
65 | python runs/run.py --cfg configs/projects/hyrsm/kinetics100/HyRSM_K100_1shot_v1.yaml
66 | ```
67 | 
68 | ## Citation
69 | If you find this code useful, please cite our paper.
70 | 
71 | ~~~~
72 | @inproceedings{wang2022hybrid,
73 |   title={Hybrid Relation Guided Set Matching for Few-shot Action Recognition},
74 |   author={Wang, Xiang and Zhang, Shiwei and Qing, Zhiwu and Tang, Mingqian and Zuo, Zhengrong and Gao, Changxin and Jin, Rong and Sang, Nong},
75 |   booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
76 |   pages={19948--19957},
77 |   year={2022}
78 | }
79 | ~~~~
80 | 
-------------------------------------------------------------------------------- /datasets/base/builder.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (C) Alibaba Group Holding Limited.
3 | 
4 | """ Builder for the dataloader."""
5 | 
6 | import itertools
7 | import numpy as np
8 | import torch
9 | import utils.misc as misc
10 | from utils.sampler import MultiFoldDistributedSampler
11 | from torch.utils.data._utils.collate import default_collate
12 | from torch.utils.data.distributed import DistributedSampler
13 | from torch.utils.data.sampler import RandomSampler
14 | from utils.val_dist_sampler import MultiSegValDistributedSampler
15 | from datasets.utils.collate_functions import COLLATE_FN_REGISTRY
16 | 
17 | 
18 | from utils.registry import Registry
19 | 
20 | DATASET_REGISTRY = Registry("DATASET")
21 | 
22 | def get_sampler(cfg, dataset, split, shuffle):
23 |     """
24 |     Returns the sampler object for the dataset.
25 |     Args:
26 |         dataset (Dataset): constructed dataset.
27 |         split (str): which split is the dataset for.
28 |         shuffle (bool): whether or not to shuffle the dataset.
29 |     Returns:
30 |         sampler (Sampler): dataset sampler.
31 |     """
32 |     if misc.get_num_gpus(cfg) > 1:
33 |         if split == "train" and cfg.TRAIN.NUM_FOLDS > 1:
34 |             return MultiFoldDistributedSampler(
35 |                 dataset, cfg.TRAIN.NUM_FOLDS
36 |             )
37 |         elif cfg.USE_MULTISEG_VAL_DIST and cfg.TRAIN.ENABLE is False:
38 |             return MultiSegValDistributedSampler(dataset, shuffle=False)
39 |         else:
40 |             return DistributedSampler(
41 |                 dataset,
42 |                 shuffle=shuffle
43 |             )
44 |     else:
45 |         return None
46 | 
47 | def build_loader(cfg, split):
48 |     """
49 |     Constructs the data loader for the given dataset.
50 |     Args:
51 |         cfg (Config): global config object; details in utils/config.py.
52 |         split (str): the split of the data loader. Options include `train`,
53 |             `val`, `test`, and `submission`.
54 |     Returns:
55 |         loader object.
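        Example (a typical call when setting up training):
            train_loader = build_loader(cfg, "train")
            val_loader = build_loader(cfg, "val")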
56 | """ 57 | assert split in ["train", "val", "test", "submission"] 58 | if split in ["train"]: 59 | dataset_name = cfg.TRAIN.DATASET 60 | batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 61 | shuffle = True 62 | drop_last = True 63 | elif split in ["val"]: 64 | dataset_name = cfg.TEST.DATASET 65 | batch_size = int(cfg.TEST.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 66 | shuffle = False 67 | drop_last = False 68 | elif split in ["test", "submission"]: 69 | dataset_name = cfg.TEST.DATASET 70 | batch_size = int(cfg.TEST.BATCH_SIZE / max(1, cfg.NUM_GPUS)) 71 | shuffle = False 72 | drop_last = False 73 | 74 | # Construct the dataset 75 | dataset = build_dataset(dataset_name, cfg, split) 76 | 77 | # Create a sampler for multi-process training 78 | sampler = get_sampler(cfg, dataset, split, shuffle) 79 | # Create a loader 80 | if hasattr(cfg.DATA_LOADER, "COLLATE_FN") and cfg.DATA_LOADER.COLLATE_FN is not None: 81 | collate_fn = COLLATE_FN_REGISTRY.get(cfg.DATA_LOADER.COLLATE_FN)(cfg) 82 | else: 83 | collate_fn = None 84 | loader = torch.utils.data.DataLoader( 85 | dataset, 86 | batch_size=batch_size, 87 | shuffle=(False if sampler else shuffle), 88 | sampler=sampler, 89 | num_workers=cfg.DATA_LOADER.NUM_WORKERS, 90 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, 91 | drop_last=drop_last, 92 | collate_fn=collate_fn 93 | ) 94 | return loader 95 | 96 | 97 | def shuffle_dataset(loader, cur_epoch): 98 | """" 99 | Shuffles the sampler for the dataset. 100 | Args: 101 | loader (loader): data loader to perform shuffle. 102 | cur_epoch (int): number of the current epoch. 103 | """ 104 | sampler = loader.sampler 105 | assert isinstance( 106 | sampler, (RandomSampler, DistributedSampler, MultiFoldDistributedSampler) 107 | ), "Sampler type '{}' not supported".format(type(sampler)) 108 | # RandomSampler handles shuffling automatically 109 | if isinstance(sampler, (DistributedSampler, MultiFoldDistributedSampler)): 110 | # DistributedSampler shuffles data based on epoch 111 | sampler.set_epoch(cur_epoch) 112 | 113 | def build_dataset(dataset_name, cfg, split): 114 | """ 115 | Builds a dataset according to the "dataset_name". 116 | Args: 117 | dataset_name (str): the name of the dataset to be constructed. 118 | cfg (Config): global config object. 119 | split (str): the split of the data loader. 120 | Returns: 121 | Dataset (Dataset): a dataset object constructed for the specified dataset_name. 122 | """ 123 | name = dataset_name.capitalize() 124 | return DATASET_REGISTRY.get(name)(cfg, split) 125 | -------------------------------------------------------------------------------- /models/utils/lars.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # From https://github.com/open-mmlab/OpenSelfSup/blob/1db69ecebbc129e8fa90cdcea6f2082f0a4e3d17/openselfsup/utils/optimizers.py 3 | 4 | import torch 5 | from torch.optim.optimizer import Optimizer, required 6 | from torch.optim import * 7 | 8 | 9 | class LARS(Optimizer): 10 | r"""Implements layer-wise adaptive rate scaling for SGD. 
11 | 12 | Args: 13 | params (iterable): iterable of parameters to optimize or dicts defining 14 | parameter groups 15 | lr (float): base learning rate (\gamma_0) 16 | momentum (float, optional): momentum factor (default: 0) ("m") 17 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 18 | ("\beta") 19 | dampening (float, optional): dampening for momentum (default: 0) 20 | eta (float, optional): LARS coefficient 21 | nesterov (bool, optional): enables Nesterov momentum (default: False) 22 | 23 | Based on Algorithm 1 of the following paper by You, Gitman, and Ginsburg. 24 | Large Batch Training of Convolutional Networks: 25 | https://arxiv.org/abs/1708.03888 26 | 27 | Example: 28 | >>> optimizer = LARS(model.parameters(), lr=0.1, momentum=0.9, 29 | >>> weight_decay=1e-4, eta=1e-3) 30 | >>> optimizer.zero_grad() 31 | >>> loss_fn(model(input), target).backward() 32 | >>> optimizer.step() 33 | """ 34 | 35 | def __init__(self, 36 | params, 37 | lr=required, 38 | momentum=0, 39 | dampening=0, 40 | weight_decay=0, 41 | eta=0.001, 42 | nesterov=False): 43 | if lr is not required and lr < 0.0: 44 | raise ValueError("Invalid learning rate: {}".format(lr)) 45 | if momentum < 0.0: 46 | raise ValueError("Invalid momentum value: {}".format(momentum)) 47 | if weight_decay < 0.0: 48 | raise ValueError( 49 | "Invalid weight_decay value: {}".format(weight_decay)) 50 | if eta < 0.0: 51 | raise ValueError("Invalid LARS coefficient value: {}".format(eta)) 52 | 53 | defaults = dict( 54 | lr=lr, momentum=momentum, dampening=dampening, 55 | weight_decay=weight_decay, nesterov=nesterov, eta=eta) 56 | if nesterov and (momentum <= 0 or dampening != 0): 57 | raise ValueError("Nesterov momentum requires a momentum and zero dampening") 58 | 59 | super(LARS, self).__init__(params, defaults) 60 | 61 | def __setstate__(self, state): 62 | super(LARS, self).__setstate__(state) 63 | for group in self.param_groups: 64 | group.setdefault('nesterov', False) 65 | 66 | @torch.no_grad() 67 | def step(self, closure=None): 68 | """Performs a single optimization step. 69 | 70 | Args: 71 | closure (callable, optional): A closure that reevaluates the model 72 | and returns the loss. 73 | """ 74 | loss = None 75 | if closure is not None: 76 | with torch.enable_grad(): 77 | loss = closure() 78 | 79 | for group in self.param_groups: 80 | weight_decay = group['weight_decay'] 81 | momentum = group['momentum'] 82 | dampening = group['dampening'] 83 | eta = group['eta'] 84 | nesterov = group['nesterov'] 85 | lr = group['lr'] 86 | lars_exclude = group.get('lars_exclude', False) 87 | 88 | for p in group['params']: 89 | if p.grad is None: 90 | continue 91 | 92 | d_p = p.grad 93 | 94 | if lars_exclude: 95 | local_lr = 1. 
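                    # Parameter groups flagged with `lars_exclude` (typically,
                    # e.g., biases and BatchNorm parameters) skip the adaptive
                    # scaling and use the global lr unchanged.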
96 | else: 97 | weight_norm = torch.norm(p).item() 98 | grad_norm = torch.norm(d_p).item() 99 | # Compute local learning rate for this layer 100 | local_lr = eta * weight_norm / \ 101 | (grad_norm + weight_decay * weight_norm) 102 | 103 | actual_lr = local_lr * lr 104 | d_p = d_p.add(p, alpha=weight_decay).mul(actual_lr) 105 | if momentum != 0: 106 | param_state = self.state[p] 107 | if 'momentum_buffer' not in param_state: 108 | buf = param_state['momentum_buffer'] = \ 109 | torch.clone(d_p).detach() 110 | else: 111 | buf = param_state['momentum_buffer'] 112 | buf.mul_(momentum).add_(d_p, alpha=1 - dampening) 113 | if nesterov: 114 | d_p = d_p.add(buf, alpha=momentum) 115 | else: 116 | d_p = buf 117 | p.add_(-d_p) 118 | 119 | return loss -------------------------------------------------------------------------------- /models/module_zoo/heads/transformer_head.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group Holding Limited. 3 | 4 | """ Transformer heads. """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from models.base.base_blocks import BaseHead 10 | from models.base.base_blocks import HEAD_REGISTRY 11 | 12 | from collections import OrderedDict 13 | from models.utils.init_helper import lecun_normal_, trunc_normal_, _init_transformer_weights 14 | 15 | @HEAD_REGISTRY.register() 16 | class TransformerHead(BaseHead): 17 | """ 18 | Construct head for video vision transformers. 19 | """ 20 | def __init__(self, cfg): 21 | """ 22 | Args: 23 | cfg (Config): global config object. 24 | """ 25 | super(TransformerHead, self).__init__(cfg) 26 | self.apply(_init_transformer_weights) 27 | 28 | def _construct_head( 29 | self, 30 | dim, 31 | num_classes, 32 | dropout_rate, 33 | activation_func, 34 | ): 35 | if self.cfg.VIDEO.HEAD.PRE_LOGITS: 36 | self.pre_logits = nn.Sequential(OrderedDict([ 37 | ('fc', nn.Linear(dim, dim)), 38 | ('act', nn.Tanh()) 39 | ])) 40 | 41 | self.linear = nn.Linear(dim, num_classes) 42 | 43 | if dropout_rate > 0.0: 44 | self.dropout = nn.Dropout(dropout_rate) 45 | 46 | if activation_func == "softmax": 47 | self.activation = nn.Softmax(dim=-1) 48 | elif activation_func == "sigmoid": 49 | self.activation = nn.Sigmoid() 50 | elif activation_func == "identity": 51 | self.activation = nn.Identity() 52 | else: 53 | raise NotImplementedError( 54 | "{} is not supported as an activation" 55 | "function.".format(activation_func) 56 | ) 57 | 58 | def forward(self, x): 59 | """ 60 | Returns: 61 | x (Tensor): classification predictions. 62 | logits (Tensor): global average pooled features. 63 | """ 64 | if hasattr(self, "dropout"): 65 | out = self.dropout(x) 66 | else: 67 | out = x 68 | if hasattr(self, "pre_logits"): 69 | out = self.pre_logits(out) 70 | out = self.linear(out) 71 | 72 | if not self.training: 73 | out = self.activation(out) 74 | return out, x 75 | 76 | @HEAD_REGISTRY.register() 77 | class TransformerHeadx2(BaseHead): 78 | """ 79 | The Transformer head for EPIC-KITCHENS dataset. 80 | """ 81 | def __init__(self, cfg): 82 | """ 83 | Args: 84 | cfg (Config): global config object. 
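        For EPIC-KITCHENS-100, cfg.VIDEO.HEAD.NUM_CLASSES would be a pair
        such as [97, 300] (97 verb and 300 noun classes), one entry per
        linear layer constructed below.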
85 | """ 86 | super(TransformerHeadx2, self).__init__(cfg) 87 | self.apply(_init_transformer_weights) 88 | 89 | def _construct_head( 90 | self, 91 | dim, 92 | num_classes, 93 | dropout_rate, 94 | activation_func, 95 | ): 96 | if self.cfg.VIDEO.HEAD.PRE_LOGITS: 97 | self.pre_logits1 = nn.Sequential(OrderedDict([ 98 | ('fc', nn.Linear(dim, dim)), 99 | ('act', nn.Tanh()) 100 | ])) 101 | self.pre_logits2 = nn.Sequential(OrderedDict([ 102 | ('fc', nn.Linear(dim, dim)), 103 | ('act', nn.Tanh()) 104 | ])) 105 | self.linear1 = nn.Linear(dim, num_classes[0], bias=True) 106 | self.linear2 = nn.Linear(dim, num_classes[1], bias=True) 107 | 108 | if dropout_rate > 0.0: 109 | self.dropout = nn.Dropout(dropout_rate) 110 | 111 | if activation_func == "softmax": 112 | self.activation = nn.Softmax(dim=-1) 113 | elif activation_func == "sigmoid": 114 | self.activation = nn.Sigmoid() 115 | elif activation_func == "identity": 116 | self.activation = nn.Identity() 117 | else: 118 | raise NotImplementedError( 119 | "{} is not supported as an activation" 120 | "function.".format(activation_func) 121 | ) 122 | 123 | def forward(self, x): 124 | """ 125 | Returns: 126 | x (dict): dictionary of classification predictions, 127 | with keys "verb_class" and "noun_class" indicating 128 | the predictions on the verb and noun. 129 | logits (Tensor): global average pooled features. 130 | """ 131 | if hasattr(self, "dropout"): 132 | out1 = self.dropout(x) 133 | out2 = self.dropout(x) 134 | else: 135 | out1 = x 136 | out2 = x 137 | 138 | if hasattr(self, "pre_logits1"): 139 | out1 = self.pre_logits1(out1) 140 | out2 = self.pre_logits2(out2) 141 | 142 | out1 = self.linear1(out1) 143 | out2 = self.linear2(out2) 144 | 145 | if not self.training: 146 | out1 = self.activation(out1) 147 | out2 = self.activation(out2) 148 | return {"verb_class": out1, "noun_class": out2}, x --------------------------------------------------------------------------------