├── src ├── ClipViP │ └── src │ │ ├── __init__.py │ │ ├── utils │ │ ├── __init__.py │ │ ├── misc.py │ │ ├── metrics.py │ │ └── logger.py │ │ ├── datasets │ │ └── __init__.py │ │ ├── modeling │ │ └── __init__.py │ │ ├── optimization │ │ ├── __init__.py │ │ └── sched.py │ │ └── configs │ │ ├── lsmdc_retrieval │ │ ├── lsmdc_retrieval_vip_base_16.json │ │ └── lsmdc_retrieval_vip_base_32.json │ │ ├── didemo_retrieval │ │ ├── didemo_retrieval_vip_base_32.json │ │ └── didemo_retrieval_vip_base_16.json │ │ ├── msrvtt_retrieval │ │ ├── msrvtt_retrieval_vip_base_16.json │ │ └── msrvtt_retrieval_vip_base_32.json │ │ ├── actnet_retrieval │ │ ├── actnet_retrieval_vip_base_16.json │ │ └── actnet_retrieval_vip_base_32.json │ │ ├── pretrain │ │ ├── pretrain_vip_base_16.json │ │ └── pretrain_vip_base_32.json │ │ └── pretrained │ │ └── pretrain_vip_base_32.json ├── Singularity │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ └── model_retrieval.py │ ├── configs │ │ ├── config_bert.json │ │ ├── beit-base-patch16-224-pt22k-ft22k.json │ │ ├── qa_anet.yaml │ │ ├── qa_msrvtt.yaml │ │ └── qa_vqa.yaml │ ├── dataset │ │ ├── dataloader.py │ │ ├── base_dataset.py │ │ └── qa_dataset.py │ └── utils │ │ ├── scheduler.py │ │ └── config_utils.py ├── InternVideo │ ├── __init__.py │ ├── clip_utils │ │ ├── __init__.py │ │ └── utils │ │ │ └── __init__.py │ └── bpe_simple_vocab_16e6.txt.gz ├── demo_video │ └── ssv2_194058__book_falling_like_a_rock.mp4 ├── run_scripts │ ├── train.sh │ ├── eval_downstream_task.sh │ ├── eval_actionbench.sh │ └── inference.sh ├── configs │ ├── datasets │ │ ├── downstream_tasks │ │ │ ├── downstream_tasks_moments_in_time.yaml │ │ │ ├── downstream_tasks_temporal_224x224_5fps.yaml │ │ │ ├── downstream_tasks_retrieval_ssv2_224x224_5fps.yaml │ │ │ └── downstream_tasks_qa_nextqa_224x224_5fps.yaml │ │ └── actionbench │ │ │ ├── actionbench_ssv2_224x224_5fps.yaml │ │ │ ├── actionbench_ssv2_antonyms_224x224_5fps.yaml │ │ │ ├── actionbench_ego4d_224x224_5fps.yaml │ │ │ └── actionbench_ego4d_object_shuffled_224x224_5fps.yaml │ ├── projects │ │ ├── eval │ │ │ ├── actionbench │ │ │ │ ├── knowledge_patcher │ │ │ │ │ ├── README.md │ │ │ │ │ └── acdybench_ego4d_internvideo_KP-Perceiver-VTC-DVDM__action_antonym.yaml │ │ │ │ └── backbone │ │ │ │ │ ├── internvideo │ │ │ │ │ ├── ego4d │ │ │ │ │ │ ├── acdybench_ego4d_internvideo_backbone__action_antonym.yaml │ │ │ │ │ │ ├── acdybench_ego4d_internvideo_backbone__reversed_video.yaml │ │ │ │ │ │ └── acdybench_ego4d_internvideo_backbone__object_shuffle.yaml │ │ │ │ │ └── ssv2 │ │ │ │ │ │ ├── acdybench_ssv2_internvideo_backbone__action_antonym.yaml │ │ │ │ │ │ ├── acdybench_ssv2_internvideo_backbone__reversed_video.yaml │ │ │ │ │ │ └── acdybench_ssv2_internvideo_backbone__object_shuffle.yaml │ │ │ │ │ ├── clipvip │ │ │ │ │ ├── ego4d │ │ │ │ │ │ ├── acdybench_ego4d_clipvip_backbone__action_antonym.yaml │ │ │ │ │ │ ├── acdybench_ego4d_clipvip_backbone__reversed_video.yaml │ │ │ │ │ │ └── acdybench_ego4d_clipvip_backbone__object_shuffle.yaml │ │ │ │ │ └── ssv2 │ │ │ │ │ │ ├── acdybench_ssv2_clipvip_backbone__action_antonym.yaml │ │ │ │ │ │ ├── acdybench_ssv2_clipvip_backbone__object_shuffle.yaml │ │ │ │ │ │ └── acdybench_ssv2_clipvip_backbone__reversed_video.yaml │ │ │ │ │ └── singularity │ │ │ │ │ ├── ego4d │ │ │ │ │ ├── acdybench_ego4d_singularity_backbone__action_antonym.yaml │ │ │ │ │ ├── acdybench_ego4d_singularity_backbone__reversed_video.yaml │ │ │ │ │ └── acdybench_ego4d_singularity_backbone__object_shuffle.yaml │ │ │ │ │ └── ssv2 │ │ │ │ │ ├── 
acdybench_ssv2_singularity_backbone__action_antonym.yaml │ │ │ │ │ ├── acdybench_ssv2_singularity_backbone__object_shuffle.yaml │ │ │ │ │ └── acdybench_ssv2_singularity_backbone__reversed_video.yaml │ │ │ └── downstream_task │ │ │ │ ├── nextqa │ │ │ │ ├── backbone_zero-shot.yaml │ │ │ │ ├── side_tuning.yaml │ │ │ │ └── patch_and_fuse.yaml │ │ │ │ ├── temporal_ssv2 │ │ │ │ ├── backbone_zero-shot.yaml │ │ │ │ ├── side_tuning.yaml │ │ │ │ └── patch_and_fuse.yaml │ │ │ │ ├── ssv2_label │ │ │ │ ├── backbone_zero-shot.yaml │ │ │ │ ├── side_tuning.yaml │ │ │ │ └── patch_and_fuse.yaml │ │ │ │ ├── ssv2_template │ │ │ │ ├── backbone_zero-shot.yaml │ │ │ │ ├── side_tuning.yaml │ │ │ │ └── patch_and_fuse.yaml │ │ │ │ ├── moments_in_time │ │ │ │ ├── backbone_zero-shot.yaml │ │ │ │ ├── side_tuning_zero-shot.yaml │ │ │ │ └── patch_and_fuse_zero-shot.yaml │ │ │ │ └── temporal_kinetics │ │ │ │ ├── backbone_zero-shot.yaml │ │ │ │ ├── side_tuning_zero-shot.yaml │ │ │ │ └── patch_and_fuse_zero-shot.yaml │ │ └── train │ │ │ ├── downstream_tasks │ │ │ └── nextqa │ │ │ │ ├── KP-Perceiver-VTC-DVDM.yaml │ │ │ │ ├── Patch_and_Fuse.yaml │ │ │ │ └── Side_Tuning.yaml │ │ │ └── actionbench │ │ │ ├── ssv2 │ │ │ └── KP-Transformer-VTC.yaml │ │ │ └── ego4d │ │ │ └── KP-Transformer-VTC.yaml │ └── models │ │ ├── patch_and_fuse_intern_video.yaml │ │ ├── patch_and_fuse_clip_vip.yaml │ │ └── patch_and_fuse_singularity.yaml ├── preprocessing │ ├── ego4d │ │ └── downsample_downsize_video_clips.py │ ├── ssv2 │ │ └── downsample_downsize_video_clips.py │ ├── kinetics │ │ └── downsample_downsize_video_clips.py │ └── nextqa │ │ └── downsample_downsize_video_nextqa.py └── _get_model_computational_complexity.py ├── Paxion_overview.png ├── ActionBench_overview.png ├── .gitmodules ├── ActionBench └── src │ ├── ignored_verbs_ssv2.json │ ├── ignored_verbs_ego4d.json │ ├── README.md │ ├── additional_antonyms_mapping_ssv2.json │ ├── get_object_shuffling_ssv2.py │ └── split_train_val_test_ego4d.py ├── .gitignore └── dataset_cards ├── moments_in_time.md ├── downstream_tasks_ssv2.md ├── actionbench_ssv2.md ├── nextqa.md ├── downstream_tasks_temporal.md └── actionbench_ego4d.md /src/ClipViP/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/Singularity/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ClipViP/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ClipViP/src/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ClipViP/src/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/Singularity/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ClipViP/src/optimization/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /src/InternVideo/__init__.py: -------------------------------------------------------------------------------- 1 | from .internvideo import * -------------------------------------------------------------------------------- /src/InternVideo/clip_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | -------------------------------------------------------------------------------- /Paxion_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MikeWangWZHL/Paxion/HEAD/Paxion_overview.png -------------------------------------------------------------------------------- /ActionBench_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MikeWangWZHL/Paxion/HEAD/ActionBench_overview.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/LAVIS"] 2 | path = src/LAVIS 3 | url = https://github.com/MikeWangWZHL/LAVIS.git -------------------------------------------------------------------------------- /src/InternVideo/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MikeWangWZHL/Paxion/HEAD/src/InternVideo/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /src/demo_video/ssv2_194058__book_falling_like_a_rock.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MikeWangWZHL/Paxion/HEAD/src/demo_video/ssv2_194058__book_falling_like_a_rock.mp4 -------------------------------------------------------------------------------- /ActionBench/src/ignored_verbs_ssv2.json: -------------------------------------------------------------------------------- 1 | [ 2 | "hitting", 3 | "moving", 4 | "pass", 5 | "poking", 6 | "pick", 7 | "pretending", 8 | "poke", 9 | "putting" 10 | ] -------------------------------------------------------------------------------- /src/InternVideo/clip_utils/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # from .evl_module import TransformerDecoder 2 | from .clip_vit_only_global import vit_only_global_b32, vit_only_global_b16, vit_only_global_l14, vit_only_global_l14_336 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | datasets/* 2 | ActionBench/ego4d 3 | ActionBench/ssv2 4 | pretrained_ckpt/* 5 | *.pth 6 | *.pt 7 | *.ckpt 8 | checkpoint/ 9 | dummy_dataset/ 10 | src/visualization 11 | output/ 12 | src/testing_outputs 13 | _backup 14 | **/__pycache__/ 15 | **/*.code-workspace 16 | ckpt/ 17 | *.pt 18 | *tar.gz 19 | *.zip 20 | # *.png 21 | # *.jpg -------------------------------------------------------------------------------- /src/Singularity/models/model_retrieval.py: -------------------------------------------------------------------------------- 1 | from .model_retrieval_base import SingularityRetrievalBase 2 | # from models.model_retrieval_base import SingularityRetrievalBase 3 | 4 | 5 | class Singularity(SingularityRetrievalBase): 6 | def 
__init__(self, config=None, tokenizer=None): 7 | super(Singularity, self).__init__( 8 | config=config, tokenizer=tokenizer, pretrain=False 9 | ) 10 | -------------------------------------------------------------------------------- /src/run_scripts/train.sh: -------------------------------------------------------------------------------- 1 | DEVICES=0,1 # comma-separated list of GPU IDs 2 | N_GPU=2 # number of GPUs to use for training 3 | PORT=29501 4 | 5 | # takes in a .yaml config file from configs/projects/train, e.g., 6 | CONFIG="configs/projects/train/actionbench/ssv2/KP-Perceiver-VTC-DVDM.yaml" 7 | CUDA_VISIBLE_DEVICES=${DEVICES} python -m torch.distributed.run \ 8 | --nproc_per_node=${N_GPU} \ 9 | --master_port=${PORT} \ 10 | train.py --cfg-path ${CONFIG} -------------------------------------------------------------------------------- /src/run_scripts/eval_downstream_task.sh: -------------------------------------------------------------------------------- 1 | DEVICES=0 # support one GPU only for downstream evaluation (fast) 2 | N_GPU=1 3 | PORT=29501 4 | 5 | # takes in a .yaml config file from configs/projects/eval/downstream_task, e.g., 6 | CONFIG="configs/projects/eval/downstream_task/ssv2_template/backbone_zero-shot.yaml" 7 | CUDA_VISIBLE_DEVICES=${DEVICES} python -m torch.distributed.run \ 8 | --nproc_per_node=${N_GPU} \ 9 | --master_port=${PORT} \ 10 | evaluate.py --cfg-path ${CONFIG} -------------------------------------------------------------------------------- /dataset_cards/moments_in_time.md: -------------------------------------------------------------------------------- 1 | ## Instruction for Downloading Videos 2 | - Download the videos following the instructions [here](http://moments.csail.mit.edu/) 3 | - Put the downloaded videos to `datasets/Moments_In_Time/videos` 4 | 5 | ## Annotation Details 6 | We subsample ~2k instances from the original validation set for doing the zero-shot action classification. 
7 | - validation size: 1830 8 | - ann_path: `datasets/Moments_In_Time/ann/validationSet_2k.csv` 9 | - format: refer to `datasets/Moments_In_Time/ann/README.md` -------------------------------------------------------------------------------- /src/run_scripts/eval_actionbench.sh: -------------------------------------------------------------------------------- 1 | DEVICES=0,1 # comma-separated list of GPU IDs 2 | N_GPU=2 # number of GPUs to use for training 3 | PORT=29501 4 | 5 | # takes in a .yaml config file from configs/projects/eval/actionbench, e.g., 6 | CONFIG="configs/projects/eval/actionbench/backbone/internvideo/ssv2/actionbench_ssv2_internvideo_backbone__action_antonym.yaml" 7 | CUDA_VISIBLE_DEVICES=${DEVICES} python -m torch.distributed.run \ 8 | --nproc_per_node=${N_GPU} \ 9 | --master_port=${PORT} \ 10 | evaluate.py --cfg-path ${CONFIG} -------------------------------------------------------------------------------- /src/Singularity/configs/config_bert.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30522, 19 | "fusion_layer": 9, 20 | "encoder_width": 768 21 | } 22 | -------------------------------------------------------------------------------- /src/configs/datasets/downstream_tasks/downstream_tasks_moments_in_time.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | downstream_tasks_moment_in_time: # name of the dataset builder 3 | dataset_card: dataset_cards/moments_in_time.md 4 | data_type: videos #extracted features of videos (I3D, VGGish) # [images|videos|features] 5 | 6 | build_info: 7 | # Be careful not to append minus sign (-) before split to avoid itemizing 8 | annotations: 9 | train: 10 | path: datasets/Moments_In_Time/ann 11 | val: 12 | path: datasets/Moments_In_Time/ann 13 | videos: 14 | path: datasets/Moments_In_Time/videos -------------------------------------------------------------------------------- /src/configs/datasets/downstream_tasks/downstream_tasks_temporal_224x224_5fps.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | downstream_tasks_temporal: # name of the dataset builder 3 | dataset_card: dataset_cards/downstream_tasks_temporal.md 4 | data_type: videos #extracted features of videos (I3D, VGGish) # [images|videos|features] 5 | 6 | build_info: 7 | annotations: 8 | val: 9 | path: datasets/Temporal/ann 10 | videos: 11 | path: 12 | kinetics: datasets/Temporal/video_clips/kinetics400/clips_downsampled_5fps_downsized_224x224 13 | ssv2: datasets/SSv2/video_clips/clips_downsampled_5fps_downsized_224x224 -------------------------------------------------------------------------------- /src/configs/datasets/downstream_tasks/downstream_tasks_retrieval_ssv2_224x224_5fps.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | downstream_tasks_retrieval_ssv2_224x224_5fps: # name of the dataset builder 3 | dataset_card: dataset_cards/downstream_task_ssv2.md 4 | data_type: videos # [images|videos|features] 5 | 6 | 
build_info: 7 | # Be careful not to append minus sign (-) before split to avoid itemizing 8 | annotations: 9 | train: 10 | path: datasets/SSv2/ssv2_label_ssv2_template 11 | val: 12 | path: datasets/SSv2/ssv2_label_ssv2_template 13 | videos: 14 | path: datasets/SSv2/video_clips/clips_downsampled_5fps_downsized_224x224 -------------------------------------------------------------------------------- /src/ClipViP/src/utils/misc.py: -------------------------------------------------------------------------------- 1 | """ 2 | modified from UNITER 3 | """ 4 | import json 5 | import random 6 | import sys 7 | 8 | import torch 9 | import numpy as np 10 | 11 | 12 | class NoOp(object): 13 | """ useful for distributed training No-Ops """ 14 | def __getattr__(self, name): 15 | return self.noop 16 | 17 | def noop(self, *args, **kwargs): 18 | return 19 | 20 | 21 | def set_random_seed(seed): 22 | random.seed(seed) 23 | np.random.seed(seed) 24 | torch.manual_seed(seed) 25 | torch.cuda.manual_seed_all(seed) 26 | 27 | 28 | def zero_none_grad(model): 29 | for p in model.parameters(): 30 | if p.grad is None and p.requires_grad: 31 | p.grad = p.data.new(p.size()).zero_() 32 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/knowledge_patcher/README.md: -------------------------------------------------------------------------------- 1 | To construct eval configs for the knowledge patcher on AcdyBench, this folder shows two examples (SSv2 action_antonym and Ego4d action_antonym), both using KP-Perceiver based on InternVideo. 2 | 3 | - To evaluate other methods such as KP-Transformer: replace the "model" section with the corresponding "model" section in `configs/train/acdybench/<dataset>/<method>.yaml`, and set the "model.pretrained" field to the corresponding trained checkpoint path. 4 | - To evaluate other tasks such as reversed_video: replace the "dataset" section with the corresponding "dataset" section in `configs/eval/acdybench/backbone/*/*_<task>.yaml`. 5 | - Set "run.output_dir" to the output location for your own setting. 6 | -------------------------------------------------------------------------------- /src/configs/datasets/downstream_tasks/downstream_tasks_qa_nextqa_224x224_5fps.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | downstream_tasks_qa_nextqa_224x224_5fps: # name of the dataset builder 3 | dataset_card: dataset_cards/downstream_task_nextqa.md 4 | data_type: videos # [images|videos|features] 5 | 6 | build_info: 7 | # Be careful not to append minus sign (-) before split to avoid itemizing 8 | annotations: 9 | train: 10 | path: datasets/NextQA/ann/nextqa_action_antonym 11 | val: 12 | path: datasets/NextQA/ann/nextqa_action_antonym 13 | test: 14 | path: datasets/NextQA/ann/nextqa_action_antonym 15 | videos: 16 | path: datasets/NextQA/video_clips/NExTVideo_downsampled_5fps_downsized_224x224 -------------------------------------------------------------------------------- /src/configs/models/patch_and_fuse_intern_video.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: patch_and_fuse_internvideo 8 | 9 | load_pretrained: True 10 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 11 | 12 | preprocess: 13 | vis_processor: 14 | train: 15 | name: "video_train" 16 | image_size: 224 17 | eval: 18 | name: "internvideo_eval" 19 | image_size: 224 20 | text_processor: 21 | train: 22 | name: "vl_dynamic_ego4d_text" 23 | eval: 24 | name: "vl_dynamic_ego4d_text" 25 | -------------------------------------------------------------------------------- /src/configs/models/patch_and_fuse_clip_vip.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: patch_and_fuse_clipvip 8 | 9 | load_pretrained: True 10 | backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 11 | 12 | 13 | preprocess: 14 | vis_processor: 15 | train: 16 | name: "video_train" 17 | image_size: 224 18 | eval: 19 | name: "internvideo_eval" 20 | image_size: 224 21 | text_processor: 22 | train: 23 | name: "vl_dynamic_ego4d_text" 24 | eval: 25 | name: "vl_dynamic_ego4d_text" 26 | -------------------------------------------------------------------------------- /src/configs/models/patch_and_fuse_singularity.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: patch_and_fuse_singularity 8 | 9 | load_pretrained: True 10 | backbone_config_yaml: "Singularity/configs/pretrained_singularity_temporal_17m.yaml" 11 | 12 | 13 | preprocess: 14 | vis_processor: 15 | train: 16 | name: "video_train" 17 | image_size: 224 18 | eval: 19 | name: "internvideo_eval" 20 | image_size: 224 21 | text_processor: 22 | train: 23 | name: "vl_dynamic_ego4d_text" 24 | eval: 25 | name: "vl_dynamic_ego4d_text" 26 | -------------------------------------------------------------------------------- /src/configs/datasets/actionbench/actionbench_ssv2_224x224_5fps.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | actionbench_ssv2_224x224_5fps: # name of the dataset builder 3 | dataset_card: dataset_cards/actionbench_ssv2.md 4 | data_type: videos # [images|videos|features] 5 | 6 | build_info: 7 | # Be careful not to append minus sign (-) before split to avoid itemizing 8 | annotations: 9 | train: 10 | path: ActionBench/ssv2/original 11 | use_templates_as_labels: false 12 | val: 13 | path: ActionBench/ssv2/original 14 | use_templates_as_labels: false 15 | test: 16 | path: ActionBench/ssv2/original 17 | use_templates_as_labels: true # This needs to be true to be loaded properly 18 | videos: 19 | path: datasets/SSv2/video_clips/clips_downsampled_5fps_downsized_224x224 -------------------------------------------------------------------------------- /dataset_cards/downstream_tasks_ssv2.md: -------------------------------------------------------------------------------- 1 | ## Instruction for Downloading Videos 2 | refer to [actionbench_ssv2.md](./actionbench_ssv2.md) 3 | 4 | ## Downstream Task: SSv2-label and SSv2-template 5 | - paper: https://arxiv.org/abs/2206.03428 6 | - train size: 168913 7 | - val size: 2088 8 | - ann_path: `datasets/SSv2/ssv2_label_ssv2_template` 9 | - format SSv2-label: 10 | ``` 11 | [ 12 | {"video": ["62211.webm"], "caption": "spinning soap that quickly stops spinning"}, 13 | ] 14 | ``` 15 | - format SSv2-template: 16 | ``` 17 | [ 18 | {"video": ["62211.webm", "63095.webm", "174825.webm", "65027.webm", "65677.webm", "37955.webm", "9741.webm", "47588.webm", "31811.webm", "155308.webm", "6483.webm", "106444.webm"], "caption": "Spinning [something] that quickly stops spinning"} 19 | ] 20 | ``` -------------------------------------------------------------------------------- /src/Singularity/configs/beit-base-patch16-224-pt22k-ft22k.json: -------------------------------------------------------------------------------- 1 | { 2 | "note": "this file is a copy of the BEiT model config, not used directly", 3 | "architectures": [ 4 | "BeitForImageClassification" 5 | ], 6 | "url": "https://huggingface.co/microsoft/beit-base-patch16-224-pt22k-ft22k/raw/main/config.json", 7 | "attention_probs_dropout_prob": 0.0, 8 | "drop_path_rate": 0.1, 9 | "hidden_act": "gelu", 10 | "hidden_dropout_prob": 0.0, 11 | "hidden_size": 768, 12 | "image_size": 224, 13 | "initializer_range": 0.02, 14 | "intermediate_size": 3072, 15 | "layer_norm_eps": 1e-12, 16 | "layer_scale_init_value": 0.1, 17 | "model_type": "beit", 18 | "num_attention_heads": 12, 19 | "num_channels": 3, 20 | "num_hidden_layers": 12, 21 | "patch_size": 16, 22 | "torch_dtype": "float32", 23 | "transformers_version":
"4.11.0.dev0", 24 | "use_absolute_position_embeddings": false, 25 | "use_mask_token": false, 26 | "use_mean_pooling": true, 27 | "use_relative_position_bias": true, 28 | "use_shared_relative_position_bias": false, 29 | "vocab_size": 8192 30 | } 31 | -------------------------------------------------------------------------------- /ActionBench/src/ignored_verbs_ego4d.json: -------------------------------------------------------------------------------- 1 | [ 2 | "puts", 3 | "moves", 4 | "looks", 5 | "places", 6 | "takes", 7 | "keeps", 8 | "keep", 9 | "turns", 10 | "adjusts", 11 | "operates", 12 | "move", 13 | "look", 14 | "place", 15 | "take", 16 | "turn", 17 | "adjust", 18 | "operate", 19 | "using", 20 | "Puts", 21 | "carries", 22 | "plays", 23 | "saw", 24 | "uses", 25 | "sits", 26 | "applies", 27 | "reads", 28 | "left", 29 | "Looks", 30 | "gives", 31 | "checks", 32 | "drives", 33 | "cooking", 34 | "inspects", 35 | "watches", 36 | "put", 37 | "hand", 38 | "carry", 39 | "play", 40 | "use", 41 | "apply", 42 | "read", 43 | "Look", 44 | "give", 45 | "interact", 46 | "check", 47 | "drive", 48 | "cook", 49 | "watch", 50 | "pass", 51 | "passes", 52 | "fail", 53 | "fails", 54 | "show", 55 | "shows", 56 | "share", 57 | "shares", 58 | "keep", 59 | "keeps", 60 | "looking", 61 | "found", 62 | "find" 63 | ] -------------------------------------------------------------------------------- /src/configs/datasets/actionbench/actionbench_ssv2_antonyms_224x224_5fps.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | actionbench_ssv2_224x224_5fps: # name of the dataset builder 3 | dataset_card: dataset_cards/actionbench_ssv2.md 4 | data_type: videos # [images|videos|features] 5 | 6 | build_info: 7 | # Be careful not to append minus sign (-) before split to avoid itemizing 8 | annotations: 9 | train: 10 | path: ActionBench/ssv2/shuffled_object_and_action_antonyms 11 | use_templates_as_labels: false 12 | state_change_filtering_json: ActionBench/ssv2/shuffled_object_and_action_antonyms/state_change_heavy_instance_filtering_train.json 13 | val: 14 | path: ActionBench/ssv2/shuffled_object_and_action_antonyms 15 | use_templates_as_labels: false 16 | state_change_filtering_json: ActionBench/ssv2/shuffled_object_and_action_antonyms/state_change_heavy_instance_filtering_val.json 17 | test: 18 | path: ActionBench/ssv2/shuffled_object_and_action_antonyms 19 | use_templates_as_labels: true # This needs to be true to be loaded properly 20 | videos: 21 | path: datasets/SSv2/video_clips/clips_downsampled_5fps_downsized_224x224 -------------------------------------------------------------------------------- /ActionBench/src/README.md: -------------------------------------------------------------------------------- 1 | # Code Description 2 | You need to install `spacy`, `nltk` and `pyinflect` for using these script. 3 | 4 | ## get action antonyms 5 | - `get_action_antonyms_ego4d_ssv2.py`: contains example code for getting altered text sentences with verbs replaced with their antonyms. 
6 | - `get_action_antonyms()` is the main function for finding action antonyms given an original text annotation; 7 | - `get_action_antonyms_ego4d()` shows an example and some comments on processing Ego4d annotations; 8 | - variable `ADDITIONAL_ANTONYYMS_MAPPING` should contain a table of semi-automatically constructed verb-antonym pairs, which can be dataset-specific. (For Ego4d, we first get a list of verbs from the provided taxonomy, ask ChatGPT to generate an antonym for each of them, and then manually clean up the results.) 9 | - variable `IGNORED_VERBS` should contain a list of verbs that do not have a good antonym; this list is manually created; 10 | - `get_action_antonyms_ssv2()` shows a similar example on SSv2 11 | 12 | ## get shuffled objects 13 | - `get_object_shuffling_*`: contains example code for getting altered text sentences with object names replaced by a random object drawn from the dataset taxonomy (Ego4d and SSv2) -------------------------------------------------------------------------------- /src/run_scripts/inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export TOKENIZERS_PARALLELISM=false 3 | 4 | ### usage examples for running inference on different tasks ### 5 | # CONFIG is a .yaml file taken from configs/projects/eval 6 | 7 | ## == inference actionbench == ## 8 | INFERENCE_TYPE="physical_knowledge_bench" 9 | CONFIG="configs/projects/eval/actionbench/backbone/internvideo/ssv2/actionbench_ssv2_internvideo_backbone__action_antonym.yaml" 10 | 11 | ## == inference nextqa == ## 12 | INFERENCE_TYPE="downstream_task_next_qa" 13 | CONFIG="configs/projects/eval/downstream_task/nextqa/backbone_zero-shot.yaml" 14 | 15 | ## == inference ssv2-label == ## 16 | INFERENCE_TYPE="downstream_task_retrieval_v2t_ssv2_label" 17 | CONFIG="configs/projects/eval/downstream_task/ssv2_label/backbone_zero-shot.yaml" 18 | 19 | ## == inference ssv2-template == ## 20 | INFERENCE_TYPE="downstream_task_retrieval_v2t_ssv2_template" 21 | CONFIG="configs/projects/eval/downstream_task/ssv2_template/backbone_zero-shot.yaml" 22 | 23 | ## == inference temporal-ssv2 == ## 24 | INFERENCE_TYPE="downstream_task_retrieval_v2t_temporal_ssv2" 25 | CONFIG="configs/projects/eval/downstream_task/temporal_ssv2/backbone_zero-shot.yaml" 26 | 27 | # GPU settings (required by the command below; values follow eval_downstream_task.sh) 28 | DEVICES=0 # comma-separated list of GPU IDs 29 | N_GPU=1 # number of GPUs to use 30 | PORT=29501 31 | 32 | # run inference 33 | CUDA_VISIBLE_DEVICES=${DEVICES} python -m torch.distributed.run \ 34 | --nproc_per_node=${N_GPU} \ 35 | --master_port=${PORT} \ 36 | inference.py --cfg-path $CONFIG --inference_type $INFERENCE_TYPE -------------------------------------------------------------------------------- /dataset_cards/actionbench_ssv2.md: -------------------------------------------------------------------------------- 1 | # Action Dynamic Benchmark (ActionBench) on SSv2 2 | 3 | ## Instruction for Downloading Videos 4 | - Download the videos from [here](https://developer.qualcomm.com/software/ai-datasets/something-something) 5 | - Put the downloaded `.webm` files into `datasets/SSv2/video_clips/clips` 6 | - Run the preprocessing script (at the root dir of this repo): 7 | ``` 8 | python src/preprocessing/ssv2/downsample_downsize_video_clips.py 9 | ``` 10 | - The resulting preprocessed video clips are stored at `datasets/SSv2/video_clips/clips_downsampled_5fps_downsized_224x224` 11 | 12 | ## Annotation Details 13 | 14 | ### Action Antonym Task & Video Reversal Task & Object Shuffle Task 15 | - train: 162,475 16 | - validation: 23,807 17 | - ann_path: `ActionBench/ssv2/shuffled_object_and_action_antonyms` 18 | - format: 19 | ``` 20 | { 21
| "label": "Spinning cube that quickly stops spinning", 22 | "template": "Spinning something that quickly stops spinning", 23 | "placeholders": [ 24 | "cube" 25 | ], 26 | "template_action_antonym_clip_text": "Spinning something that quickly starts spinning", 27 | "label_action_antonym_clip_text": "Spinning cube that quickly starts spinning", 28 | "id": 74225, 29 | "label_object_shuffled_clip_text": "spinning feeding lid that quickly stops spinning" 30 | } 31 | ``` 32 | -------------------------------------------------------------------------------- /dataset_cards/nextqa.md: -------------------------------------------------------------------------------- 1 | ## Instruction for Downloading Videos 2 | - Download the videos following the instructions [here](https://github.com/doc-doc/NExT-QA) 3 | - Put the downloaded videos to `datasets/NextQA/video_clips/NExTVideo` 4 | - Run preprocessing script (at the root dir of this repo): 5 | ``` 6 | python src/preprocessing/nextqa/downsample_downsize_video_nextqa.py 7 | ``` 8 | - The resulting preprocessed video clips are stored at `datasets/NextQA/video_clips/NExTVideo_downsampled_5fps_downsized_224x224` 9 | 10 | ## Annotation Details 11 | - train size: 34132 12 | - val size: 4996 13 | - action antonym ann_path for patcher DVDM training: `datasets/NextQA/ann/nextqa_action_antonym` 14 | - format: 15 | ``` 16 | video,frame_count,width,height,question,answer,qid,type,a0,a1,a2,a3,a4,action_antonym_choices 17 | 4010069381,369,640,480,how do the two man play the instrument,0,6,CH,roll the handle,tap their feet,strum the string,hit with sticks,pat with hand,"['unwind the handle', 'hit their feet', 'missed with sticks']" 18 | ``` 19 | - original ann_path for finetuning and evaluation: `datasets/NextQA/ann/nextqa` 20 | - format: 21 | ``` 22 | video,frame_count,width,height,question,answer,qid,type,a0,a1,a2,a3,a4 23 | 4010069381,369,640,480,how do the two man play the instrument,0,6,CH,roll the handle,tap their feet,strum the string,hit with sticks,pat with hand 24 | ``` -------------------------------------------------------------------------------- /ActionBench/src/additional_antonyms_mapping_ssv2.json: -------------------------------------------------------------------------------- 1 | { 2 | "approaching": "moving away from", 3 | "attaching": "detaching", 4 | "bending": "straightening", 5 | "burying": "unearthing", 6 | "closing": "opening", 7 | "covering": "uncovering", 8 | "digging": "filling", 9 | "dropping": "picking up", 10 | "failing": "succeeding", 11 | "folding": "unfolding", 12 | "holding": "releasing", 13 | "laying": "picking up", 14 | "letting": "preventing", 15 | "lifting": "lowering", 16 | "towards": "away from", 17 | "opening": "closing", 18 | "picking": "putting", 19 | "piling": "scattering", 20 | "plugging": "removing", 21 | "pouring": "filling", 22 | "trying": "succeeding", 23 | "tearing": "mending", 24 | "close": "open", 25 | "open": "close", 26 | "pour": "fill", 27 | "scoop": "fill", 28 | "spread": "gather", 29 | "sprinkle": "dump", 30 | "squeeze": "expand", 31 | "take": "return", 32 | "throw": "catch", 33 | "pulling": "pushing", 34 | "pushing": "pulling", 35 | "removing": "placing", 36 | "rolling": "halting", 37 | "scooping": "filling", 38 | "showing": "concealing", 39 | "falling": "ascending", 40 | "spilling": "collecting", 41 | "squeezing": "expanding", 42 | "stuffing": "emptying", 43 | "throwing": "catching", 44 | "tilting": "righting", 45 | "tipping": "righting", 46 | "turning": "straightening", 47 | "wiping": "spilling", 48 | 
"twisting": "untwisting", 49 | "uncovering": "covering", 50 | "unfolding": "folding" 51 | } -------------------------------------------------------------------------------- /src/preprocessing/ego4d/downsample_downsize_video_clips.py: -------------------------------------------------------------------------------- 1 | # modified from EgoVLP https://github.com/showlab/EgoVLP/blob/main/utils/video_resize.py 2 | 3 | import os 4 | import time 5 | import sys 6 | import subprocess 7 | from multiprocessing import Pool, Value 8 | 9 | image_size = 224 10 | fps = 5 11 | 12 | original_clips = 'datasets/Ego4D/video_clips/clips' 13 | output_dir = f'datasets/Ego4D/video_clips/clips_downsampled_{fps}fps_downsized_{image_size}x{image_size}' 14 | 15 | os.makedirs(output_dir, exist_ok=True) 16 | 17 | def videos_resize(videoinfos): 18 | global count 19 | 20 | videoidx, videoname = videoinfos 21 | 22 | if os.path.exists(os.path.join(output_dir, videoname)): 23 | print(f'{videoname} already exists.') 24 | return 25 | 26 | inname = original_clips + '/' + videoname 27 | outname = output_dir + '/' + videoname 28 | 29 | # cmd = "ffmpeg -y -i {} -filter:v scale=\"trunc(oh*a/2)*2:256\" -c:a copy {}".format(inname, outname) 30 | cmd = f"ffmpeg -loglevel info -y -i {inname} -filter:v scale={image_size}:{image_size},fps={fps} -c:a copy {outname}" 31 | subprocess.call(cmd, shell=True) 32 | 33 | return 34 | 35 | 36 | if __name__ == "__main__": 37 | 38 | file_list = [] 39 | mp4_list = [item for item in os.listdir(original_clips) if item.endswith('.mp4')] # load mp4 files 40 | 41 | for idx, video in enumerate(mp4_list): 42 | file_list.append([idx, video]) 43 | 44 | print(file_list) 45 | print(len(file_list)) 46 | 47 | pool = Pool(8) 48 | pool.map(videos_resize, tuple(file_list)) -------------------------------------------------------------------------------- /src/configs/datasets/actionbench/actionbench_ego4d_224x224_5fps.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | actionbench_ego4d_224x224_5fps: # name of the dataset builder 3 | dataset_card: dataset_cards/actionbench_ego4d.md 4 | data_type: videos # [images|videos|features] 5 | 6 | build_info: 7 | # Be careful not to append minus sign (-) before split to avoid itemizing 8 | annotations: 9 | train: 10 | url: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/train.jsonl 11 | storage: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/train.jsonl 12 | state_change_filtering_json: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/state_change_heavy_instance_filtering_train.json 13 | val: 14 | url: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/val.jsonl 15 | storage: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/val.jsonl 16 | state_change_filtering_json: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/state_change_heavy_instance_filtering_val.json 17 | test: 18 | url: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/test.jsonl 19 | storage: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/test.jsonl 20 | state_change_filtering_json: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/state_change_heavy_instance_filtering_test.json 21 | videos: 22 | storage: datasets/Ego4D/video_clips/clips_downsampled_5fps_downsized_224x224 -------------------------------------------------------------------------------- 
/dataset_cards/downstream_tasks_temporal.md: -------------------------------------------------------------------------------- 1 | ## Instruction for Downloading Videos 2 | - **SSv2 videos**: refer to [actionbench_ssv2.md](./actionbench_ssv2.md). 3 | - **Kinetics400 videos**: download the subset of Kinetic400 that are required in Temporal-Kinetics 4 | - Install `yt-dlp` following the instructions [here](https://github.com/yt-dlp/yt-dlp.git) 5 | - Download the required videos using our provided script: 6 | ``` 7 | cd datasets/Temporal/ann 8 | bash download_kinetic_videos_yt_dlp.sh 9 | ``` 10 | - Put the downloaded videos into `datasets/Temporal/video_clips/kinetics400/clips` 11 | - Run preprocessing script (at the root dir of this repo): 12 | ``` 13 | python src/preprocessing/kinetics/downsample_downsize_video_clips.py 14 | ``` 15 | - The resulting preprocessed video clips are stored at `datasets/Temporal/video_clips/kinetics400/clips_downsampled_5fps_downsized_224x224` 16 | 17 | ## Annotation Details 18 | - paper: https://arxiv.org/abs/2301.02074 19 | - Temporal-kinetics size: 1309 | 32 action texts 20 | - Temporal-ssv2 size: 864 | 18 action texts 21 | - ann_path: `datasets/Temporal/ann/val-v1.0-2.4k.csv` 22 | - format: 23 | ``` 24 | ,index,video_id,text,dataset 25 | 4153,2561,169724,Approaching [something] with your camera,SSv2 26 | ... 27 | 188,2281,cartwheeling/RUNwB3-Qxqg_000007_000017,cartwheeling,kinetics 28 | ... 29 | ```` 30 | 31 | ## Video directory 32 | - Temporal-kinetics: `datasets/Temporal/video_clips/kinetics400/clips_downsampled_5fps_downsized_224x224` 33 | - Temporal-ssv2: `datasets/ssv2/video_clips/clips_downsampled_5fps_downsized_224x224` -------------------------------------------------------------------------------- /src/configs/datasets/actionbench/actionbench_ego4d_object_shuffled_224x224_5fps.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | actionbench_ego4d_224x224_5fps: # name of the dataset builder 3 | type: "object_shuffled" 4 | dataset_card: dataset_cards/actionbench_ego4d.md 5 | data_type: videos # [images|videos|features] 6 | 7 | build_info: 8 | # Be careful not to append minus sign (-) before split to avoid itemizing 9 | annotations: 10 | train: 11 | url: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/train.jsonl 12 | storage: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/train.jsonl 13 | state_change_filtering_json: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/state_change_heavy_instance_filtering_train.json 14 | val: 15 | url: ActionBench/ego4d/egoclip_subset_action_antonyms_object_shuffled_train_val_test_split/val.jsonl 16 | storage: ActionBench/ego4d/egoclip_subset_action_antonyms_object_shuffled_train_val_test_split/val.jsonl 17 | state_change_filtering_json: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/state_change_heavy_instance_filtering_val.json 18 | test: 19 | url: ActionBench/ego4d/egoclip_subset_action_antonyms_object_shuffled_train_val_test_split/test.jsonl 20 | storage: ActionBench/ego4d/egoclip_subset_action_antonyms_object_shuffled_train_val_test_split/test.jsonl 21 | state_change_filtering_json: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/state_change_heavy_instance_filtering_test.json 22 | videos: 23 | storage: datasets/Ego4D/video_clips/clips_downsampled_5fps_downsized_224x224 -------------------------------------------------------------------------------- 
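The `downstream_tasks_temporal.md` card above describes a single annotation CSV (`val-v1.0-2.4k.csv`) whose `dataset` column distinguishes the Temporal-SSv2 and Temporal-Kinetics subsets, each paired with its own preprocessed video directory. As a rough illustration of how that CSV might be consumed, here is a short sketch; the helper name, the `.mp4` extension, and the assumption that preprocessed kinetics clips keep their `class/clip_id` sub-paths are inferred rather than taken from the repo's dataset builders.

```python
import os
import pandas as pd

ANN_CSV = "datasets/Temporal/ann/val-v1.0-2.4k.csv"
# Video directories listed in the dataset card / downstream_tasks_temporal yaml.
VIDEO_DIRS = {
    "SSv2": "datasets/SSv2/video_clips/clips_downsampled_5fps_downsized_224x224",
    "kinetics": "datasets/Temporal/video_clips/kinetics400/clips_downsampled_5fps_downsized_224x224",
}

def load_temporal_subset(subset):
    """Return (video_path, action_text) pairs for one subset: 'SSv2' or 'kinetics'."""
    df = pd.read_csv(ANN_CSV, index_col=0)  # the leading unnamed column is just a row index
    df = df[df["dataset"] == subset]
    pairs = []
    for _, row in df.iterrows():
        # Preprocessing re-encodes every clip to .mp4 (see downsample_downsize_video_clips.py),
        # so "169724" -> "169724.mp4"; the exact layout of the kinetics clips is an assumption.
        video_path = os.path.join(VIDEO_DIRS[subset], f"{row['video_id']}.mp4")
        pairs.append((video_path, row["text"]))
    return pairs

ssv2_pairs = load_temporal_subset("SSv2")          # 864 clips, 18 unique action texts per the card
kinetics_pairs = load_temporal_subset("kinetics")  # 1309 clips, 32 unique action texts
```

In the eval configs, these subsets are selected via `subset: ssv2` / `subset: kinetics` under the `downstream_tasks_temporal` dataset with `task: v1.0_2.4k`.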
/src/preprocessing/ssv2/downsample_downsize_video_clips.py: -------------------------------------------------------------------------------- 1 | # modified from EgoVLP https://github.com/showlab/EgoVLP/blob/main/utils/video_resize.py 2 | # Downsamples, downsizes, and converts to mp4 3 | 4 | import os 5 | import subprocess 6 | from multiprocessing import Pool 7 | from tqdm import tqdm 8 | from functools import partial 9 | 10 | image_size = 224 11 | fps = 5 12 | 13 | 14 | original_clips = 'datasets/SSv2/video_clips/clips' 15 | output_dir = f'datasets/SSv2/video_clips/clips_downsampled_{fps}fps_downsized_{image_size}x{image_size}' 16 | 17 | def resize_video(videoname, suppress_stdout=False, suppress_stderr=False): 18 | if os.path.exists(os.path.join(output_dir, videoname)): 19 | print(f'{videoname} already exists.') 20 | return 21 | 22 | inname = original_clips + '/' + videoname 23 | outname = output_dir + '/' + f'{videoname.split(".")[0]}.mp4' 24 | 25 | cmd = f"ffmpeg -loglevel info -y -i {inname} -filter:v scale={image_size}:{image_size},fps={fps} -c:a copy {outname}" 26 | 27 | kwargs = {} 28 | if suppress_stdout: 29 | kwargs['stdout'] = subprocess.DEVNULL 30 | if suppress_stderr: 31 | kwargs['stderr'] = subprocess.DEVNULL 32 | 33 | subprocess.run(cmd, shell=True, **kwargs) 34 | 35 | return 36 | 37 | if __name__ == "__main__": 38 | suppress_stdout = True 39 | suppress_stderr = True 40 | num_proc = 10 41 | 42 | os.makedirs(output_dir, exist_ok=True) 43 | webm_list = [item for item in os.listdir(original_clips) if item.endswith('.webm')] # load webm files 44 | print('Total files to consider:', len(webm_list)) 45 | 46 | resizer = partial(resize_video, suppress_stdout=suppress_stdout, suppress_stderr=suppress_stderr) 47 | for _ in tqdm(Pool(num_proc).imap_unordered(resizer, webm_list), total=len(webm_list)): 48 | pass -------------------------------------------------------------------------------- /src/_get_model_computational_complexity.py: -------------------------------------------------------------------------------- 1 | import torchvision.models as models 2 | import torch 3 | import torch.nn as nn 4 | 5 | from ptflops import get_model_complexity_info 6 | 7 | import InternVideo 8 | from models import ( 9 | PatchAndFuseInternVideo, 10 | KnowledgePatcherInternVideo_Baseline_Simple 11 | ) 12 | def set_up_device(gpu_index): 13 | # single gpu 14 | if torch.cuda.is_available() and gpu_index != -1: 15 | dev = f"cuda:{gpu_index}" 16 | else: 17 | dev = "cpu" 18 | return torch.device(dev) 19 | 20 | class Wrapper(nn.Module): 21 | def __init__(self, module) -> None: 22 | super(Wrapper, self).__init__() 23 | self.net = module 24 | def forward(self, x): 25 | return self.net.encode_video(x) 26 | 27 | with torch.cuda.device(1): 28 | # device = set_up_device(gpu_index=3) 29 | 30 | for model_name in [ 31 | "patch_and_fuse_internvideo", 32 | "patch_and_fuse_internvideo_baseline_simple" 33 | ]: 34 | 35 | print("model_name:",model_name) 36 | 37 | model_type = "InternVideo-MM-L-14" 38 | print("model_type:",model_type) 39 | 40 | # load_model 41 | if model_name == "patch_and_fuse_internvideo": 42 | module = PatchAndFuseInternVideo.from_pretrained(model_type=model_type) 43 | elif model_name == "patch_and_fuse_internvideo_baseline_simple": 44 | module = KnowledgePatcherInternVideo_Baseline_Simple.from_pretrained(model_type=model_type) 45 | 46 | model = Wrapper(module) 47 | 48 | macs, params = get_model_complexity_info(model, (8,3,224,224), as_strings=True, 49 | print_per_layer_stat=True, verbose=True) 50 | 
print('{:<30} {:<8}'.format('Computational complexity: ', macs)) 51 | print('{:<30} {:<8}'.format('Number of parameters: ', params)) -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/nextqa/backbone_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo_mcqa 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | 7 | datasets: 8 | downstream_tasks_qa_nextqa_224x224_5fps: 9 | type: "default" 10 | vis_processor: 11 | train: 12 | name: "video_train" 13 | image_size: 224 14 | eval: 15 | name: "internvideo_eval" 16 | image_size: 224 17 | text_processor: 18 | train: 19 | name: "minimum_text" 20 | eval: 21 | name: "minimum_text" 22 | 23 | # IMPORTANT configs: 24 | task: 5way-multiple-choice-qa # 25 | neg_sampling_same_clip: 0 26 | eval_only: True 27 | 28 | # other arguements 29 | train_k: null # sample a subset of k instances 30 | eval_k: null # sample a subset of k instances, reduce evaluation time 31 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 32 | num_frm: 8 33 | train_frame_height: 224 34 | train_frame_width: 224 35 | eval_frame_height: 224 36 | eval_frame_width: 224 37 | 38 | run: 39 | # use custom runner 40 | runner: runner_base_patch_and_fuse 41 | 42 | # task object name 43 | task: downstream_tasks_multi_choice_qa 44 | 45 | # which module is used for inference ["backbone", "knowledge_patcher"] 46 | eval_module: backbone 47 | eval_task: 5way-multiple-choice-qa 48 | 49 | batch_size_train: 32 50 | batch_size_eval: 8 51 | num_workers: 4 52 | 53 | seed: 42 54 | output_dir: "output/downstream_tasks/NextQA/eval/backbone_zero-shot" 55 | 56 | amp: False 57 | resume_ckpt_path: null 58 | 59 | evaluate: True 60 | 61 | # train_splits: ["train"] 62 | valid_splits: ["val"] 63 | test_splits: ["test"] 64 | 65 | device: "cuda" 66 | world_size: 1 67 | dist_url: "env://" 68 | distributed: True 69 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/temporal_ssv2/backbone_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | 8 | datasets: 9 | downstream_tasks_temporal: 10 | type: "default" 11 | vis_processor: 12 | train: 13 | name: "video_train" 14 | image_size: 224 15 | eval: 16 | name: "internvideo_eval" 17 | image_size: 224 18 | text_processor: 19 | train: 20 | name: "minimum_text" 21 | eval: 22 | name: "minimum_text" 23 | 24 | # IMPORTANT configs: 25 | fps: 5 26 | task: v1.0_2.4k 27 | subset: ssv2 28 | neg_sampling_same_clip: 0 29 | eval_only: True 30 | 31 | # other arguements 32 | train_k: null # sample a subset of k instances 33 | eval_k: null # sample a subset of k instances, reduce evaluation time 34 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 35 | num_frm: 8 36 | train_frame_height: 224 37 | train_frame_width: 224 38 | eval_frame_height: 224 39 | eval_frame_width: 224 40 | 41 | run: 42 | # use custom runner 43 | runner: runner_base_patch_and_fuse 44 | 45 | # task object name 46 | task: downstream_tasks_retrieval 47 | 48 | # which module is used for 
inference ["backbone", "knowledge_patcher"] 49 | eval_module: backbone 50 | eval_task: v1.0_2.4k 51 | 52 | batch_size_train: 32 53 | batch_size_eval: 4 54 | num_workers: 4 55 | 56 | seed: 42 57 | output_dir: "output/downstream_tasks/temporal_ssv2/backbone_zero-shot" 58 | 59 | amp: False 60 | resume_ckpt_path: null 61 | 62 | evaluate: True 63 | 64 | # train_splits: ["train"] 65 | valid_splits: ["val"] 66 | # test_splits: ["test"] 67 | 68 | device: "cuda" 69 | world_size: 1 70 | dist_url: "env://" 71 | distributed: True 72 | -------------------------------------------------------------------------------- /src/preprocessing/kinetics/downsample_downsize_video_clips.py: -------------------------------------------------------------------------------- 1 | # modified from EgoVLP https://github.com/showlab/EgoVLP/blob/main/utils/video_resize.py 2 | # Downsamples, downsizes, and converts to mp4 3 | 4 | import os 5 | import subprocess 6 | from multiprocessing import Pool 7 | from tqdm import tqdm 8 | from functools import partial 9 | 10 | image_size = 224 11 | fps = 5 12 | 13 | # original_clips = '' 14 | # output_dir = f'/clips_downsampled_{fps}fps_downsized_{image_size}x{image_size}' 15 | 16 | original_clips = 'datasets/Temporal/video_clips/kinetics400/clips' 17 | output_dir = f'datasets/Temporal/video_clips/kinetics400/clips_downsampled_{fps}fps_downsized_{image_size}x{image_size}' 18 | 19 | def resize_video(videoname, suppress_stdout=False, suppress_stderr=False): 20 | if os.path.exists(os.path.join(output_dir, videoname)): 21 | print(f'{videoname} already exists.') 22 | return 23 | 24 | inname = original_clips + '/' + videoname 25 | outname = output_dir + '/' + f'{videoname.split(".")[0]}.mp4' 26 | 27 | cmd = f"ffmpeg -loglevel info -y -i {inname} -filter:v scale={image_size}:{image_size},fps={fps} -c:a copy {outname}" 28 | 29 | kwargs = {} 30 | if suppress_stdout: 31 | kwargs['stdout'] = subprocess.DEVNULL 32 | if suppress_stderr: 33 | kwargs['stderr'] = subprocess.DEVNULL 34 | 35 | subprocess.run(cmd, shell=True, **kwargs) 36 | 37 | return 38 | 39 | if __name__ == "__main__": 40 | suppress_stdout = True 41 | suppress_stderr = True 42 | num_proc = 10 43 | 44 | os.makedirs(output_dir, exist_ok=True) 45 | mp4_list = [item for item in os.listdir(original_clips) if item.endswith('.mp4')] # load original mp4 files 46 | print('Total files to consider:', len(mp4_list)) 47 | 48 | resizer = partial(resize_video, suppress_stdout=suppress_stdout, suppress_stderr=suppress_stderr) 49 | for _ in tqdm(Pool(num_proc).imap_unordered(resizer, mp4_list), total=len(mp4_list)): 50 | pass -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/ssv2_label/backbone_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | 8 | datasets: 9 | downstream_tasks_retrieval_ssv2_224x224_5fps: 10 | type: "default" 11 | vis_processor: 12 | train: 13 | name: "video_train" 14 | image_size: 224 15 | eval: 16 | name: "internvideo_eval" 17 | image_size: 224 18 | text_processor: 19 | train: 20 | name: "minimum_text" 21 | eval: 22 | name: "minimum_text" 23 | 24 | # IMPORTANT configs: 25 | fps: 5 # if downsampled, use 5 fps 26 | task: ssv2_label # ssv2_label, ssv2_template 27 | neg_sampling_same_clip: 0 28 | eval_only: True 29 | 30 | # 
other arguements 31 | train_k: null # sample a subset of k instances 32 | eval_k: null # sample a subset of k instances, reduce evaluation time 33 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 34 | num_frm: 8 35 | train_frame_height: 224 36 | train_frame_width: 224 37 | eval_frame_height: 224 38 | eval_frame_width: 224 39 | 40 | run: 41 | # use custom runner 42 | runner: runner_base_patch_and_fuse 43 | 44 | # task object name 45 | task: downstream_tasks_retrieval 46 | 47 | # which module is used for inference ["backbone", "knowledge_patcher"] 48 | eval_module: backbone 49 | eval_task: ssv2_label 50 | 51 | batch_size_train: 32 52 | batch_size_eval: 4 53 | num_workers: 4 54 | 55 | seed: 42 56 | output_dir: "output/downstream_tasks/ssv2_label/backbone_zero-shot" 57 | 58 | amp: False 59 | resume_ckpt_path: null 60 | 61 | evaluate: True 62 | 63 | # train_splits: ["train"] 64 | valid_splits: ["val"] 65 | # test_splits: ["test"] 66 | 67 | device: "cuda" 68 | world_size: 1 69 | dist_url: "env://" 70 | distributed: True 71 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/ssv2_template/backbone_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | 8 | datasets: 9 | downstream_tasks_retrieval_ssv2_224x224_5fps: 10 | type: "default" 11 | vis_processor: 12 | train: 13 | name: "video_train" 14 | image_size: 224 15 | eval: 16 | name: "internvideo_eval" 17 | image_size: 224 18 | text_processor: 19 | train: 20 | name: "minimum_text" 21 | eval: 22 | name: "minimum_text" 23 | 24 | # IMPORTANT configs: 25 | fps: 5 # if downsampled, use 5 fps 26 | task: ssv2_template # ssv2_label, ssv2_template 27 | neg_sampling_same_clip: 0 28 | eval_only: True 29 | 30 | # other arguements 31 | train_k: null # sample a subset of k instances 32 | eval_k: null # sample a subset of k instances, reduce evaluation time 33 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 34 | num_frm: 8 35 | train_frame_height: 224 36 | train_frame_width: 224 37 | eval_frame_height: 224 38 | eval_frame_width: 224 39 | 40 | run: 41 | # use custom runner 42 | runner: runner_base_patch_and_fuse 43 | 44 | # task object name 45 | task: downstream_tasks_retrieval 46 | 47 | # which module is used for inference ["backbone", "knowledge_patcher"] 48 | eval_module: backbone 49 | eval_task: ssv2_template 50 | 51 | batch_size_train: 32 52 | batch_size_eval: 4 53 | num_workers: 4 54 | 55 | seed: 42 56 | output_dir: "output/downstream_tasks/ssv2_template/backbone_zero-shot" 57 | 58 | amp: False 59 | resume_ckpt_path: null 60 | 61 | evaluate: True 62 | 63 | # train_splits: ["train"] 64 | valid_splits: ["val"] 65 | # test_splits: ["test"] 66 | 67 | device: "cuda" 68 | world_size: 1 69 | dist_url: "env://" 70 | distributed: True 71 | -------------------------------------------------------------------------------- /ActionBench/src/get_object_shuffling_ssv2.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | from nltk.corpus import wordnet 5 | import nltk 6 | import spacy 7 | import pyinflect 8 | from tqdm import tqdm 9 | import random 10 | 11 | def 
get_object_shuffling_ssv2(output_path = None): 12 | ann_path = "../ssv2/antonyms/validation.json" 13 | assert output_path is not None 14 | object_taxonomy = json.load(open("../ssv2/object_taxonomy.json")) 15 | print(len(object_taxonomy)) 16 | 17 | val_annotations = json.load(open(ann_path)) 18 | 19 | output_annotations = [] 20 | for item in tqdm(val_annotations): 21 | orig_objects = item['placeholders'] 22 | 23 | cand_object_taxonomy = object_taxonomy.copy() 24 | for orig in orig_objects: 25 | if orig in cand_object_taxonomy: 26 | cand_object_taxonomy.remove(orig) 27 | 28 | cand_objects = random.sample(cand_object_taxonomy, len(orig_objects)) 29 | 30 | object_shuffled_text = item['label'].lower() 31 | for i, orig in enumerate(orig_objects): 32 | orig = orig.lower() 33 | # Find the index of the first occurrence of the substring 34 | index = object_shuffled_text.find(orig) 35 | assert index != -1 36 | # Replace the first occurrence of the substring with a new string 37 | object_shuffled_text = object_shuffled_text[:index] + cand_objects[i] + object_shuffled_text[index+len(orig):] 38 | 39 | assert object_shuffled_text != item['label'] 40 | item['label_object_shuffled_clip_text'] = object_shuffled_text 41 | 42 | output_annotations.append(item) 43 | 44 | assert len(output_annotations) == len(val_annotations) 45 | with open(output_path, 'w') as o: 46 | json.dump(output_annotations, o, indent=4) 47 | 48 | 49 | 50 | if __name__ == "__main__": 51 | 52 | random.seed(42) 53 | get_object_shuffling_ssv2(output_path = "") -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/moments_in_time/backbone_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | 7 | datasets: 8 | downstream_tasks_moment_in_time: 9 | type: "default" 10 | vis_processor: 11 | train: 12 | name: "video_train" 13 | image_size: 224 14 | eval: 15 | name: "internvideo_eval" 16 | image_size: 224 17 | text_processor: 18 | train: 19 | name: "minimum_text" 20 | eval: 21 | name: "minimum_text" 22 | 23 | # IMPORTANT configs: 24 | task: video_action_retrieval_2k 25 | neg_sampling_same_clip: 0 26 | eval_only: True 27 | 28 | # other arguements 29 | train_k: null # sample a subset of k instances 30 | eval_k: null # sample a subset of 3000 instances, reduce evaluation time 31 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 32 | num_frm: 8 33 | train_frame_height: 224 34 | train_frame_width: 224 35 | eval_frame_height: 224 36 | eval_frame_width: 224 37 | 38 | run: 39 | # use custom runner 40 | runner: runner_base_patch_and_fuse 41 | 42 | # task object name 43 | task: downstream_tasks_retrieval 44 | 45 | # which module is used for inference ["backbone", "knowledge_patcher"] 46 | eval_module: backbone 47 | eval_task: video_action_retrieval_2k 48 | 49 | ## NOTE: uncomment the following to use Backbone Ensemble 50 | # eval_method: ensemble_with_backbone 51 | 52 | batch_size_train: 32 53 | batch_size_eval: 4 54 | num_workers: 4 55 | 56 | seed: 42 57 | output_dir: "output/downstream_tasks/MomentsInTime/eval/backbone_zero-shot" 58 | 59 | amp: False 60 | resume_ckpt_path: null 61 | 62 | evaluate: True 63 | 64 | # train_splits: ["train"] 65 | valid_splits: ["val"] 66 | # test_splits: ["test"] 67 | 68 | device: "cuda" 
69 | world_size: 1 70 | dist_url: "env://" 71 | distributed: True 72 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/temporal_kinetics/backbone_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | 7 | datasets: 8 | downstream_tasks_temporal: 9 | type: "default" 10 | vis_processor: 11 | train: 12 | name: "video_train" 13 | image_size: 224 14 | eval: 15 | name: "internvideo_eval" 16 | image_size: 224 17 | text_processor: 18 | train: 19 | name: "minimum_text" 20 | eval: 21 | name: "minimum_text" 22 | 23 | # IMPORTANT configs: 24 | fps: 5 25 | task: v1.0_2.4k 26 | subset: kinetics 27 | neg_sampling_same_clip: 0 28 | eval_only: True 29 | 30 | # other arguements 31 | train_k: null # sample a subset of k instances 32 | eval_k: null # sample a subset of 3000 instances, reduce evaluation time 33 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 34 | num_frm: 8 35 | train_frame_height: 224 36 | train_frame_width: 224 37 | eval_frame_height: 224 38 | eval_frame_width: 224 39 | 40 | run: 41 | # use custom runner 42 | runner: runner_base_patch_and_fuse 43 | 44 | # task object name 45 | task: downstream_tasks_retrieval 46 | 47 | # which module is used for inference ["backbone", "knowledge_patcher"] 48 | eval_module: backbone 49 | eval_task: v1.0_2.4k 50 | 51 | ## NOTE: uncomment the following to use Backbone Ensemble 52 | # eval_method: ensemble_with_backbone 53 | 54 | batch_size_train: 32 55 | batch_size_eval: 4 56 | num_workers: 4 57 | 58 | seed: 42 59 | output_dir: "output/downstream_tasks/temporal-kinetics/eval/backbone_zero-shot" 60 | 61 | amp: False 62 | resume_ckpt_path: null 63 | 64 | evaluate: True 65 | 66 | # train_splits: ["train"] 67 | valid_splits: ["val"] 68 | # test_splits: ["test"] 69 | 70 | device: "cuda" 71 | world_size: 1 72 | dist_url: "env://" 73 | distributed: True 74 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/internvideo/ego4d/acdybench_ego4d_internvideo_backbone__action_antonym.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_internvideo 5 | model_type: InternVideo-MM-L-14 6 | load_pretrained: True 7 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 8 | 9 | 10 | datasets: 11 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 12 | vis_processor: 13 | train: 14 | name: "video_train" 15 | image_size: 224 16 | eval: 17 | name: "internvideo_eval" 18 | image_size: 224 19 | text_processor: 20 | train: 21 | name: "vl_dynamic_ego4d_text" 22 | eval: 23 | name: "vl_dynamic_ego4d_text" 24 | 25 | # IMPORTANT configs: 26 | fps: 5 # if downsampled, use 5 fps 27 | task: "action_antonym" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 28 | neg_sampling_same_clip: 0 # evaluation set to 0 29 | eval_only: True 30 | 31 | # other arguements 32 | k: null # sample a subset of k instances 33 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 34 | num_frm: 8 35 | 
train_frame_height: 224 36 | train_frame_width: 224 37 | eval_frame_height: 224 38 | eval_frame_width: 224 39 | 40 | run: 41 | # use custom runner 42 | runner: runner_base_patch_and_fuse 43 | 44 | # task object name 45 | task: actionbench 46 | 47 | # which module is used for inference ["backbone", "knowledge_patcher"] 48 | eval_module: backbone 49 | eval_task: action_antonym 50 | 51 | batch_size_train: 32 52 | batch_size_eval: 4 53 | num_workers: 4 54 | 55 | seed: 42 56 | output_dir: "output/actionbench/eval/InternVideo/ego4d__InternVideo_backbone__action_antonym" 57 | 58 | amp: False 59 | resume_ckpt_path: null 60 | 61 | evaluate: True 62 | # train_splits: ["train"] 63 | # valid_splits: ["val"] 64 | test_splits: ["test"] 65 | 66 | device: "cuda" 67 | world_size: 1 68 | dist_url: "env://" 69 | distributed: True 70 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/internvideo/ego4d/acdybench_ego4d_internvideo_backbone__reversed_video.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | 4 | model: 5 | arch: patch_and_fuse_internvideo 6 | model_type: InternVideo-MM-L-14 7 | load_pretrained: True 8 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 9 | 10 | 11 | 12 | datasets: 13 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 14 | vis_processor: 15 | train: 16 | name: "video_train" 17 | image_size: 224 18 | eval: 19 | name: "internvideo_eval" 20 | image_size: 224 21 | text_processor: 22 | train: 23 | name: "vl_dynamic_ego4d_text" 24 | eval: 25 | name: "vl_dynamic_ego4d_text" 26 | 27 | # IMPORTANT configs: 28 | fps: 5 # if downsampled, use 5 fps 29 | task: "reversed_video" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 30 | neg_sampling_same_clip: 0 # evaluation set to 0 31 | eval_only: True 32 | 33 | # other arguements 34 | k: null # sample a subset of k instances 35 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 36 | num_frm: 8 37 | train_frame_height: 224 38 | train_frame_width: 224 39 | eval_frame_height: 224 40 | eval_frame_width: 224 41 | 42 | run: 43 | # use custom runner 44 | runner: runner_base_patch_and_fuse 45 | 46 | # task object name 47 | task: actionbench 48 | 49 | # which module is used for inference ["backbone", "knowledge_patcher"] 50 | eval_module: backbone 51 | eval_task: reversed_video 52 | 53 | batch_size_train: 32 54 | batch_size_eval: 4 55 | num_workers: 4 56 | 57 | seed: 42 58 | output_dir: "output/actionbench/eval/InternVideo/ego4d__InternVideo_backbone__reversed_video" 59 | 60 | amp: False 61 | resume_ckpt_path: null 62 | 63 | evaluate: True 64 | # train_splits: ["train"] 65 | # valid_splits: ["val"] 66 | test_splits: ["test"] 67 | 68 | device: "cuda" 69 | world_size: 1 70 | dist_url: "env://" 71 | distributed: True 72 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/internvideo/ssv2/acdybench_ssv2_internvideo_backbone__action_antonym.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_internvideo 5 | model_type: InternVideo-MM-L-14 6 | load_pretrained: True 7 | backbone_pretrained: 
"pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 8 | 9 | 10 | datasets: 11 | actionbench_ssv2_224x224_5fps: 12 | type: "action_antonyms_and_object_shuffled" 13 | vis_processor: 14 | train: 15 | name: "video_train" 16 | image_size: 224 17 | eval: 18 | name: "internvideo_eval" 19 | image_size: 224 20 | text_processor: 21 | train: 22 | name: "minimum_text" 23 | eval: 24 | name: "minimum_text" 25 | 26 | # IMPORTANT configs: 27 | fps: 5 # if downsampled, use 5 fps 28 | task: "action_antonym" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 29 | neg_sampling_same_clip: 0 30 | eval_only: True 31 | 32 | # other arguements 33 | train_k: null # sample a subset of k instances 34 | eval_k: null # sample a subset of k instances, reduce evaluation time 35 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 36 | num_frm: 8 37 | train_frame_height: 224 38 | train_frame_width: 224 39 | eval_frame_height: 224 40 | eval_frame_width: 224 41 | 42 | run: 43 | # use custom runner 44 | runner: runner_base_patch_and_fuse 45 | 46 | # task object name 47 | task: actionbench 48 | 49 | # which module is used for inference ["backbone", "knowledge_patcher"] 50 | eval_module: backbone 51 | eval_task: action_antonym 52 | 53 | batch_size_train: 32 54 | batch_size_eval: 4 55 | num_workers: 4 56 | 57 | seed: 42 58 | output_dir: "output/actionbench/eval/InternVideo/ssv2__InternVideo_backbone__action_antonym" 59 | 60 | amp: False 61 | resume_ckpt_path: null 62 | 63 | evaluate: True 64 | # train_splits: ["train"] 65 | valid_splits: ["val"] 66 | # test_splits: ["test"] 67 | 68 | device: "cuda" 69 | world_size: 1 70 | dist_url: "env://" 71 | distributed: True 72 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/internvideo/ssv2/acdybench_ssv2_internvideo_backbone__reversed_video.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | 4 | model: 5 | arch: patch_and_fuse_internvideo 6 | model_type: InternVideo-MM-L-14 7 | load_pretrained: True 8 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 9 | 10 | 11 | datasets: 12 | actionbench_ssv2_224x224_5fps: 13 | type: "action_antonyms_and_object_shuffled" 14 | vis_processor: 15 | train: 16 | name: "video_train" 17 | image_size: 224 18 | eval: 19 | name: "internvideo_eval" 20 | image_size: 224 21 | text_processor: 22 | train: 23 | name: "minimum_text" 24 | eval: 25 | name: "minimum_text" 26 | 27 | # IMPORTANT configs: 28 | fps: 5 # if downsampled, use 5 fps 29 | task: "reversed_video" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 30 | neg_sampling_same_clip: 0 31 | eval_only: True 32 | 33 | # other arguements 34 | train_k: null # sample a subset of k instances 35 | eval_k: null # sample a subset of k instances, reduce evaluation time 36 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 37 | num_frm: 8 38 | train_frame_height: 224 39 | train_frame_width: 224 40 | eval_frame_height: 224 41 | eval_frame_width: 224 42 | 43 | run: 44 | # use custom runner 45 | runner: runner_base_patch_and_fuse 46 | 47 | # task object name 48 | task: actionbench 49 | 50 | # which module is used for inference ["backbone", "knowledge_patcher"] 51 | eval_module: backbone 52 | eval_task: reversed_video 53 | 54 | batch_size_train: 32 55 | 
batch_size_eval: 4 56 | num_workers: 4 57 | 58 | seed: 42 59 | output_dir: "output/actionbench/eval/InternVideo/ssv2__InternVideo_backbone__reversed_video" 60 | 61 | amp: False 62 | resume_ckpt_path: null 63 | 64 | evaluate: True 65 | # train_splits: ["train"] 66 | valid_splits: ["val"] 67 | # test_splits: ["test"] 68 | 69 | device: "cuda" 70 | world_size: 1 71 | dist_url: "env://" 72 | distributed: True 73 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/internvideo/ssv2/acdybench_ssv2_internvideo_backbone__object_shuffle.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | 4 | model: 5 | arch: patch_and_fuse_internvideo 6 | model_type: InternVideo-MM-L-14 7 | load_pretrained: True 8 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 9 | 10 | 11 | 12 | datasets: 13 | actionbench_ssv2_224x224_5fps: 14 | type: "action_antonyms_and_object_shuffled" 15 | vis_processor: 16 | train: 17 | name: "video_train" 18 | image_size: 224 19 | eval: 20 | name: "internvideo_eval" 21 | image_size: 224 22 | text_processor: 23 | train: 24 | name: "minimum_text" 25 | eval: 26 | name: "minimum_text" 27 | 28 | # IMPORTANT configs: 29 | fps: 5 # if downsampled, use 5 fps 30 | task: "object_shuffle" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 31 | neg_sampling_same_clip: 0 32 | eval_only: True 33 | 34 | # other arguements 35 | train_k: null # sample a subset of k instances 36 | eval_k: null # sample a subset of k instances, reduce evaluation time 37 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 38 | num_frm: 8 39 | train_frame_height: 224 40 | train_frame_width: 224 41 | eval_frame_height: 224 42 | eval_frame_width: 224 43 | 44 | run: 45 | # use custom runner 46 | runner: runner_base_patch_and_fuse 47 | 48 | # task object name 49 | task: actionbench 50 | 51 | # which module is used for inference ["backbone", "knowledge_patcher"] 52 | eval_module: backbone 53 | eval_task: object_shuffle 54 | 55 | batch_size_train: 32 56 | batch_size_eval: 4 57 | num_workers: 4 58 | 59 | seed: 42 60 | output_dir: "output/actionbench/eval/InternVideo/ssv2__InternVideo_backbone__object_shuffle" 61 | 62 | amp: False 63 | resume_ckpt_path: null 64 | 65 | evaluate: True 66 | # train_splits: ["train"] 67 | valid_splits: ["val"] 68 | # test_splits: ["test"] 69 | 70 | device: "cuda" 71 | world_size: 1 72 | dist_url: "env://" 73 | distributed: True 74 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/internvideo/ego4d/acdybench_ego4d_internvideo_backbone__object_shuffle.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | 4 | model: 5 | arch: patch_and_fuse_internvideo 6 | model_type: InternVideo-MM-L-14 7 | load_pretrained: True 8 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 9 | 10 | 11 | 12 | datasets: 13 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 14 | type: "object_shuffled" 15 | vis_processor: 16 | train: 17 | name: "video_train" 18 | image_size: 224 19 | eval: 20 | name: "internvideo_eval" 21 | image_size: 224 22 | text_processor: 23 | train: 24 
| name: "vl_dynamic_ego4d_text" 25 | eval: 26 | name: "vl_dynamic_ego4d_text" 27 | 28 | # IMPORTANT configs: 29 | fps: 5 # if downsampled, use 5 fps 30 | task: "object_shuffle" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 31 | neg_sampling_same_clip: 0 # evaluation set to 0 32 | eval_only: True 33 | 34 | # other arguements 35 | k: null # sample a subset of k instances 36 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 37 | num_frm: 8 38 | train_frame_height: 224 39 | train_frame_width: 224 40 | eval_frame_height: 224 41 | eval_frame_width: 224 42 | 43 | run: 44 | # use custom runner 45 | runner: runner_base_patch_and_fuse 46 | 47 | # task object name 48 | task: actionbench 49 | 50 | # which module is used for inference ["backbone", "knowledge_patcher"] 51 | eval_module: backbone 52 | eval_task: object_shuffle 53 | 54 | batch_size_train: 32 55 | batch_size_eval: 4 56 | num_workers: 4 57 | 58 | seed: 42 59 | output_dir: "output/actionbench/eval/InternVideo/ego4d__InternVideo_backbone__object_shuffle" 60 | 61 | amp: False 62 | resume_ckpt_path: null 63 | 64 | evaluate: True 65 | # train_splits: ["train"] 66 | # valid_splits: ["val"] 67 | test_splits: ["test"] 68 | 69 | device: "cuda" 70 | world_size: 1 71 | dist_url: "env://" 72 | distributed: True 73 | -------------------------------------------------------------------------------- /src/Singularity/dataset/dataloader.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from utils.distributed import get_rank, is_dist_avail_and_initialized, is_main_process 4 | import random 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class MetaLoader(object): 11 | """ wraps multiple data loader """ 12 | def __init__(self, name2loader): 13 | """Iterates over multiple dataloaders, it ensures all processes 14 | work on data from the same dataloader. This loader will end when 15 | the shorter dataloader raises StopIteration exception. 
16 | 17 | loaders: Dict, {name: dataloader} 18 | """ 19 | self.name2loader = name2loader 20 | self.name2iter = {name: iter(l) for name, l in name2loader.items()} 21 | name2index = {name: idx for idx, (name, l) in enumerate(name2loader.items())} 22 | index2name = {v: k for k, v in name2index.items()} 23 | 24 | iter_order = [] 25 | for n, l in name2loader.items(): 26 | iter_order.extend([name2index[n]]*len(l)) 27 | 28 | random.shuffle(iter_order) 29 | iter_order = torch.Tensor(iter_order).to(torch.device("cuda")).to(torch.uint8) 30 | 31 | # sync 32 | if is_dist_avail_and_initialized(): 33 | # make sure all processes have the same order so that 34 | # each step they will have data from the same loader 35 | dist.broadcast(iter_order, src=0) 36 | self.iter_order = [index2name[int(e.item())] for e in iter_order.cpu()] 37 | 38 | logger.info(str(self)) 39 | 40 | def __str__(self): 41 | output = [f"MetaLoader has {len(self.name2loader)} dataloaders, {len(self)} batches in total"] 42 | for idx, (name, loader) in enumerate(self.name2loader.items()): 43 | output.append( 44 | f"dataloader index={idx} name={name}, batch-size={loader.batch_size} length(#batches)={len(loader)} " 45 | ) 46 | return "\n".join(output) 47 | 48 | def __len__(self): 49 | return len(self.iter_order) 50 | 51 | def __iter__(self): 52 | """ this iterator will run indefinitely """ 53 | for name in self.iter_order: 54 | _iter = self.name2iter[name] 55 | batch = next(_iter) 56 | yield name, batch 57 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/lsmdc_retrieval/lsmdc_retrieval_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "lsmdc-101k", 5 | "vis_format": "video", 6 | "txt": "clip_data/vis_db/lsmdc/train_101k_frame.jsonl", 7 | "vis": "datasets/lsmdc" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "lsmdc-1k", 13 | "vis_format": "video", 14 | "txt": "clip_data/vis_db/lsmdc/test_1k_frame.jsonl", 15 | "vis": "datasets/lsmdc" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "lsmdc-1k", 21 | "vis_format": "video", 22 | "txt": "clip_data/vis_db/lsmdc/test_1k_frame.jsonl", 23 | "vis": "datasets/lsmdc" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch16", 39 | "clip_config": "openai/clip-vit-base-patch16", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 10, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | 
"save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/lsmdc_retrieval/lsmdc_retrieval_vip_base_16", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/lsmdc_retrieval/lsmdc_retrieval_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "lsmdc-101k", 5 | "vis_format": "video", 6 | "txt": "clip_data/vis_db/lsmdc/train_101k_frame.jsonl", 7 | "vis": "datasets/lsmdc" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "lsmdc-1k", 13 | "vis_format": "video", 14 | "txt": "clip_data/vis_db/lsmdc/test_1k_frame.jsonl", 15 | "vis": "datasets/lsmdc" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "lsmdc-1k", 21 | "vis_format": "video", 22 | "txt": "clip_data/vis_db/lsmdc/test_1k_frame.jsonl", 23 | "vis": "datasets/lsmdc" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/32/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch32", 39 | "clip_config": "openai/clip-vit-base-patch32", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 10, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/lsmdc_retrieval/lsmdc_retrieval_vip_base_32", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/didemo_retrieval/didemo_retrieval_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "didemo-train", 5 | "vis_format": "video", 6 | "txt": "datasets/lfvideo_data/task/didemo/train.jsonl", 7 | "vis": "datasets/didemo/didemo_video_xfps/" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "didemo-val", 13 | "vis_format": "video", 14 | "txt": "datasets/lfvideo_data/task/didemo/val.jsonl", 15 | "vis": "datasets/didemo/didemo_video_xfps/" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "didemo-test", 21 | "vis_format": "video", 22 | "txt": "datasets/lfvideo_data/task/didemo/test.jsonl", 23 | "vis": "datasets/didemo/didemo_video_xfps/" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | 
"input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/32/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch32", 39 | "clip_config": "openai/clip-vit-base-patch32", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 20, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/didemo_retrieval/didemo_retrieval_vip_base_32", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/didemo_retrieval/didemo_retrieval_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "didemo-train", 5 | "vis_format": "video", 6 | "txt": "datasets/lfvideo_data/task/didemo/train.jsonl", 7 | "vis": "datasets/didemo/didemo_video_xfps/" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "didemo-val", 13 | "vis_format": "video", 14 | "txt": "datasets/lfvideo_data/task/didemo/val.jsonl", 15 | "vis": "datasets/didemo/didemo_video_xfps/" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "didemo-test", 21 | "vis_format": "video", 22 | "txt": "datasets/lfvideo_data/task/didemo/test.jsonl", 23 | "vis": "datasets/didemo/didemo_video_xfps/" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 70, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch16", 39 | "clip_config": "openai/clip-vit-base-patch16", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 10, 63 | "lr_mul_prefix": "logit_scale", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 20, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/didemo_retrieval/didemo_retrieval_vip_base_16", 78 
| "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /src/preprocessing/nextqa/downsample_downsize_video_nextqa.py: -------------------------------------------------------------------------------- 1 | # modified from EgoVLP https://github.com/showlab/EgoVLP/blob/main/utils/video_resize.py 2 | # Downsamples, downsizes, and converts to mp4 3 | 4 | import os 5 | import subprocess 6 | from multiprocessing import Pool 7 | from tqdm import tqdm 8 | from functools import partial 9 | from glob import glob 10 | 11 | def resize_video(input_output_path, suppress_stdout=False, suppress_stderr=False): 12 | input_path, output_path = input_output_path 13 | 14 | if os.path.exists(output_path): 15 | print(f'{output_path} already exists.') 16 | return 17 | 18 | cmd = f"ffmpeg -loglevel info -y -i {input_path} -filter:v scale={image_size}:{image_size},fps={fps} -c:a copy {output_path}" 19 | 20 | kwargs = {} 21 | if suppress_stdout: 22 | kwargs['stdout'] = subprocess.DEVNULL 23 | if suppress_stderr: 24 | kwargs['stderr'] = subprocess.DEVNULL 25 | 26 | subprocess.run(cmd, shell=True, **kwargs) 27 | 28 | return 29 | 30 | if __name__ == "__main__": 31 | suppress_stdout = True 32 | suppress_stderr = True 33 | num_proc = 10 34 | 35 | image_size = 224 36 | fps = 5 37 | 38 | original_clips = 'datasets/NextQA/video_clips/NExTVideo' 39 | output_dir = f'datasets/NextQA/video_clips/NExTVideo_downsampled_{fps}fps_downsized_{image_size}x{image_size}' 40 | 41 | os.makedirs(output_dir, exist_ok=True) 42 | 43 | input_output_paths = [] 44 | 45 | input_dirs = glob(os.path.join(original_clips, "*")) 46 | for d in input_dirs: 47 | input_paths = glob(os.path.join(d, "*.mp4")) 48 | input_dir_name = os.path.basename(d) 49 | for ip in input_paths: 50 | video_name = os.path.basename(ip) 51 | os.makedirs(os.path.join(output_dir, input_dir_name), exist_ok=True) 52 | op = os.path.join(output_dir, input_dir_name, video_name) 53 | input_output_paths.append((ip,op)) 54 | 55 | # mp4_list = [item for item in os.listdir(original_clips) if item.endswith('.mp4')] # load original mp4 files 56 | # print('Total files to consider:', len(mp4_list)) 57 | 58 | print('Total files to consider:', len(input_output_paths)) 59 | 60 | 61 | resizer = partial(resize_video, suppress_stdout=suppress_stdout, suppress_stderr=suppress_stderr) 62 | for _ in tqdm(Pool(num_proc).imap_unordered(resizer, input_output_paths), total=len(input_output_paths)): 63 | pass -------------------------------------------------------------------------------- /src/ClipViP/src/configs/msrvtt_retrieval/msrvtt_retrieval_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "msrvtt-9k", 5 | "vis_format": "video", 6 | "txt": "clip_data/vis_db/msrvtt_video_clips/train9k.jsonl", 7 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "msrvtt-1ka", 13 | "vis_format": "video", 14 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 15 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "msrvtt-1ka", 21 | "vis_format": "video", 22 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 23 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | 
"test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch16", 39 | "clip_config": "openai/clip-vit-base-patch16", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 100, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/msrvtt_retrieval/msrvtt_retrieval_vip_base_16", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/msrvtt_retrieval/msrvtt_retrieval_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "msrvtt-9k", 5 | "vis_format": "video", 6 | "txt": "clip_data/vis_db/msrvtt_video_clips/train9k.jsonl", 7 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "msrvtt-1ka", 13 | "vis_format": "video", 14 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 15 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "msrvtt-1ka", 21 | "vis_format": "video", 22 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 23 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/32/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch32", 39 | "clip_config": "openai/clip-vit-base-patch32", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 100, 73 | "min_valid_steps": 1, 74 
| "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/msrvtt_retrieval/msrvtt_retrieval_vip_base_32", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/actnet_retrieval/actnet_retrieval_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "actnet-train", 5 | "vis_format": "frame", 6 | "txt": "clip_data/vis_db/anet_retrieval/train.jsonl", 7 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "actnet-test", 13 | "vis_format": "frame", 14 | "txt": "clip_data/vis_db/anet_retrieval/val1.jsonl", 15 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "actnet-test", 21 | "vis_format": "frame", 22 | "txt": "clip_data/vis_db/anet_retrieval/val1.jsonl", 23 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 32, 29 | "test_n_clips": 1, 30 | "test_num_frms": 32, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 70, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch16", 39 | "clip_config": "openai/clip-vit-base-patch16", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 8, 49 | "test_batch_size": 8, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 20, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/actnet_retrieval/actnet_retrieval_vip_base_16", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/actnet_retrieval/actnet_retrieval_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "actnet-train", 5 | "vis_format": "frame", 6 | "txt": "clip_data/vis_db/anet_retrieval/train.jsonl", 7 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "actnet-test", 13 | "vis_format": "frame", 14 | "txt": "clip_data/vis_db/anet_retrieval/val1.jsonl", 15 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "actnet-test", 21 | "vis_format": "frame", 22 | "txt": 
"clip_data/vis_db/anet_retrieval/val1.jsonl", 23 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 32, 29 | "test_n_clips": 1, 30 | "test_num_frms": 32, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 70, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/32/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch32", 39 | "clip_config": "openai/clip-vit-base-patch32", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 20, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/actnet_retrieval/actnet_retrieval_vip_base_32", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /src/ClipViP/src/utils/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def cal_cossim(feats1, feats2): 4 | sim_matrix = np.dot(feats1, feats2.T) 5 | return sim_matrix 6 | 7 | def np_softmax(X, theta = 1.0, axis = None): 8 | """ 9 | Compute the softmax of each element along an axis of X. 10 | 11 | Parameters 12 | ---------- 13 | X: ND-Array. Probably should be floats. 14 | theta (optional): float parameter, used as a multiplier 15 | prior to exponentiation. Default = 1.0 16 | axis (optional): axis to compute values along. Default is the 17 | first non-singleton axis. 18 | 19 | Returns an array the same size as X. The result will sum to 1 20 | along the specified axis. 
21 | """ 22 | # make X at least 2d 23 | y = np.atleast_2d(X) 24 | # find axis 25 | if axis is None: 26 | axis = next(j[0] for j in enumerate(y.shape) if j[1] > 1) 27 | # multiply y against the theta parameter, 28 | y = y * float(theta) 29 | # subtract the max for numerical stability 30 | y = y - np.expand_dims(np.max(y, axis = axis), axis) 31 | # exponentiate y 32 | y = np.exp(y) 33 | # take the sum along the specified axis 34 | ax_sum = np.expand_dims(np.sum(y, axis = axis), axis) 35 | # finally: divide elementwise 36 | p = y / ax_sum 37 | # flatten if X was 1D 38 | if len(X.shape) == 1: p = p.flatten() 39 | return p 40 | 41 | def compute_metrics(x): 42 | sx = np.sort(-x, axis=1) 43 | d = np.diag(-x) 44 | d = d[:, np.newaxis] 45 | ind = sx - d 46 | ind = np.where(ind == 0) 47 | ind = ind[1] 48 | r1 = float(np.sum(ind == 0)) / len(ind) 49 | r5 = float(np.sum(ind < 5)) / len(ind) 50 | r10 = float(np.sum(ind < 10)) / len(ind) 51 | medr = np.median(ind) + 1 52 | meanr = np.mean(ind) + 1 53 | return r1, r5, r10, medr, meanr 54 | 55 | def compute_metrics_multi(x, t2v_labels_list): 56 | sx = np.sort(-x, axis=1) 57 | t2v_labels_list = np.array(t2v_labels_list) 58 | arg = np.arange(x.shape[0]) 59 | d = -x[arg, t2v_labels_list] 60 | d = d[:, np.newaxis] 61 | ind = sx - d 62 | ind = np.where(ind == 0) 63 | ind = ind[1] 64 | r1 = float(np.sum(ind == 0)) / len(ind) 65 | r5 = float(np.sum(ind < 5)) / len(ind) 66 | r10 = float(np.sum(ind < 10)) / len(ind) 67 | medr = np.median(ind) + 1 68 | meanr = np.mean(ind) + 1 69 | return r1, r5, r10, medr, meanr 70 | 71 | 72 | if __name__ == '__main__': 73 | 74 | sim_matrix = np.random.random((5,5)) 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /src/Singularity/dataset/base_dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from dataset.utils import load_image_from_path 3 | import random 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class ImageVideoBaseDataset(Dataset): 10 | """Base class that implements the image and video loading methods""" 11 | media_type = "video" 12 | 13 | def __init__(self): 14 | assert self.media_type in ["image", "video"] 15 | self.anno_list = None # list(dict), each dict contains {"image": str, # image or video path} 16 | self.transform = None 17 | self.video_reader = None 18 | self.num_tries = None 19 | 20 | def __getitem__(self, index): 21 | raise NotImplementedError 22 | 23 | def __len__(self): 24 | raise NotImplementedError 25 | 26 | def load_and_transform_media_data(self, index): 27 | if self.media_type == "image": 28 | return self.load_and_transform_media_data_image(index) 29 | else: 30 | return self.load_and_transform_media_data_video(index) 31 | 32 | def load_and_transform_media_data_image(self, index): 33 | ann = self.anno_list[index] 34 | data_path = ann["image"] 35 | image = load_image_from_path(data_path) 36 | image = self.transform(image) 37 | return image, index 38 | 39 | def load_and_transform_media_data_video(self, index): 40 | for i in range(self.num_tries): 41 | ann = self.anno_list[index] 42 | data_path = ann["image"] 43 | try: 44 | max_num_frames = self.max_num_frames \ 45 | if hasattr(self, "max_num_frames") else -1 46 | frames, frame_indices, video_duration = self.video_reader( 47 | data_path, self.num_frames, self.sample_type, 48 | max_num_frames=max_num_frames 49 | ) 50 | except Exception as e: 51 | index = random.randint(0, len(self) - 1) 52 | 
logger.warning( 53 | f"Caught exception {e} when loading video {data_path}, " 54 | f"randomly sample a new video as replacement") 55 | continue 56 | 57 | frames = self.transform(frames) 58 | return frames, index 59 | else: 60 | raise RuntimeError( 61 | f"Failed to fetch video after {self.num_tries} tries. " 62 | f"This might indicate that you have many corrupted videos." 63 | ) 64 | -------------------------------------------------------------------------------- /src/Singularity/dataset/qa_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataset.base_dataset import ImageVideoBaseDataset 3 | from dataset.utils import pre_text, load_anno 4 | from dataset.video_utils import VIDEO_READER_FUNCS 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class ImageQADataset(ImageVideoBaseDataset): 11 | media_type = "image" 12 | 13 | def __init__(self, ann_file, transform, eos="[SEP]", mode="train", answer_list=None): 14 | super(ImageQADataset, self).__init__() 15 | assert mode in ["train", "eval"] 16 | self.mode = mode 17 | self.transform = transform 18 | self.eos = eos 19 | 20 | self.anno_list = load_anno(ann_file) 21 | 22 | if mode == "eval": 23 | self.answer_list = json.load(open(answer_list, "r")) 24 | 25 | def __len__(self): 26 | return len(self.anno_list) 27 | 28 | def get_answers_with_weights(self, raw_answers): 29 | if isinstance(raw_answers, str): 30 | raw_answers = [raw_answers] 31 | answer_weight = {} 32 | for answer in raw_answers: 33 | if answer in answer_weight.keys(): 34 | answer_weight[answer] += 1/len(raw_answers) 35 | else: 36 | answer_weight[answer] = 1/len(raw_answers) 37 | 38 | answers = list(answer_weight.keys()) 39 | weights = [answer_weight[a] for a in answers] 40 | answers = [answer + " " + self.eos for answer in answers] 41 | return answers, weights 42 | 43 | def __getitem__(self, index): 44 | ann = self.anno_list[index] 45 | image, index = self.load_and_transform_media_data(index) 46 | 47 | question = pre_text(ann["question"]) 48 | if self.mode == "train": 49 | answers, weights = self.get_answers_with_weights(ann["answer"]) 50 | return image, question, answers, weights 51 | else: # self.mode == "eval": 52 | question_id = ann["question_id"] 53 | return image, question, question_id 54 | 55 | 56 | class VideoQADataset(ImageQADataset): 57 | media_type = "video" 58 | 59 | def __init__( 60 | self, ann_file, transform, eos="[SEP]", mode="train", answer_list=None, 61 | num_frames=4, video_reader_type="decord", sample_type="rand", num_tries=1 62 | ): 63 | super(VideoQADataset, self).__init__( 64 | ann_file, transform, eos, mode, answer_list) 65 | self.num_frames = num_frames 66 | self.video_reader_type = video_reader_type 67 | self.video_reader = VIDEO_READER_FUNCS[video_reader_type] 68 | self.sample_type = sample_type 69 | self.num_tries = num_tries 70 | -------------------------------------------------------------------------------- /src/Singularity/utils/scheduler.py: -------------------------------------------------------------------------------- 1 | """ Scheduler Factory 2 | Hacked together by / Copyright 2020 Ross Wightman 3 | """ 4 | from torch.optim import Optimizer 5 | import math 6 | from torch.optim.lr_scheduler import LambdaLR 7 | 8 | 9 | def create_scheduler(args, optimizer): 10 | lr_scheduler = None 11 | if args.sched == 'cosine': 12 | lr_scheduler = get_cosine_schedule_with_warmup( 13 | optimizer, 14 | num_warmup_steps=args.num_warmup_steps, 15 | 
num_training_steps=args.num_training_steps, 16 | num_cycles=0.5, 17 | min_lr_multi=args.min_lr_multi 18 | ) 19 | return lr_scheduler 20 | 21 | 22 | def get_cosine_schedule_with_warmup( 23 | optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, 24 | num_cycles: float = 0.5, min_lr_multi: float = 0., last_epoch: int = -1 25 | ): 26 | """ 27 | Modified from https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/optimization.py 28 | 29 | Create a schedule with a learning rate that decreases following the values of the cosine function between the 30 | initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the 31 | initial lr set in the optimizer. 32 | Args: 33 | optimizer ([`~torch.optim.Optimizer`]): 34 | The optimizer for which to schedule the learning rate. 35 | num_warmup_steps (`int`): 36 | The number of steps for the warmup phase. 37 | num_training_steps (`int`): 38 | The total number of training steps. 39 | num_cycles (`float`, *optional*, defaults to 0.5): 40 | The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 41 | following a half-cosine). 42 | min_lr_multi (`float`, *optional*, defaults to 0): 43 | The minimum learning rate multiplier. Thus the minimum learning rate is base_lr * min_lr_multi. 44 | last_epoch (`int`, *optional*, defaults to -1): 45 | The index of the last epoch when resuming training. 46 | Return: 47 | `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. 48 | """ 49 | 50 | def lr_lambda(current_step): 51 | if current_step < num_warmup_steps: 52 | return max(min_lr_multi, float(current_step) / float(max(1, num_warmup_steps))) 53 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) 54 | return max(min_lr_multi, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) 55 | 56 | return LambdaLR(optimizer, lr_lambda, last_epoch) 57 | -------------------------------------------------------------------------------- /dataset_cards/actionbench_ego4d.md: -------------------------------------------------------------------------------- 1 | # Action Dynamic Benchmark (ActionBench) on Ego4d 2 | 3 | ## Instruction for Downloading Videos 4 | - Set up Ego4d CLI following [here](https://ego4d-data.org/docs/start-here/) 5 | - Download the Moment clips using the following command: 6 | ``` 7 | ego4d \ 8 | --output_directory="Ego4d" \ 9 | --datasets clips annotations \ 10 | --benchmarks "EM" \ 11 | --metadata 12 | ``` 13 | - Put the downloaded `clips/` folder into `datasets/Ego4D/video_clips/` as `datasets/Ego4D/video_clips/clips` 14 | - Run preprocessing on the video clips (at the root dir of this repo): 15 | ``` 16 | python src/preprocessing/ego4d/downsample_downsize_video_clips.py 17 | ``` 18 | - The processed video clips will be stored at `datasets/Ego4D/video_clips/clips_downsampled_5fps_downsized_224x224` 19 | 20 | 21 | ## Annotation Details 22 | 23 | ### Annotation for Action Antonym Task & Video Reversal Task 24 | - train size: 274,946 25 | - val size: 34,368 26 | - test size: 34,369 27 | 28 | - ann_path: `ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/{split}.jsonl`. The original annotation is based on a subset of [EgoClip](https://github.com/showlab/EgoVLP). 
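  Each line of these `.jsonl` files holds one JSON object (one clip per line). A minimal loading sketch, assuming the standard jsonl convention and the `val` split of the path above:
  ```
  import json

  ann_path = "ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/val.jsonl"
  with open(ann_path) as f:
      annotations = [json.loads(line) for line in f]  # list of dicts with the fields shown under `format` below
  print(len(annotations))  # expected to match the split sizes listed above
  ```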
29 | - format: 30 | ``` 31 | { 32 | 'video_uid': '002d2729-df71-438d-8396-5895b349e8fd', 33 | 'video_dur': 3571.4333333333334, 34 | 'narration_source': 'narration_pass_1', 35 | 'narration_ind': 229, 36 | 'narration_time': 592.6903, 37 | 'clip_start': 592.3519665973915, 38 | 'clip_end': 593.0286286452686, 39 | 'clip_text': '#C C picks up the knife from the chopping board with her right hand.', 40 | 'action_antonym_clip_text': '#C C drops down the knife from the chopping board with her right hand.', 41 | 'tag_verb': '[17, 93]', 42 | 'tag_noun': '[321, 268, 573, 105]', 43 | 'Unnamed: 10': nan, 44 | 'clip_uid': '116ec16b-0d76-4e71-b02c-72cb37ebd5c5', 45 | 'narration_relative_time': 0.6902999999999793, 46 | 'clip_relative_start': 0.351966597391538, 47 | 'clip_relative_end': 1.0286286452685545, 48 | 'clip_fps': 30.0} 49 | ``` 50 | 51 | ### Annotation for Object Shuffle 52 | A subset from above by filtering out clips with no object in the clip text. 53 | - val size: 31974 54 | - test size: 31925 55 | - ann_path: `ActionBench/ego4d/egoclip_subset_action_antonyms_object_shuffled_train_val_test_split/{split}.jsonl` 56 | - format: additional fields: 57 | ``` 58 | { 59 | ... 60 | 'object_shuffled_clip_text':'#C C picks up the banana from the chopping board with her right hand.', 61 | } 62 | ``` 63 | 64 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/moments_in_time/side_tuning_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | pretrained: "/checkpoint_0.pth" 7 | 8 | text_perceiver_config: 9 | dim: 768 # latent query dim 10 | k_v_dim: 768 # text_width 11 | depth: 1 12 | dim_head: 64 13 | heads: 8 14 | num_latents: 16 15 | ff_mult: 2 16 | 17 | vision_perceiver_config: 18 | dim: 768 # latent query dim 19 | k_v_dim: 1024 # vision_width 20 | depth: 1 21 | dim_head: 64 22 | heads: 8 23 | num_latents: 16 24 | ff_mult: 2 25 | 26 | objectives: ["video_text_contrastive"] 27 | loss_weighting: [1.0] 28 | if_use_attn_guidance: False 29 | if_use_dual_perceiver: False 30 | if_add_temporal_emebdding: True 31 | num_frms: 8 32 | temp_emb_drop_out: 0.1 33 | if_as_knowledge_fuser: True 34 | knowledge_fuser_type: "side_tuning" 35 | train_knowledge_fuser_jointly: True 36 | 37 | datasets: 38 | downstream_tasks_moment_in_time: 39 | type: "default" 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "minimum_text" 50 | eval: 51 | name: "minimum_text" 52 | 53 | # IMPORTANT configs: 54 | task: video_action_retrieval_2k 55 | neg_sampling_same_clip: 0 56 | eval_only: True 57 | 58 | # other arguements 59 | train_k: null # sample a subset of k instances 60 | eval_k: null # sample a subset of 3000 instances, reduce evaluation time 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | run: 69 | # use custom runner 70 | runner: runner_base_patch_and_fuse 71 | 72 | # task object name 73 | task: downstream_tasks_retrieval 74 | 75 | # which module is used for inference ["backbone", "knowledge_patcher"] 76 | eval_module: 
knowledge_patcher 77 | eval_task: video_action_retrieval_2k 78 | 79 | batch_size_train: 32 80 | batch_size_eval: 4 81 | num_workers: 4 82 | 83 | seed: 42 84 | output_dir: "output/downstream_tasks/MomentsInTime/eval/side_tuning_zero-shot" 85 | 86 | amp: False 87 | resume_ckpt_path: null 88 | 89 | evaluate: True 90 | 91 | # train_splits: ["train"] 92 | valid_splits: ["val"] 93 | # test_splits: ["test"] 94 | 95 | device: "cuda" 96 | world_size: 1 97 | dist_url: "env://" 98 | distributed: True 99 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/nextqa/side_tuning.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo_mcqa 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | pretrained: "/checkpoint_best.pth" # set trained patch_and_fuse on nextqa checkpoint path 7 | 8 | text_perceiver_config: 9 | dim: 768 # latent query dim 10 | k_v_dim: 768 # text_width 11 | depth: 1 12 | dim_head: 64 13 | heads: 8 14 | num_latents: 16 15 | ff_mult: 2 16 | 17 | vision_perceiver_config: 18 | dim: 768 # latent query dim 19 | k_v_dim: 1024 # vision_width 20 | depth: 1 21 | dim_head: 64 22 | heads: 8 23 | num_latents: 16 24 | ff_mult: 2 25 | 26 | objectives: ["mcqa_loss"] 27 | loss_weighting: [1.0] 28 | if_use_attn_guidance: False 29 | if_use_dual_perceiver: False 30 | if_add_temporal_emebdding: False 31 | num_frms: 8 32 | temp_emb_drop_out: 0.0 33 | knowledge_fuser_type: "side_tuning" 34 | if_as_knowledge_fuser: True 35 | train_knowledge_fuser_jointly: True 36 | 37 | datasets: 38 | downstream_tasks_qa_nextqa_224x224_5fps: 39 | type: "default" 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "minimum_text" 50 | eval: 51 | name: "minimum_text" 52 | 53 | # IMPORTANT configs: 54 | task: 5way-multiple-choice-qa # 55 | neg_sampling_same_clip: 0 56 | eval_only: True 57 | 58 | # other arguements 59 | train_k: null # sample a subset of k instances 60 | eval_k: null # sample a subset of k instances, reduce evaluation time 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | run: 69 | # use custom runner 70 | runner: runner_base_patch_and_fuse 71 | 72 | # task object name 73 | task: downstream_tasks_multi_choice_qa 74 | 75 | # which module is used for inference ["backbone", "knowledge_patcher"] 76 | eval_module: knowledge_patcher 77 | eval_task: 5way-multiple-choice-qa 78 | 79 | batch_size_train: 32 80 | batch_size_eval: 8 81 | num_workers: 4 82 | 83 | seed: 42 84 | output_dir: "output/downstream_tasks/NextQA/eval/side_tuning" 85 | 86 | amp: False 87 | resume_ckpt_path: null 88 | 89 | evaluate: True 90 | 91 | # train_splits: ["train"] 92 | valid_splits: ["val"] 93 | test_splits: ["test"] 94 | 95 | device: "cuda" 96 | world_size: 1 97 | dist_url: "env://" 98 | distributed: True 99 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/nextqa/patch_and_fuse.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo_mcqa 3 | 
model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | pretrained: "pretrained_ckpt/PatchAndFuse/downstream_tasks/nextqa_patch_and_fuse.pth" # set trained patch_and_fuse on nextqa checkpoint path 7 | 8 | text_perceiver_config: 9 | dim: 768 # latent query dim 10 | k_v_dim: 768 # text_width 11 | depth: 1 12 | dim_head: 64 13 | heads: 8 14 | num_latents: 16 15 | ff_mult: 2 16 | 17 | vision_perceiver_config: 18 | dim: 768 # latent query dim 19 | k_v_dim: 1024 # vision_width 20 | depth: 1 21 | dim_head: 64 22 | heads: 8 23 | num_latents: 16 24 | ff_mult: 2 25 | 26 | objectives: ["mcqa_loss"] 27 | loss_weighting: [1.0] 28 | if_use_attn_guidance: False 29 | if_use_dual_perceiver: False 30 | if_add_temporal_emebdding: False 31 | num_frms: 8 32 | temp_emb_drop_out: 0.0 33 | knowledge_fuser_type: "xattn" 34 | if_as_knowledge_fuser: True 35 | train_knowledge_fuser_jointly: True 36 | 37 | datasets: 38 | downstream_tasks_qa_nextqa_224x224_5fps: 39 | type: "default" 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "minimum_text" 50 | eval: 51 | name: "minimum_text" 52 | 53 | # IMPORTANT configs: 54 | task: 5way-multiple-choice-qa # 55 | neg_sampling_same_clip: 0 56 | eval_only: True 57 | 58 | # other arguements 59 | train_k: null # sample a subset of k instances 60 | eval_k: null # sample a subset of k instances, reduce evaluation time 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | run: 69 | # use custom runner 70 | runner: runner_base_patch_and_fuse 71 | 72 | # task object name 73 | task: downstream_tasks_multi_choice_qa 74 | 75 | # which module is used for inference ["backbone", "knowledge_patcher"] 76 | eval_module: knowledge_patcher 77 | eval_task: 5way-multiple-choice-qa 78 | 79 | batch_size_train: 32 80 | batch_size_eval: 8 81 | num_workers: 4 82 | 83 | seed: 42 84 | output_dir: "output/downstream_tasks/NextQA/eval/patch_and_fuse" 85 | 86 | amp: False 87 | resume_ckpt_path: null 88 | 89 | evaluate: True 90 | 91 | # train_splits: ["train"] 92 | valid_splits: ["val"] 93 | test_splits: ["test"] 94 | 95 | device: "cuda" 96 | world_size: 1 97 | dist_url: "env://" 98 | distributed: True 99 | -------------------------------------------------------------------------------- /src/configs/projects/train/downstream_tasks/nextqa/KP-Perceiver-VTC-DVDM.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo_mcqa 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/models/InternVideo-MM-L-14.ckpt" 6 | 7 | text_perceiver_config: 8 | dim: 768 # latent query dim 9 | k_v_dim: 768 # text_width 10 | depth: 1 11 | dim_head: 64 12 | heads: 8 13 | num_latents: 16 14 | ff_mult: 2 15 | 16 | vision_perceiver_config: 17 | dim: 768 # latent query dim 18 | k_v_dim: 1024 # vision_width 19 | depth: 1 20 | dim_head: 64 21 | heads: 8 22 | num_latents: 16 23 | ff_mult: 2 24 | 25 | objectives: ["mcqa_loss", "video_action_contrastive"] 26 | loss_weighting: [1.0, 1.0] 27 | if_use_attn_guidance: False 28 | if_use_dual_perceiver: False 29 | if_add_temporal_emebdding: False 30 | num_frms: 8 31 | 
temp_emb_drop_out: 0.0 32 | 33 | datasets: 34 | downstream_tasks_qa_nextqa_224x224_5fps: 35 | type: "default" 36 | vis_processor: 37 | train: 38 | name: "video_train" 39 | image_size: 224 40 | eval: 41 | name: "internvideo_eval" 42 | image_size: 224 43 | text_processor: 44 | train: 45 | name: "minimum_text" 46 | eval: 47 | name: "minimum_text" 48 | 49 | # IMPORTANT configs: 50 | task: 5way-multiple-choice-qa 51 | neg_sampling_same_clip: 0 52 | eval_only: False 53 | 54 | # other arguements 55 | train_k: null # sample a subset of k instances 56 | eval_k: null # sample a subset of 3000 instances, reduce evaluation time 57 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 58 | num_frm: 8 59 | train_frame_height: 224 60 | train_frame_width: 224 61 | eval_frame_height: 224 62 | eval_frame_width: 224 63 | 64 | 65 | run: 66 | # use custom runner 67 | runner: runner_base_patch_and_fuse 68 | 69 | # task object name 70 | task: actionbench 71 | 72 | # which module is used for inference ["backbone", "knowledge_patcher"] 73 | eval_module: knowledge_patcher 74 | eval_task: video_text_matching 75 | 76 | # optimizer 77 | lr_sched: "linear_warmup_cosine_lr" 78 | init_lr: 1e-5 79 | min_lr: 0 80 | weight_decay: 0.05 81 | max_epoch: 1 82 | 83 | batch_size_train: 16 84 | batch_size_eval: 4 85 | 86 | num_workers: 4 87 | 88 | seed: 42 89 | output_dir: "output/downstream_tasks/NextQA/train/KP-Perceiver-VTC-DVDM_internvideo" 90 | 91 | amp: False 92 | resume_ckpt_path: null 93 | 94 | evaluate: False 95 | 96 | train_splits: ["train"] 97 | valid_splits: ["val"] 98 | # test_splits: ["test"] 99 | 100 | device: "cuda" 101 | world_size: 1 102 | dist_url: "env://" 103 | distributed: True -------------------------------------------------------------------------------- /src/ClipViP/src/configs/pretrain/pretrain_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "hdvila", 5 | "vis_format": "videoframe", 6 | "txt": "datasets/hdvila/hdvila_subtitles_92m_db", 7 | "vis": "youtube_data/ytt180m/video_clips_3fps", 8 | "vid_cap_path": "datasets/hdvila/hdvila_captions_db", 9 | "vid_txt": "subtitle", 10 | "img_dir": "", 11 | "cap_path": "", 12 | "img_source": "", 13 | "img_ratio": 0 14 | } 15 | ], 16 | "val_datasets": [ 17 | { 18 | "name": "msrvtt", 19 | "vis_format": "video", 20 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 21 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 22 | }, 23 | { 24 | "name": "how2", 25 | "vis_format": "video", 26 | "txt": "clip_data/vis_db/pretrain_data/test_howto_1k.jsonl", 27 | "vis": "youtube_data/ytt180m/video_clips_3fps" 28 | }, 29 | { 30 | "name": "ours", 31 | "vis_format": "video", 32 | "txt": "clip_data/vis_db/pretrain_data/test_full_1k.jsonl", 33 | "vis": "youtube_data/ytt180m/video_clips_3fps" 34 | } 35 | ], 36 | 37 | "train_n_clips": 1, 38 | "train_num_frms": 12, 39 | "test_n_clips": 1, 40 | "test_num_frms": 12, 41 | "sample_rate": 0, 42 | "sample_jitter": 1, 43 | "video_res": [240, 320], 44 | "input_res": [224, 224], 45 | "max_txt_len": 70, 46 | 47 | "e2e_weights_path": null, 48 | "clip_weights": "openai/clip-vit-base-patch16", 49 | "clip_config": "openai/clip-vit-base-patch16", 50 | "clip_vision_additional_config": { 51 | "type": "ViP", 52 | "temporal_size": 12, 53 | "if_use_temporal_embed": 1, 54 | "logit_scale_init_value": 4.60, 55 | "add_cls_num": 3 56 | }, 57 | 58 | "train_batch_size": 16, 59 | "test_batch_size": 16, 60 | 
"max_n_example_per_group": 1, 61 | "gradient_accumulation_steps": 1, 62 | "n_workers": 8, 63 | "pin_mem": 1, 64 | "fp16": 1, 65 | "amp_level": "O2", 66 | "seed": 42, 67 | 68 | "optim": "adamw", 69 | "betas": [0.9, 0.98], 70 | "learning_rate": 5e-6, 71 | "weight_decay": 0.05, 72 | "lr_mul": 1, 73 | "lr_mul_prefix": "", 74 | "loss_config": { 75 | "loss_name": "NCELearnableTempLoss_vsc_fc", 76 | "if_gather": 1 77 | }, 78 | "warmup_ratio": 0.01, 79 | "decay": "cosine", 80 | "grad_norm": 5.0, 81 | 82 | "num_train_epochs": 5, 83 | "min_valid_steps": 1, 84 | "num_valid": 100, 85 | "only_valid_steps": 1000, 86 | "save_steps_ratio": 0.01, 87 | "output_dir": "vidclip_data/output/pretrain/pretrain_vip_base_16/", 88 | "if_tb_log": 1, 89 | "if_model_saver": 1, 90 | "if_log2file": 1, 91 | "dummy_data": 0 92 | } 93 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/pretrain/pretrain_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "hdvila", 5 | "vis_format": "videoframe", 6 | "txt": "datasets/hdvila/hdvila_subtitles_92m_db", 7 | "vis": "youtube_data/ytt180m/video_clips_3fps", 8 | "vid_cap_path": "datasets/hdvila/hdvila_captions_db", 9 | "vid_txt": "subtitle", 10 | "img_dir": "", 11 | "cap_path": "", 12 | "img_source": "", 13 | "img_ratio": 0 14 | } 15 | ], 16 | "val_datasets": [ 17 | { 18 | "name": "msrvtt", 19 | "vis_format": "video", 20 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 21 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 22 | }, 23 | { 24 | "name": "how2", 25 | "vis_format": "video", 26 | "txt": "clip_data/vis_db/pretrain_data/test_howto_1k.jsonl", 27 | "vis": "youtube_data/ytt180m/video_clips_3fps" 28 | }, 29 | { 30 | "name": "ours", 31 | "vis_format": "video", 32 | "txt": "clip_data/vis_db/pretrain_data/test_full_1k.jsonl", 33 | "vis": "youtube_data/ytt180m/video_clips_3fps" 34 | } 35 | ], 36 | 37 | "train_n_clips": 1, 38 | "train_num_frms": 12, 39 | "test_n_clips": 1, 40 | "test_num_frms": 12, 41 | "sample_rate": 0, 42 | "sample_jitter": 1, 43 | "video_res": [240, 320], 44 | "input_res": [224, 224], 45 | "max_txt_len": 70, 46 | 47 | "e2e_weights_path": null, 48 | "clip_weights": "openai/clip-vit-base-patch32", 49 | "clip_config": "openai/clip-vit-base-patch32", 50 | "clip_vision_additional_config": { 51 | "type": "ViP", 52 | "temporal_size": 12, 53 | "if_use_temporal_embed": 1, 54 | "logit_scale_init_value": 4.60, 55 | "add_cls_num": 3 56 | }, 57 | 58 | "train_batch_size": 32, 59 | "test_batch_size": 32, 60 | "max_n_example_per_group": 1, 61 | "gradient_accumulation_steps": 1, 62 | "n_workers": 8, 63 | "pin_mem": 1, 64 | "fp16": 1, 65 | "amp_level": "O2", 66 | "seed": 42, 67 | 68 | "optim": "adamw", 69 | "betas": [0.9, 0.98], 70 | "learning_rate": 5e-6, 71 | "weight_decay": 0.05, 72 | "lr_mul": 1, 73 | "lr_mul_prefix": "", 74 | "loss_config": { 75 | "loss_name": "NCELearnableTempLoss_vsc_fc", 76 | "if_gather": 1 77 | }, 78 | "warmup_ratio": 0.01, 79 | "decay": "cosine", 80 | "grad_norm": 5.0, 81 | 82 | "num_train_epochs": 5, 83 | "min_valid_steps": 1, 84 | "num_valid": 100, 85 | "only_valid_steps": 1000, 86 | "save_steps_ratio": 0.01, 87 | "output_dir": "vidclip_data/output/pretrain/pretrain_vip_base_32/", 88 | "if_tb_log": 1, 89 | "if_model_saver": 1, 90 | "if_log2file": 1, 91 | "dummy_data": 0 92 | } 93 | -------------------------------------------------------------------------------- 
/src/configs/projects/eval/downstream_task/temporal_kinetics/side_tuning_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | pretrained: "/checkpoint_0.pth" 7 | 8 | text_perceiver_config: 9 | dim: 768 # latent query dim 10 | k_v_dim: 768 # text_width 11 | depth: 1 12 | dim_head: 64 13 | heads: 8 14 | num_latents: 16 15 | ff_mult: 2 16 | 17 | vision_perceiver_config: 18 | dim: 768 # latent query dim 19 | k_v_dim: 1024 # vision_width 20 | depth: 1 21 | dim_head: 64 22 | heads: 8 23 | num_latents: 16 24 | ff_mult: 2 25 | 26 | objectives: ["video_text_contrastive"] 27 | loss_weighting: [1.0] 28 | if_use_attn_guidance: False 29 | if_use_dual_perceiver: False 30 | if_add_temporal_emebdding: True 31 | num_frms: 8 32 | temp_emb_drop_out: 0.1 33 | if_as_knowledge_fuser: True 34 | knowledge_fuser_type: "side_tuning" 35 | train_knowledge_fuser_jointly: True 36 | 37 | datasets: 38 | downstream_tasks_temporal: 39 | type: "default" 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "minimum_text" 50 | eval: 51 | name: "minimum_text" 52 | 53 | # IMPORTANT configs: 54 | fps: 5 55 | task: v1.0_2.4k 56 | subset: kinetics 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of 3000 instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: downstream_tasks_retrieval 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: knowledge_patcher 79 | eval_task: v1.0_2.4k 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/downstream_tasks/temporal-kinetics/eval/side_tuning_zero-shot" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | 93 | # train_splits: ["train"] 94 | valid_splits: ["val"] 95 | # test_splits: ["test"] 96 | 97 | device: "cuda" 98 | world_size: 1 99 | dist_url: "env://" 100 | distributed: True 101 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/clipvip/ego4d/acdybench_ego4d_clipvip_backbone__action_antonym.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_clipvip 5 | model_type: pretrain_vip_base_32 6 | load_pretrained: True 7 | backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 8 | 9 | text_perceiver_config: 10 | dim: 512 # latent query dim 11 | k_v_dim: 512 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 512 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | 
num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "vl_dynamic_ego4d_text" 50 | eval: 51 | name: "vl_dynamic_ego4d_text" 52 | 53 | # IMPORTANT configs: 54 | fps: 5 # if downsampled, use 5 fps 55 | task: "action_antonym" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 56 | neg_sampling_same_clip: 0 # evaluation set to 0 57 | eval_only: True 58 | 59 | # other arguements 60 | k: null # sample a subset of k instances 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | run: 69 | # use custom runner 70 | runner: runner_base_patch_and_fuse 71 | 72 | # task object name 73 | task: actionbench 74 | 75 | # which module is used for inference ["backbone", "knowledge_patcher"] 76 | eval_module: backbone 77 | eval_task: action_antonym 78 | 79 | batch_size_train: 32 80 | batch_size_eval: 4 81 | num_workers: 4 82 | 83 | seed: 42 84 | output_dir: "output/actionbench/eval/ClipViP/ego4d__clipvip_backbone__action_antonym" 85 | 86 | amp: False 87 | resume_ckpt_path: null 88 | 89 | evaluate: True 90 | # train_splits: ["train"] 91 | # valid_splits: ["val"] 92 | test_splits: ["test"] 93 | 94 | device: "cuda" 95 | world_size: 1 96 | dist_url: "env://" 97 | distributed: True 98 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/clipvip/ego4d/acdybench_ego4d_clipvip_backbone__reversed_video.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_clipvip 5 | model_type: pretrain_vip_base_32 6 | load_pretrained: True 7 | backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 8 | 9 | text_perceiver_config: 10 | dim: 512 # latent query dim 11 | k_v_dim: 512 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 512 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 
| train: 49 | name: "vl_dynamic_ego4d_text" 50 | eval: 51 | name: "vl_dynamic_ego4d_text" 52 | 53 | # IMPORTANT configs: 54 | fps: 5 # if downsampled, use 5 fps 55 | task: "reversed_video" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 56 | neg_sampling_same_clip: 0 # evaluation set to 0 57 | eval_only: True 58 | 59 | # other arguements 60 | k: null # sample a subset of k instances 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | run: 69 | # use custom runner 70 | runner: runner_base_patch_and_fuse 71 | 72 | # task object name 73 | task: actionbench 74 | 75 | # which module is used for inference ["backbone", "knowledge_patcher"] 76 | eval_module: backbone 77 | eval_task: reversed_video 78 | 79 | batch_size_train: 32 80 | batch_size_eval: 4 81 | num_workers: 4 82 | 83 | seed: 42 84 | output_dir: "output/actionbench/eval/ClipViP/ego4d__clipvip_backbone__reversed_video" 85 | 86 | amp: False 87 | resume_ckpt_path: null 88 | 89 | evaluate: True 90 | # train_splits: ["train"] 91 | # valid_splits: ["val"] 92 | test_splits: ["test"] 93 | 94 | device: "cuda" 95 | world_size: 1 96 | dist_url: "env://" 97 | distributed: True 98 | -------------------------------------------------------------------------------- /src/ClipViP/src/utils/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | references: UNITER 3 | """ 4 | 5 | import logging 6 | from tensorboardX import SummaryWriter 7 | import os 8 | 9 | _LOG_FMT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' 10 | _DATE_FMT = '%m/%d/%Y %H:%M:%S' 11 | logging.basicConfig(format=_LOG_FMT, datefmt=_DATE_FMT, level=logging.INFO) 12 | LOGGER = logging.getLogger('__main__') # this is the global logger 13 | 14 | 15 | def add_log_to_file(log_path): 16 | fh = logging.FileHandler(log_path) 17 | formatter = logging.Formatter(_LOG_FMT, datefmt=_DATE_FMT) 18 | fh.setFormatter(formatter) 19 | LOGGER.addHandler(fh) 20 | 21 | 22 | class TensorboardLogger(object): 23 | def __init__(self): 24 | self._logger = None 25 | self._global_step = 0 26 | 27 | def create(self, path): 28 | if "AZUREML_TB_PATH" in os.environ: 29 | self._logger = SummaryWriter(os.environ["AZUREML_TB_PATH"]) 30 | else: 31 | self._logger = SummaryWriter(path) 32 | 33 | def noop(self, *args, **kwargs): 34 | return 35 | 36 | def step(self): 37 | self._global_step += 1 38 | 39 | @property 40 | def global_step(self): 41 | return self._global_step 42 | 43 | @global_step.setter 44 | def global_step(self, step): 45 | self._global_step = step 46 | 47 | def log_scalar_dict(self, log_dict, prefix=''): 48 | """ log a dictionary of scalar values""" 49 | if self._logger is None: 50 | return 51 | if prefix: 52 | prefix = f'{prefix}_' 53 | for name, value in log_dict.items(): 54 | if isinstance(value, dict): 55 | self.log_scalar_dict(value, self._global_step, 56 | prefix=f'{prefix}{name}') 57 | else: 58 | self._logger.add_scalar(f'{prefix}{name}', value, 59 | self._global_step) 60 | 61 | def __getattr__(self, name): 62 | if self._logger is None: 63 | return self.noop 64 | return self._logger.__getattribute__(name) 65 | 66 | 67 | TB_LOGGER = TensorboardLogger() 68 | 69 | 70 | class RunningMeter(object): 71 | """ running meteor of a scalar value 72 | (useful for monitoring training loss) 73 | """ 74 | def __init__(self, name, val=None, smooth=0.99): 75 | 
self._name = name 76 | self._sm = smooth 77 | self._val = val 78 | 79 | def __call__(self, value): 80 | self._val = (value if self._val is None 81 | else value*(1-self._sm) + self._val*self._sm) 82 | 83 | def __str__(self): 84 | return f'{self._name}: {self._val:.4f}' 85 | 86 | @property 87 | def val(self): 88 | return self._val 89 | 90 | @property 91 | def name(self): 92 | return self._name 93 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/singularity/ego4d/acdybench_ego4d_singularity_backbone__action_antonym.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_singularity 5 | model_type: singularity_temporal_17m 6 | load_pretrained: True 7 | backbone_config_yaml: "Singularity/configs/pretrained_singularity_temporal_17m.yaml" 8 | 9 | text_perceiver_config: 10 | dim: 256 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 256 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "vl_dynamic_ego4d_text" 50 | eval: 51 | name: "vl_dynamic_ego4d_text" 52 | 53 | # IMPORTANT configs: 54 | fps: 5 # if downsampled, use 5 fps 55 | task: "action_antonym" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 56 | neg_sampling_same_clip: 0 # evaluation set to 0 57 | eval_only: True 58 | 59 | # other arguements 60 | k: null # sample a subset of k instances 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | run: 69 | # use custom runner 70 | runner: runner_base_patch_and_fuse 71 | 72 | # task object name 73 | task: actionbench 74 | 75 | # which module is used for inference ["backbone", "knowledge_patcher"] 76 | eval_module: backbone 77 | eval_task: action_antonym 78 | 79 | batch_size_train: 32 80 | batch_size_eval: 4 81 | num_workers: 4 82 | 83 | seed: 42 84 | output_dir: "output/actionbench/eval/Singularity/ego4d__Singularity_backbone__action_antonym" 85 | 86 | amp: False 87 | resume_ckpt_path: null 88 | 89 | evaluate: True 90 | # train_splits: ["train"] 91 | # valid_splits: ["val"] 92 | test_splits: ["test"] 93 | 94 | device: "cuda" 95 | world_size: 1 96 | dist_url: "env://" 97 | distributed: True 98 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/singularity/ego4d/acdybench_ego4d_singularity_backbone__reversed_video.yaml: 
-------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_singularity 5 | model_type: singularity_temporal_17m 6 | load_pretrained: True 7 | backbone_config_yaml: "Singularity/configs/pretrained_singularity_temporal_17m.yaml" 8 | 9 | text_perceiver_config: 10 | dim: 256 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 256 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "vl_dynamic_ego4d_text" 50 | eval: 51 | name: "vl_dynamic_ego4d_text" 52 | 53 | # IMPORTANT configs: 54 | fps: 5 # if downsampled, use 5 fps 55 | task: "reversed_video" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 56 | neg_sampling_same_clip: 0 # evaluation set to 0 57 | eval_only: True 58 | 59 | # other arguements 60 | k: null # sample a subset of k instances 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | run: 69 | # use custom runner 70 | runner: runner_base_patch_and_fuse 71 | 72 | # task object name 73 | task: actionbench 74 | 75 | # which module is used for inference ["backbone", "knowledge_patcher"] 76 | eval_module: backbone 77 | eval_task: reversed_video 78 | 79 | batch_size_train: 32 80 | batch_size_eval: 4 81 | num_workers: 4 82 | 83 | seed: 42 84 | output_dir: "output/actionbench/eval/Singularity/ego4d__Singularity_backbone__reversed_video" 85 | 86 | amp: False 87 | resume_ckpt_path: null 88 | 89 | evaluate: True 90 | # train_splits: ["train"] 91 | # valid_splits: ["val"] 92 | test_splits: ["test"] 93 | 94 | device: "cuda" 95 | world_size: 1 96 | dist_url: "env://" 97 | distributed: True 98 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/temporal_ssv2/side_tuning.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | pretrained: "/checkpoint_best.pth" #TODO: set trained patch_and_fuse checkpoint path 8 | 9 | text_perceiver_config: 10 | dim: 768 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 768 # latent query dim 20 | k_v_dim: 1024 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | 
heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: True 32 | num_frms: 8 33 | temp_emb_drop_out: 0.1 34 | if_as_knowledge_fuser: True 35 | knowledge_fuser_type: "side_tuning" 36 | train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | downstream_tasks_temporal: 40 | type: "default" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 56 | task: v1.0_2.4k 57 | subset: ssv2 58 | neg_sampling_same_clip: 0 59 | eval_only: True 60 | 61 | # other arguements 62 | train_k: null # sample a subset of k instances 63 | eval_k: null # sample a subset of k instances, reduce evaluation time 64 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 65 | num_frm: 8 66 | train_frame_height: 224 67 | train_frame_width: 224 68 | eval_frame_height: 224 69 | eval_frame_width: 224 70 | 71 | run: 72 | # use custom runner 73 | runner: runner_base_patch_and_fuse 74 | 75 | # task object name 76 | task: downstream_tasks_retrieval 77 | 78 | # which module is used for inference ["backbone", "knowledge_patcher"] 79 | eval_module: knowledge_patcher 80 | eval_task: v1.0_2.4k 81 | 82 | batch_size_train: 32 83 | batch_size_eval: 4 84 | num_workers: 4 85 | 86 | seed: 42 87 | output_dir: "output/downstream_tasks/temporal_ssv2/side_tuning" 88 | 89 | amp: False 90 | resume_ckpt_path: null 91 | 92 | evaluate: True 93 | 94 | # train_splits: ["train"] 95 | valid_splits: ["val"] 96 | # test_splits: ["test"] 97 | 98 | device: "cuda" 99 | world_size: 1 100 | dist_url: "env://" 101 | distributed: True 102 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/clipvip/ego4d/acdybench_ego4d_clipvip_backbone__object_shuffle.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_clipvip 5 | model_type: pretrain_vip_base_32 6 | load_pretrained: True 7 | backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 8 | 9 | text_perceiver_config: 10 | dim: 512 # latent query dim 11 | k_v_dim: 512 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 512 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 40 | type: "object_shuffled" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "vl_dynamic_ego4d_text" 51 | eval: 52 
| name: "vl_dynamic_ego4d_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: "object_shuffle" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 57 | neg_sampling_same_clip: 0 # evaluation set to 0 58 | eval_only: True 59 | 60 | # other arguements 61 | k: null # sample a subset of k instances 62 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 63 | num_frm: 8 64 | train_frame_height: 224 65 | train_frame_width: 224 66 | eval_frame_height: 224 67 | eval_frame_width: 224 68 | 69 | run: 70 | # use custom runner 71 | runner: runner_base_patch_and_fuse 72 | 73 | # task object name 74 | task: actionbench 75 | 76 | # which module is used for inference ["backbone", "knowledge_patcher"] 77 | eval_module: backbone 78 | eval_task: object_shuffle 79 | 80 | batch_size_train: 32 81 | batch_size_eval: 4 82 | num_workers: 4 83 | 84 | seed: 42 85 | output_dir: "output/actionbench/eval/ClipViP/ego4d__clipvip_backbone__object_shuffle" 86 | 87 | amp: False 88 | resume_ckpt_path: null 89 | 90 | evaluate: True 91 | # train_splits: ["train"] 92 | # valid_splits: ["val"] 93 | test_splits: ["test"] 94 | 95 | device: "cuda" 96 | world_size: 1 97 | dist_url: "env://" 98 | distributed: True 99 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/pretrained/pretrain_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "hdvila", 5 | "vis_format": "videoframe", 6 | "txt": "datasets/hdvila/hdvila_subtitles_92m_db", 7 | "vis": "youtube_data/ytt180m/video_clips_3fps", 8 | "vid_cap_path": "datasets/hdvila/hdvila_captions_db", 9 | "vid_txt": "subtitle", 10 | "img_dir": "", 11 | "cap_path": "", 12 | "img_source": "", 13 | "img_ratio": 0 14 | } 15 | ], 16 | "val_datasets": [ 17 | { 18 | "name": "msrvtt", 19 | "vis_format": "video", 20 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 21 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 22 | }, 23 | { 24 | "name": "how2", 25 | "vis_format": "video", 26 | "txt": "clip_data/vis_db/pretrain_data/test_howto_1k.jsonl", 27 | "vis": "youtube_data/ytt180m/video_clips_3fps" 28 | }, 29 | { 30 | "name": "ours", 31 | "vis_format": "video", 32 | "txt": "clip_data/vis_db/pretrain_data/test_full_1k.jsonl", 33 | "vis": "youtube_data/ytt180m/video_clips_3fps" 34 | } 35 | ], 36 | 37 | "train_n_clips": 1, 38 | "train_num_frms": 12, 39 | "test_n_clips": 1, 40 | "test_num_frms": 12, 41 | "sample_rate": 0, 42 | "sample_jitter": 1, 43 | "video_res": [240, 320], 44 | "input_res": [224, 224], 45 | "max_txt_len": 70, 46 | 47 | "e2e_weights_path": "pretrained_ckpt/ClipViP/pretrain_clipvip_base_32.pt", 48 | "clip_weights": "openai/clip-vit-base-patch32", 49 | "clip_config": "openai/clip-vit-base-patch32", 50 | "clip_vision_additional_config": { 51 | "type": "ViP", 52 | "temporal_size": 12, 53 | "if_use_temporal_embed": 1, 54 | "logit_scale_init_value": 4.60, 55 | "add_cls_num": 3 56 | }, 57 | 58 | "train_batch_size": 32, 59 | "test_batch_size": 32, 60 | "max_n_example_per_group": 1, 61 | "gradient_accumulation_steps": 1, 62 | "n_workers": 8, 63 | "pin_mem": 1, 64 | "fp16": 1, 65 | "amp_level": "O2", 66 | "seed": 42, 67 | 68 | "optim": "adamw", 69 | "betas": [0.9, 0.98], 70 | "learning_rate": 5e-6, 71 | "weight_decay": 0.05, 72 | "lr_mul": 1, 73 | "lr_mul_prefix": "", 74 | "loss_config": { 75 | "loss_name": 
"NCELearnableTempLoss_vsc_fc", 76 | "if_gather": 1 77 | }, 78 | "warmup_ratio": 0.01, 79 | "decay": "cosine", 80 | "grad_norm": 5.0, 81 | 82 | "num_train_epochs": 5, 83 | "min_valid_steps": 1, 84 | "num_valid": 100, 85 | "only_valid_steps": 1000, 86 | "save_steps_ratio": 0.01, 87 | "output_dir": "vidclip_data/output/pretrain/pretrain_vip_base_32/", 88 | "if_tb_log": 1, 89 | "if_model_saver": 1, 90 | "if_log2file": 1, 91 | "dummy_data": 0 92 | } 93 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/temporal_ssv2/patch_and_fuse.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | pretrained: "pretrained_ckpt/PatchAndFuse/downstream_tasks/ssv2_template_patch_and_fuse.pth" #TODO: set trained patch_and_fuse checkpoint path 8 | 9 | text_perceiver_config: 10 | dim: 768 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 768 # latent query dim 20 | k_v_dim: 1024 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: True 32 | num_frms: 8 33 | temp_emb_drop_out: 0.1 34 | if_as_knowledge_fuser: True 35 | knowledge_fuser_type: "xattn" 36 | train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | downstream_tasks_temporal: 40 | type: "default" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 56 | task: v1.0_2.4k 57 | subset: ssv2 58 | neg_sampling_same_clip: 0 59 | eval_only: True 60 | 61 | # other arguements 62 | train_k: null # sample a subset of k instances 63 | eval_k: null # sample a subset of k instances, reduce evaluation time 64 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 65 | num_frm: 8 66 | train_frame_height: 224 67 | train_frame_width: 224 68 | eval_frame_height: 224 69 | eval_frame_width: 224 70 | 71 | run: 72 | # use custom runner 73 | runner: runner_base_patch_and_fuse 74 | 75 | # task object name 76 | task: downstream_tasks_retrieval 77 | 78 | # which module is used for inference ["backbone", "knowledge_patcher"] 79 | eval_module: knowledge_patcher 80 | eval_task: v1.0_2.4k 81 | 82 | batch_size_train: 32 83 | batch_size_eval: 4 84 | num_workers: 4 85 | 86 | seed: 42 87 | output_dir: "output/downstream_tasks/temporal_ssv2/patch_and_fuse" 88 | 89 | amp: False 90 | resume_ckpt_path: null 91 | 92 | evaluate: True 93 | 94 | # train_splits: ["train"] 95 | valid_splits: ["val"] 96 | # test_splits: ["test"] 97 | 98 | device: "cuda" 99 | world_size: 1 100 | dist_url: "env://" 101 | distributed: True 102 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/singularity/ego4d/acdybench_ego4d_singularity_backbone__object_shuffle.yaml: 
-------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_singularity 5 | model_type: singularity_temporal_17m 6 | load_pretrained: True 7 | backbone_config_yaml: "Singularity/configs/pretrained_singularity_temporal_17m.yaml" 8 | 9 | text_perceiver_config: 10 | dim: 256 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 256 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 40 | type: "object_shuffled" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "vl_dynamic_ego4d_text" 51 | eval: 52 | name: "vl_dynamic_ego4d_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: "object_shuffle" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 57 | neg_sampling_same_clip: 0 # evaluation set to 0 58 | eval_only: True 59 | 60 | # other arguements 61 | k: null # sample a subset of k instances 62 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 63 | num_frm: 8 64 | train_frame_height: 224 65 | train_frame_width: 224 66 | eval_frame_height: 224 67 | eval_frame_width: 224 68 | 69 | run: 70 | # use custom runner 71 | runner: runner_base_patch_and_fuse 72 | 73 | # task object name 74 | task: actionbench 75 | 76 | # which module is used for inference ["backbone", "knowledge_patcher"] 77 | eval_module: backbone 78 | eval_task: object_shuffle 79 | 80 | batch_size_train: 32 81 | batch_size_eval: 4 82 | num_workers: 4 83 | 84 | seed: 42 85 | output_dir: "output/actionbench/eval/Singularity/ego4d__Singularity_backbone__object_shuffle" 86 | 87 | amp: False 88 | resume_ckpt_path: null 89 | 90 | evaluate: True 91 | # train_splits: ["train"] 92 | # valid_splits: ["val"] 93 | test_splits: ["test"] 94 | 95 | device: "cuda" 96 | world_size: 1 97 | dist_url: "env://" 98 | distributed: True 99 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/ssv2_label/side_tuning.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | pretrained: "/checkpoint_best.pth" #TODO: set trained patch_and_fuse checkpoint path 8 | 9 | text_perceiver_config: 10 | dim: 768 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 768 # latent query dim 20 | k_v_dim: 1024 # vision_width 21 | depth: 
1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: True 32 | num_frms: 8 33 | temp_emb_drop_out: 0.1 34 | if_as_knowledge_fuser: True 35 | knowledge_fuser_type: "side_tuning" 36 | train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | downstream_tasks_retrieval_ssv2_224x224_5fps: 40 | type: "default" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: ssv2_label # ssv2_label, ssv2_template 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of k instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: downstream_tasks_retrieval 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: knowledge_patcher 79 | eval_task: ssv2_label 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/downstream_tasks/ssv2_label/side_tuning" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | 93 | # train_splits: ["train"] 94 | valid_splits: ["val"] 95 | # test_splits: ["test"] 96 | 97 | device: "cuda" 98 | world_size: 1 99 | dist_url: "env://" 100 | distributed: True 101 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/ssv2_label/patch_and_fuse.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | pretrained: "pretrained_ckpt/PatchAndFuse/downstream_tasks/ssv2_label_patch_and_fuse.pth" #TODO: set trained patch_and_fuse checkpoint path 8 | 9 | text_perceiver_config: 10 | dim: 768 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 768 # latent query dim 20 | k_v_dim: 1024 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: True 32 | num_frms: 8 33 | temp_emb_drop_out: 0.1 34 | if_as_knowledge_fuser: True 35 | knowledge_fuser_type: "xattn" 36 | train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | downstream_tasks_retrieval_ssv2_224x224_5fps: 40 | type: "default" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: 
"minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: ssv2_label # ssv2_label, ssv2_template 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of k instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: downstream_tasks_retrieval 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: knowledge_patcher 79 | eval_task: ssv2_label 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/downstream_tasks/ssv2_label/patch_and_fuse" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | 93 | # train_splits: ["train"] 94 | valid_splits: ["val"] 95 | # test_splits: ["test"] 96 | 97 | device: "cuda" 98 | world_size: 1 99 | dist_url: "env://" 100 | distributed: True 101 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/ssv2_template/side_tuning.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | pretrained: "/checkpoint_best.pth" #TODO: set trained patch_and_fuse checkpoint path 8 | 9 | text_perceiver_config: 10 | dim: 768 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 768 # latent query dim 20 | k_v_dim: 1024 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: True 32 | num_frms: 8 33 | temp_emb_drop_out: 0.1 34 | if_as_knowledge_fuser: True 35 | knowledge_fuser_type: "side_tuning" 36 | train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | downstream_tasks_retrieval_ssv2_224x224_5fps: 40 | type: "default" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: ssv2_template # ssv2_label, ssv2_template 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of k instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: 
downstream_tasks_retrieval 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: knowledge_patcher 79 | eval_task: ssv2_template 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/downstream_tasks/ssv2_template/side_tuning" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | 93 | # train_splits: ["train"] 94 | valid_splits: ["val"] 95 | # test_splits: ["test"] 96 | 97 | device: "cuda" 98 | world_size: 1 99 | dist_url: "env://" 100 | distributed: True 101 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/ssv2_template/patch_and_fuse.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | pretrained: "pretrained_ckpt/PatchAndFuse/downstream_tasks/ssv2_template_patch_and_fuse.pth" #TODO: set trained patch_and_fuse checkpoint path 8 | 9 | text_perceiver_config: 10 | dim: 768 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 768 # latent query dim 20 | k_v_dim: 1024 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: True 32 | num_frms: 8 33 | temp_emb_drop_out: 0.1 34 | if_as_knowledge_fuser: True 35 | knowledge_fuser_type: "xattn" 36 | train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | downstream_tasks_retrieval_ssv2_224x224_5fps: 40 | type: "default" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: ssv2_template # ssv2_label, ssv2_template 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of k instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: downstream_tasks_retrieval 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: knowledge_patcher 79 | eval_task: ssv2_template 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/downstream_tasks/ssv2_template/patch_and_fuse" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | 93 | # train_splits: ["train"] 94 | valid_splits: ["val"] 95 | # test_splits: ["test"] 96 | 97 | device: "cuda" 98 | world_size: 1 99 | dist_url: "env://" 100 | distributed: True 101 | 
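The `ssv2_template` eval configs above illustrate the pattern used across the downstream tasks: `side_tuning.yaml` and `patch_and_fuse.yaml` share the same backbone, perceiver, and dataset settings and differ essentially in `knowledge_fuser_type` (`"side_tuning"` vs `"xattn"`), the `pretrained` checkpoint they load, and the `output_dir`. A minimal sketch of loading one of these YAML files to check exactly those fields is shown below; the use of PyYAML's `yaml.safe_load` and the repo-relative path are illustrative assumptions rather than the project's own config loader.

```
import yaml  # PyYAML, assumed available in the environment

# Illustrative only: load one eval config and print the fields that
# distinguish the side_tuning and patch_and_fuse variants.
CFG = "src/configs/projects/eval/downstream_task/ssv2_template/patch_and_fuse.yaml"  # assumed path

with open(CFG) as f:
    cfg = yaml.safe_load(f)

model, run = cfg["model"], cfg["run"]
print(model["knowledge_fuser_type"])          # xattn (side_tuning.yaml: side_tuning)
print(model["pretrained"])                    # trained Patch-and-Fuse checkpoint to evaluate
print(run["eval_module"], run["eval_task"])   # knowledge_patcher ssv2_template
print(run["output_dir"])                      # output/downstream_tasks/ssv2_template/patch_and_fuse
```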
-------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/clipvip/ssv2/acdybench_ssv2_clipvip_backbone__action_antonym.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_clipvip 5 | model_type: pretrain_vip_base_32 6 | load_pretrained: True 7 | backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 8 | 9 | text_perceiver_config: 10 | dim: 512 # latent query dim 11 | k_v_dim: 512 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 512 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ssv2_224x224_5fps: 40 | type: "action_antonyms_and_object_shuffled" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: "action_antonym" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of k instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: actionbench 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: backbone 79 | eval_task: action_antonym 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/actionbench/eval/ClipViP/ssv2__clipvip_backbone__action_antonym" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | # train_splits: ["train"] 93 | valid_splits: ["val"] 94 | # test_splits: ["test"] 95 | 96 | device: "cuda" 97 | world_size: 1 98 | dist_url: "env://" 99 | distributed: True 100 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/clipvip/ssv2/acdybench_ssv2_clipvip_backbone__object_shuffle.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_clipvip 5 | model_type: pretrain_vip_base_32 6 | load_pretrained: True 7 | backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 8 | 9 | text_perceiver_config: 10 | dim: 512 # latent query dim 11 | k_v_dim: 512 # 
text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 512 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ssv2_224x224_5fps: 40 | type: "action_antonyms_and_object_shuffled" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: "object_shuffle" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of k instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: actionbench 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: backbone 79 | eval_task: object_shuffle 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/actionbench/eval/ClipViP/ssv2__clipvip_backbone__object_shuffle" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | # train_splits: ["train"] 93 | valid_splits: ["val"] 94 | # test_splits: ["test"] 95 | 96 | device: "cuda" 97 | world_size: 1 98 | dist_url: "env://" 99 | distributed: True 100 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/moments_in_time/patch_and_fuse_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | pretrained: "pretrained_ckpt/PatchAndFuse/downstream_tasks/ssv2_label_patch_and_fuse.pth" 7 | 8 | text_perceiver_config: 9 | dim: 768 # latent query dim 10 | k_v_dim: 768 # text_width 11 | depth: 1 12 | dim_head: 64 13 | heads: 8 14 | num_latents: 16 15 | ff_mult: 2 16 | 17 | vision_perceiver_config: 18 | dim: 768 # latent query dim 19 | k_v_dim: 1024 # vision_width 20 | depth: 1 21 | dim_head: 64 22 | heads: 8 23 | num_latents: 16 24 | ff_mult: 2 25 | 26 | objectives: ["video_text_contrastive"] 27 | loss_weighting: [1.0] 28 | if_use_attn_guidance: False 29 | if_use_dual_perceiver: False 30 | if_add_temporal_emebdding: True 31 | num_frms: 8 32 | temp_emb_drop_out: 0.1 33 | if_as_knowledge_fuser: True 34 | knowledge_fuser_type: "xattn" 35 | train_knowledge_fuser_jointly: True 36 | 37 | datasets: 38 | 
downstream_tasks_moment_in_time: 39 | type: "default" 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "minimum_text" 50 | eval: 51 | name: "minimum_text" 52 | 53 | # IMPORTANT configs: 54 | task: video_action_retrieval_2k 55 | neg_sampling_same_clip: 0 56 | eval_only: True 57 | 58 | # other arguements 59 | train_k: null # sample a subset of k instances 60 | eval_k: null # sample a subset of 3000 instances, reduce evaluation time 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | run: 69 | # use custom runner 70 | runner: runner_base_patch_and_fuse 71 | 72 | # task object name 73 | task: downstream_tasks_retrieval 74 | 75 | # which module is used for inference ["backbone", "knowledge_patcher"] 76 | eval_module: knowledge_patcher 77 | eval_task: video_action_retrieval_2k 78 | 79 | ## NOTE: uncomment the following to use Backbone Ensemble 80 | # eval_method: ensemble_with_backbone 81 | 82 | batch_size_train: 32 83 | batch_size_eval: 4 84 | num_workers: 4 85 | 86 | seed: 42 87 | output_dir: "output/downstream_tasks/MomentsInTime/eval/patch_and_fuse_zero-shot" 88 | 89 | amp: False 90 | resume_ckpt_path: null 91 | 92 | evaluate: True 93 | 94 | # train_splits: ["train"] 95 | valid_splits: ["val"] 96 | # test_splits: ["test"] 97 | 98 | device: "cuda" 99 | world_size: 1 100 | dist_url: "env://" 101 | distributed: True 102 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/clipvip/ssv2/acdybench_ssv2_clipvip_backbone__reversed_video.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_clipvip 5 | model_type: pretrain_vip_base_32 6 | load_pretrained: True 7 | backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 8 | 9 | text_perceiver_config: 10 | dim: 512 # latent query dim 11 | k_v_dim: 512 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 512 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | 39 | datasets: 40 | actionbench_ssv2_224x224_5fps: 41 | type: "action_antonyms_and_object_shuffled" 42 | vis_processor: 43 | train: 44 | name: "video_train" 45 | image_size: 224 46 | eval: 47 | name: "internvideo_eval" 48 | image_size: 224 49 | text_processor: 50 | train: 51 | name: "minimum_text" 52 | eval: 53 | name: "minimum_text" 54 | 55 | # IMPORTANT configs: 56 | fps: 5 # if downsampled, use 5 fps 57 | task: "reversed_video" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 58 | neg_sampling_same_clip: 0 59 | eval_only: True 60 | 61 | # other arguements 62 | train_k: null # sample a subset of 
k instances 63 | eval_k: null # sample a subset of k instances, reduce evaluation time 64 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 65 | num_frm: 8 66 | train_frame_height: 224 67 | train_frame_width: 224 68 | eval_frame_height: 224 69 | eval_frame_width: 224 70 | 71 | run: 72 | # use custom runner 73 | runner: runner_base_patch_and_fuse 74 | 75 | # task object name 76 | task: actionbench 77 | 78 | # which module is used for inference ["backbone", "knowledge_patcher"] 79 | eval_module: backbone 80 | eval_task: reversed_video 81 | 82 | batch_size_train: 32 83 | batch_size_eval: 4 84 | num_workers: 4 85 | 86 | seed: 42 87 | output_dir: "output/actionbench/eval/ClipViP/ssv2__clipvip_backbone__reversed_video" 88 | 89 | amp: False 90 | resume_ckpt_path: null 91 | 92 | evaluate: True 93 | # train_splits: ["train"] 94 | valid_splits: ["val"] 95 | # test_splits: ["test"] 96 | 97 | device: "cuda" 98 | world_size: 1 99 | dist_url: "env://" 100 | distributed: True 101 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/temporal_kinetics/patch_and_fuse_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | pretrained: "pretrained_ckpt/PatchAndFuse/downstream_tasks/ssv2_label_patch_and_fuse.pth" 7 | 8 | text_perceiver_config: 9 | dim: 768 # latent query dim 10 | k_v_dim: 768 # text_width 11 | depth: 1 12 | dim_head: 64 13 | heads: 8 14 | num_latents: 16 15 | ff_mult: 2 16 | 17 | vision_perceiver_config: 18 | dim: 768 # latent query dim 19 | k_v_dim: 1024 # vision_width 20 | depth: 1 21 | dim_head: 64 22 | heads: 8 23 | num_latents: 16 24 | ff_mult: 2 25 | 26 | objectives: ["video_text_contrastive"] 27 | loss_weighting: [1.0] 28 | if_use_attn_guidance: False 29 | if_use_dual_perceiver: False 30 | if_add_temporal_emebdding: True 31 | num_frms: 8 32 | temp_emb_drop_out: 0.1 33 | if_as_knowledge_fuser: True 34 | knowledge_fuser_type: "xattn" 35 | train_knowledge_fuser_jointly: True 36 | 37 | datasets: 38 | downstream_tasks_temporal: 39 | type: "default" 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "minimum_text" 50 | eval: 51 | name: "minimum_text" 52 | 53 | # IMPORTANT configs: 54 | fps: 5 55 | task: v1.0_2.4k 56 | subset: kinetics 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of 3000 instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: downstream_tasks_retrieval 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: knowledge_patcher 79 | eval_task: v1.0_2.4k 80 | 81 | ## NOTE: uncomment the following to use Backbone Ensemble 82 | # eval_method: ensemble_with_backbone 83 | 84 | batch_size_train: 32 85 | batch_size_eval: 4 86 | 
num_workers: 4 87 | 88 | seed: 42 89 | output_dir: "output/downstream_tasks/temporal-kinetics/eval/patch_and_fuse_zero-shot" 90 | 91 | amp: False 92 | resume_ckpt_path: null 93 | 94 | evaluate: True 95 | 96 | # train_splits: ["train"] 97 | valid_splits: ["val"] 98 | # test_splits: ["test"] 99 | 100 | device: "cuda" 101 | world_size: 1 102 | dist_url: "env://" 103 | distributed: True 104 | -------------------------------------------------------------------------------- /src/Singularity/configs/qa_anet.yaml: -------------------------------------------------------------------------------- 1 | dataset_name: anet 2 | data_root: ${oc.env:SL_DATA_DIR}/videos_images 3 | anno_root_downstream: ${oc.env:SL_DATA_DIR}/anno_downstream 4 | train_file: 5 | - ['${anno_root_downstream}/anet_qa_train.json', '${data_root}/activity_net_2fps_360', video] 6 | test_types: [val, ] # one of [minival, test] 7 | test_file: 8 | val: ['${anno_root_downstream}/anet_qa_val.json', '${data_root}/activity_net_2fps_360', video] 9 | test: ['${anno_root_downstream}/anet_qa_test.json', '${data_root}/activity_net_2fps_360', video] 10 | stop_key: val # one of the key in `test_file` 11 | answer_list: ${anno_root_downstream}/anet_qa_answer_list.json # list of answer words 12 | 13 | text_encoder: bert-base-uncased 14 | text_decoder: bert-base-uncased 15 | bert_config: configs/config_bert.json 16 | vit_type: beit # items in ${vit_zoo} 17 | vit_zoo: # from huggingface 18 | beit: microsoft/beit-base-patch16-224-pt22k-ft22k 19 | vit_name_or_pretrained_path: ${vit_zoo[${vit_type}]} 20 | temporal_vision_encoder: 21 | enable: False 22 | num_layers: 2 23 | update_pooler_embed: False 24 | add_temporal_embed: False # whether to add temporal embed to encoded frames 25 | 26 | image_res: 224 27 | embed_dim: 256 # -- not used 28 | video_input: # input -- not used 29 | num_frames: 1 30 | reader: decord # one of [decord, av] 31 | sample_type: rand 32 | num_frames_test: 4 # num_frames during inference/test 33 | sample_type_test: middle 34 | 35 | batch_size: 36 | image: 128 37 | video: 32 38 | batch_size_test: 39 | image: 64 40 | video: 64 41 | k_test: 128 42 | temp: 0.07 # -- not used 43 | eos: '[SEP]' 44 | max_q_len: 25 45 | max_a_len: 5 46 | 47 | optimizer: 48 | opt: adamW 49 | lr: 1e-5 50 | opt_betas: [0.9, 0.999] # default 51 | weight_decay: 0.02 52 | max_grad_norm: -1 # requires a positive float, use -1 to disable 53 | different_lr: # use a different lr for some modules, e.g., larger lr for new modules 54 | enable: False 55 | module_names: [] 56 | lr: 1e-3 57 | 58 | scheduler: 59 | sched: cosine 60 | epochs: 10 61 | min_lr_multi: 0.1 # min_lr will be `optimizer.lr * min_lr_multi` 62 | warmup_epochs: 0.5 # float 63 | 64 | output_dir: None # output dir 65 | pretrained_path: None # path to pretrained model weights 66 | resume: False # if True, load optimizer and scheduler states as well 67 | evaluate: False 68 | # `eval_frame_ensemble': how do we aggregate scores if `video_input.num_frames_test' > `video_input.num_frames' 69 | # `concat': concat frames before input to multi-modal encoder, i.e., early fusion 70 | # `mean', `max', `lse': mean/max/lse-pool scores after multi-modal encoder, i.e., late fusion, as in ClipBERT 71 | eval_frame_ensemble: concat # [concat, max, mean, lse] 72 | device: cuda 73 | seed: 42 74 | log_freq: 100 75 | dist_url: env:// 76 | distributed: True 77 | fp16: True 78 | debug: False 79 | num_workers: 24 80 | 81 | wandb: 82 | enable: False 83 | entity: None # username or teamname to store the runs, see 
https://docs.wandb.ai/ref/python/init 84 | project: vqa # setup in your command line 85 | 86 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/singularity/ssv2/acdybench_ssv2_singularity_backbone__action_antonym.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_singularity 5 | model_type: singularity_temporal_17m 6 | load_pretrained: True 7 | backbone_config_yaml: "Singularity/configs/pretrained_singularity_temporal_17m.yaml" 8 | 9 | text_perceiver_config: 10 | dim: 256 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 256 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ssv2_224x224_5fps: 40 | type: "action_antonyms_and_object_shuffled" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: "action_antonym" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of k instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: actionbench 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: backbone 79 | eval_task: action_antonym 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/actionbench/eval/Singularity/ssv2__Singularity_backbone__action_antonym" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | # train_splits: ["train"] 93 | valid_splits: ["val"] 94 | # test_splits: ["test"] 95 | 96 | device: "cuda" 97 | world_size: 1 98 | dist_url: "env://" 99 | distributed: True 100 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/singularity/ssv2/acdybench_ssv2_singularity_backbone__object_shuffle.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_singularity 5 | model_type: singularity_temporal_17m 6 | load_pretrained: True 7 | backbone_config_yaml: 
"Singularity/configs/pretrained_singularity_temporal_17m.yaml" 8 | 9 | text_perceiver_config: 10 | dim: 256 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 256 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ssv2_224x224_5fps: 40 | type: "action_antonyms_and_object_shuffled" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: "object_shuffle" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of k instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: actionbench 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: backbone 79 | eval_task: object_shuffle 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/actionbench/eval/Singularity/ssv2__Singularity_backbone__object_shuffle" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | # train_splits: ["train"] 93 | valid_splits: ["val"] 94 | # test_splits: ["test"] 95 | 96 | device: "cuda" 97 | world_size: 1 98 | dist_url: "env://" 99 | distributed: True 100 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/singularity/ssv2/acdybench_ssv2_singularity_backbone__reversed_video.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_singularity 5 | model_type: singularity_temporal_17m 6 | load_pretrained: True 7 | backbone_config_yaml: "Singularity/configs/pretrained_singularity_temporal_17m.yaml" 8 | 9 | text_perceiver_config: 10 | dim: 256 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 256 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 
| num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | 39 | datasets: 40 | actionbench_ssv2_224x224_5fps: 41 | type: "action_antonyms_and_object_shuffled" 42 | vis_processor: 43 | train: 44 | name: "video_train" 45 | image_size: 224 46 | eval: 47 | name: "internvideo_eval" 48 | image_size: 224 49 | text_processor: 50 | train: 51 | name: "minimum_text" 52 | eval: 53 | name: "minimum_text" 54 | 55 | # IMPORTANT configs: 56 | fps: 5 # if downsampled, use 5 fps 57 | task: "reversed_video" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 58 | neg_sampling_same_clip: 0 59 | eval_only: True 60 | 61 | # other arguements 62 | train_k: null # sample a subset of k instances 63 | eval_k: null # sample a subset of k instances, reduce evaluation time 64 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 65 | num_frm: 8 66 | train_frame_height: 224 67 | train_frame_width: 224 68 | eval_frame_height: 224 69 | eval_frame_width: 224 70 | 71 | run: 72 | # use custom runner 73 | runner: runner_base_patch_and_fuse 74 | 75 | # task object name 76 | task: actionbench 77 | 78 | # which module is used for inference ["backbone", "knowledge_patcher"] 79 | eval_module: backbone 80 | eval_task: reversed_video 81 | 82 | batch_size_train: 32 83 | batch_size_eval: 4 84 | num_workers: 4 85 | 86 | seed: 42 87 | output_dir: "output/actionbench/eval/Singularity/ssv2__Singularity_backbone__reversed_video" 88 | 89 | amp: False 90 | resume_ckpt_path: null 91 | 92 | evaluate: True 93 | # train_splits: ["train"] 94 | valid_splits: ["val"] 95 | # test_splits: ["test"] 96 | 97 | device: "cuda" 98 | world_size: 1 99 | dist_url: "env://" 100 | distributed: True 101 | -------------------------------------------------------------------------------- /src/configs/projects/train/downstream_tasks/nextqa/Patch_and_Fuse.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo_mcqa 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/models/InternVideo-MM-L-14.ckpt" 6 | pretrained: "/checkpoint_best.pth" #TODO: set trained Knowledge Patcher checkpoint path 7 | 8 | text_perceiver_config: 9 | dim: 768 # latent query dim 10 | k_v_dim: 768 # text_width 11 | depth: 1 12 | dim_head: 64 13 | heads: 8 14 | num_latents: 16 15 | ff_mult: 2 16 | 17 | vision_perceiver_config: 18 | dim: 768 # latent query dim 19 | k_v_dim: 1024 # vision_width 20 | depth: 1 21 | dim_head: 64 22 | heads: 8 23 | num_latents: 16 24 | ff_mult: 2 25 | 26 | objectives: ["mcqa_loss"] 27 | loss_weighting: [1.0] 28 | if_use_attn_guidance: False 29 | if_use_dual_perceiver: False 30 | if_add_temporal_emebdding: False 31 | num_frms: 8 32 | temp_emb_drop_out: 0.0 33 | knowledge_fuser_type: "xattn" 34 | if_as_knowledge_fuser: True 35 | train_knowledge_fuser_jointly: True 36 | 37 | datasets: 38 | downstream_tasks_qa_nextqa_224x224_5fps: 39 | type: "default" 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "minimum_text" 50 | eval: 51 | name: "minimum_text" 52 | 53 | # IMPORTANT configs: 54 | task: 5way-multiple-choice-qa 55 | neg_sampling_same_clip: 0 56 | eval_only: False 57 | 58 | # other arguements 59 | train_k: null # sample a subset 
of k instances 60 | eval_k: null # sample a subset of k instances, reduce evaluation time 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | 69 | run: 70 | # use custom runner 71 | runner: runner_base_patch_and_fuse 72 | 73 | # task object name 74 | task: actionbench 75 | 76 | # which module is used for inference ["backbone", "knowledge_patcher"] 77 | eval_module: knowledge_patcher 78 | eval_task: video_text_matching 79 | 80 | # optimizer 81 | lr_sched: "linear_warmup_cosine_lr" 82 | init_lr: 1e-5 83 | min_lr: 0 84 | weight_decay: 0.05 85 | max_epoch: 4 86 | 87 | batch_size_train: 16 88 | batch_size_eval: 4 89 | 90 | num_workers: 4 91 | 92 | seed: 42 93 | output_dir: "output/downstream_tasks/NextQA/train/patch_and_fuse_internvideo" 94 | 95 | amp: False 96 | resume_ckpt_path: null 97 | 98 | evaluate: False 99 | 100 | train_splits: ["train"] 101 | valid_splits: ["val"] 102 | # test_splits: ["test"] 103 | 104 | device: "cuda" 105 | world_size: 1 106 | dist_url: "env://" 107 | distributed: True -------------------------------------------------------------------------------- /src/configs/projects/train/downstream_tasks/nextqa/Side_Tuning.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo_mcqa 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/models/InternVideo-MM-L-14.ckpt" 6 | pretrained: "/checkpoint_best.pth" #TODO: set trained Knowledge Patcher checkpoint path 7 | 8 | text_perceiver_config: 9 | dim: 768 # latent query dim 10 | k_v_dim: 768 # text_width 11 | depth: 1 12 | dim_head: 64 13 | heads: 8 14 | num_latents: 16 15 | ff_mult: 2 16 | 17 | vision_perceiver_config: 18 | dim: 768 # latent query dim 19 | k_v_dim: 1024 # vision_width 20 | depth: 1 21 | dim_head: 64 22 | heads: 8 23 | num_latents: 16 24 | ff_mult: 2 25 | 26 | objectives: ["mcqa_loss"] 27 | loss_weighting: [1.0] 28 | if_use_attn_guidance: False 29 | if_use_dual_perceiver: False 30 | if_add_temporal_emebdding: False 31 | num_frms: 8 32 | temp_emb_drop_out: 0.0 33 | knowledge_fuser_type: "side_tuning" 34 | if_as_knowledge_fuser: True 35 | train_knowledge_fuser_jointly: True 36 | 37 | datasets: 38 | downstream_tasks_qa_nextqa_224x224_5fps: 39 | type: "default" 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "minimum_text" 50 | eval: 51 | name: "minimum_text" 52 | 53 | # IMPORTANT configs: 54 | task: 5way-multiple-choice-qa 55 | neg_sampling_same_clip: 0 56 | eval_only: False 57 | 58 | # other arguements 59 | train_k: null # sample a subset of k instances 60 | eval_k: null # sample a subset of 3000 instances, reduce evaluation time 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | 69 | run: 70 | # use custom runner 71 | runner: runner_base_patch_and_fuse 72 | 73 | # task object name 74 | task: actionbench 75 | 76 | # which module is used for inference ["backbone", "knowledge_patcher"] 77 | eval_module: knowledge_patcher 78 | eval_task: video_text_matching 79 | 80 | # optimizer 81 | lr_sched: 
"linear_warmup_cosine_lr" 82 | init_lr: 1e-5 83 | min_lr: 0 84 | weight_decay: 0.05 85 | max_epoch: 4 86 | 87 | batch_size_train: 16 88 | batch_size_eval: 4 89 | 90 | num_workers: 4 91 | 92 | seed: 42 93 | output_dir: "output/downstream_tasks/NextQA/train/side_tuning_internvideo" 94 | 95 | amp: False 96 | resume_ckpt_path: null 97 | 98 | evaluate: False 99 | 100 | train_splits: ["train"] 101 | valid_splits: ["val"] 102 | # test_splits: ["test"] 103 | 104 | device: "cuda" 105 | world_size: 1 106 | dist_url: "env://" 107 | distributed: True -------------------------------------------------------------------------------- /src/Singularity/configs/qa_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | dataset_name: msrvtt 2 | data_root: ${oc.env:SL_DATA_DIR}/videos_images 3 | anno_root_downstream: ${oc.env:SL_DATA_DIR}/anno_downstream 4 | train_file: # each file should be formatted similar to data/downstream/vqa_train_sample.json 5 | - ['${anno_root_downstream}/msrvtt_qa_train.json', '${data_root}/msrvtt_2fps_224', video] 6 | test_types: [val, ] # one of [minival, test] 7 | test_file: 8 | val: ['${anno_root_downstream}/msrvtt_qa_val.json', '${data_root}/msrvtt_2fps_224', video] 9 | test: ['${anno_root_downstream}/msrvtt_qa_test.json', '${data_root}/msrvtt_2fps_224', video] 10 | stop_key: val # on of the key in `test_file` 11 | answer_list: ${anno_root_downstream}/msrvtt_qa_answer_list.json # list of answer words 12 | 13 | text_encoder: bert-base-uncased 14 | text_decoder: bert-base-uncased 15 | bert_config: configs/config_bert.json 16 | vit_type: beit # items in ${vit_zoo} 17 | vit_zoo: # from huggingface 18 | beit: microsoft/beit-base-patch16-224-pt22k-ft22k 19 | vit_name_or_pretrained_path: ${vit_zoo[${vit_type}]} 20 | temporal_vision_encoder: 21 | enable: False 22 | num_layers: 2 23 | update_pooler_embed: False 24 | add_temporal_embed: False # whether to add temporal embed to encoded frames 25 | 26 | image_res: 224 27 | embed_dim: 256 # -- not used 28 | video_input: # input -- not used 29 | num_frames: 1 30 | reader: decord # one of [decord, av] 31 | sample_type: rand 32 | num_frames_test: 4 # num_frames during inference/test 33 | sample_type_test: middle 34 | max_q_len: 25 35 | max_a_len: 5 36 | 37 | batch_size: 38 | image: 128 39 | video: 32 40 | batch_size_test: 41 | image: 64 42 | video: 64 43 | k_test: 128 44 | temp: 0.07 # -- not used 45 | eos: '[SEP]' 46 | 47 | optimizer: 48 | opt: adamW 49 | lr: 1e-5 50 | opt_betas: [0.9, 0.999] # default 51 | weight_decay: 0.02 52 | max_grad_norm: -1 # requires a positive float, use -1 to disable 53 | different_lr: # use a different lr for some modules, e.g., larger lr for new modules 54 | enable: False 55 | module_names: [] 56 | lr: 1e-3 57 | 58 | scheduler: 59 | sched: cosine 60 | epochs: 10 61 | min_lr_multi: 0.1 # min_lr will be `optimizer.lr * min_lr_multi` 62 | warmup_epochs: 0.5 # float 63 | 64 | output_dir: None # output dir 65 | pretrained_path: None # path to pretrained model weights 66 | resume: False # if True, load optimizer and scheduler states as well 67 | evaluate: False 68 | # `eval_frame_ensemble': how do we aggregate scores if `video_input.num_frames_test' > `video_input.num_frames' 69 | # `concat': concat frames before input to multi-modal encoder, i.e., early fusion 70 | # `mean', `max', `lse': mean/max/lse-pool scores after multi-modal encoder, i.e., late fusion, as in ClipBERT 71 | eval_frame_ensemble: concat # [concat, max, mean, lse] 72 | device: cuda 73 | seed: 42 74 | log_freq: 
100 75 | dist_url: env:// 76 | distributed: True 77 | fp16: True 78 | debug: False 79 | num_workers: 16 80 | 81 | wandb: 82 | enable: False 83 | entity: None # username or teamname to store the runs, see https://docs.wandb.ai/ref/python/init 84 | project: vqa # setup in your command line 85 | 86 | -------------------------------------------------------------------------------- /ActionBench/src/split_train_val_test_ego4d.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | from tqdm import tqdm 5 | random.seed(42) # Set the random seed to ensure reproducibility 6 | 7 | ### == some helper functions == ### 8 | class PostProcess(): 9 | # handle edge cases and artifacts from the antonym mining 10 | def __init__(self) -> None: 11 | additional_antonyms_mapping = json.load(open("additional_antonyms_mapping_ego4d.json")) 12 | self.post_process_targets = {} 13 | for value in additional_antonyms_mapping.values(): 14 | if value.endswith("s") or value.endswith("es"): 15 | self.post_process_targets[value+"es"] = value 16 | print("post processing target:value:", self.post_process_targets) 17 | 18 | def run(self, ann): 19 | action_antonym_clip_text = ann['action_antonym_clip_text'] 20 | for key,value in self.post_process_targets.items(): 21 | action_antonym_clip_text = action_antonym_clip_text.replace(key,value) 22 | ann['action_antonym_clip_text'] = action_antonym_clip_text 23 | 24 | def filterer(ann): 25 | if_filter = False 26 | filtering_verbs = ["keeps","keep","kept"] 27 | for v in filtering_verbs: 28 | if v in ann["clip_text"]: 29 | if_filter = True 30 | break 31 | return if_filter 32 | 33 | 34 | ### == set the input and output paths == ### 35 | processed_annotation_jsonl_name = "" # e.g. 
"egoclip_subset_action_antonyms" 36 | input_jsonl = f"../ego4d/{processed_annotation_jsonl_name}.jsonl" 37 | 38 | train_output = f"/{processed_annotation_jsonl_name}/train.jsonl" 39 | val_output = f"/{processed_annotation_jsonl_name}/val.jsonl" 40 | test_output = f"/{processed_annotation_jsonl_name}/test.jsonl" 41 | 42 | ### == set split ratio == ### 43 | ratios = [0.8,0.1,0.1] 44 | 45 | 46 | ### == run the script == ### 47 | post_processor = PostProcess() 48 | 49 | annotations = [] 50 | with open(input_jsonl, 'r') as f: 51 | for line in tqdm(f): 52 | loaded_ann = json.loads(line) 53 | post_processor.run(loaded_ann) 54 | if not filterer(loaded_ann): 55 | annotations.append(loaded_ann) 56 | else: 57 | print("filtered:", loaded_ann['clip_text']) 58 | print(len(annotations)) 59 | 60 | random.shuffle(annotations) 61 | 62 | sizes = [int(len(annotations)*r) for r in ratios] 63 | 64 | print(sizes) 65 | 66 | train_anns = annotations[:sizes[0]] 67 | val_anns = annotations[sizes[0]:sizes[0]+sizes[1]] 68 | test_anns = annotations[sizes[0]+sizes[1]:] 69 | 70 | print("train size:", len(train_anns)) 71 | print("val size:", len(val_anns)) 72 | print("test size:", len(test_anns)) 73 | 74 | with open(train_output, 'w') as out: 75 | for line in train_anns: 76 | out.write(json.dumps(line)) 77 | out.write("\n") 78 | 79 | with open(val_output, 'w') as out: 80 | for line in val_anns: 81 | out.write(json.dumps(line)) 82 | out.write("\n") 83 | 84 | with open(test_output, 'w') as out: 85 | for line in test_anns: 86 | out.write(json.dumps(line)) 87 | out.write("\n") -------------------------------------------------------------------------------- /src/configs/projects/train/actionbench/ssv2/KP-Transformer-VTC.yaml: -------------------------------------------------------------------------------- 1 | 2 | ## == internvideo backbone == ## 3 | model: 4 | arch: patch_and_fuse_internvideo_baseline_simple 5 | model_type: InternVideo-MM-L-14 6 | load_pretrained: True 7 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 8 | 9 | ## == ClipVip backbone == ## 10 | ## NOTE: uncomment following "model" section to use the clipvip backbone; modify the output_dir accordingly under "run:" section 11 | # model: 12 | # arch: patch_and_fuse_clipvip_baseline_simple # w/o jointly train vis encoder using action prediction 13 | # model_type: pretrain_vip_base_32 14 | # load_pretrained: True 15 | # backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 16 | 17 | 18 | ## == Singularity backbone == ## 19 | ## NOTE: uncomment following "model" section to use the singularity backbone; modify the output_dir accordingly under "run:" section 20 | # model: 21 | # arch: patch_and_fuse_singularity_baseline_simple # w/o jointly train vis encoder using action prediction 22 | # model_type: singularity_temporal_17m 23 | # load_pretrained: True 24 | # backbone_config_yaml: "Singularity/configs/pretrained_singularity_temporal_17m.yaml" 25 | 26 | 27 | datasets: 28 | actionbench_ssv2_224x224_5fps: 29 | type: "action_antonyms_and_object_shuffled" 30 | vis_processor: 31 | train: 32 | name: "video_train" 33 | image_size: 224 34 | eval: 35 | name: "internvideo_eval" 36 | image_size: 224 37 | text_processor: 38 | train: 39 | name: "minimum_text" 40 | eval: 41 | name: "minimum_text" 42 | 43 | # IMPORTANT configs: 44 | fps: 5 # if downsampled, use 5 fps 45 | task: "action_antonym" 46 | neg_sampling_same_clip: 0 47 | # eval_only: True 48 | 49 | # other arguements 50 | train_k: null # sample a subset of k instances 
51 | eval_k: null # sample a subset of k instances, reduce evaluation time 52 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 53 | num_frm: 8 54 | train_frame_height: 224 55 | train_frame_width: 224 56 | eval_frame_height: 224 57 | eval_frame_width: 224 58 | 59 | run: 60 | # use custom runner 61 | runner: runner_base_patch_and_fuse 62 | 63 | # task object name 64 | task: actionbench 65 | 66 | # which module is used for inference ["backbone", "knowledge_patcher", "knowledge_patcher_baseline"] 67 | eval_module: knowledge_patcher_baseline 68 | eval_task: action_antonym 69 | 70 | # optimizer 71 | lr_sched: "linear_warmup_cosine_lr" 72 | init_lr: 1e-5 73 | min_lr: 0 74 | weight_decay: 0.05 75 | max_epoch: 1 76 | 77 | batch_size_train: 8 78 | batch_size_eval: 4 79 | 80 | num_workers: 4 81 | 82 | seed: 42 83 | output_dir: "output/actionbench/train/SSv2__KP-Transformer-VTC__internvideo" 84 | 85 | amp: False 86 | resume_ckpt_path: null 87 | 88 | evaluate: False 89 | 90 | train_splits: ["train"] 91 | valid_splits: ["val"] 92 | # test_splits: ["test"] 93 | 94 | device: "cuda" 95 | world_size: 1 96 | dist_url: "env://" 97 | distributed: True 98 | -------------------------------------------------------------------------------- /src/configs/projects/train/actionbench/ego4d/KP-Transformer-VTC.yaml: -------------------------------------------------------------------------------- 1 | 2 | ## == internvideo backbone == ## 3 | model: 4 | arch: patch_and_fuse_internvideo_baseline_simple 5 | model_type: InternVideo-MM-L-14 6 | load_pretrained: True 7 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 8 | 9 | ## == ClipVip backbone == ## 10 | ## NOTE: uncomment following "model" section to use the clipvip backbone; modify the output_dir accordingly under "run:" section 11 | # model: 12 | # arch: patch_and_fuse_clipvip_baseline_simple # w/o jointly train vis encoder using action prediction 13 | # model_type: pretrain_vip_base_32 14 | # load_pretrained: True 15 | # backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 16 | 17 | 18 | ## == Singularity backbone == ## 19 | ## NOTE: uncomment following "model" section to use the singularity backbone; modify the output_dir accordingly under "run:" section 20 | # model: 21 | # arch: patch_and_fuse_singularity_baseline_simple # w/o jointly train vis encoder using action prediction 22 | # model_type: singularity_temporal_17m 23 | # load_pretrained: True 24 | # backbone_config_yaml: "Singularity/configs/pretrained_singularity_temporal_17m.yaml" 25 | 26 | 27 | datasets: 28 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 29 | vis_processor: 30 | train: 31 | name: "video_train" 32 | image_size: 224 33 | eval: 34 | name: "internvideo_eval" 35 | image_size: 224 36 | text_processor: 37 | train: 38 | name: "vl_dynamic_ego4d_text" 39 | eval: 40 | name: "vl_dynamic_ego4d_text" 41 | 42 | # IMPORTANT configs: 43 | fps: 5 # if downsampled, use 5 fps 44 | task: "action_antonym" 45 | neg_sampling_same_clip: 0 46 | # eval_only: True 47 | 48 | # other arguements 49 | train_k: null # sample a subset of k instances 50 | eval_k: null # sample a subset of k instances 51 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 52 | num_frm: 8 53 | train_frame_height: 224 54 | train_frame_width: 224 55 | eval_frame_height: 224 56 | eval_frame_width: 224 57 | 58 | run: 59 | # use custom runner 60 | runner: 
runner_base_patch_and_fuse 61 | 62 | # task object name 63 | task: actionbench 64 | 65 | # which module is used for inference ["backbone", "knowledge_patcher", "knowledge_patcher_baseline"] 66 | eval_module: knowledge_patcher_baseline 67 | eval_task: action_antonym 68 | 69 | # optimizer 70 | lr_sched: "linear_warmup_cosine_lr" 71 | init_lr: 1e-5 72 | min_lr: 0 73 | weight_decay: 0.05 74 | max_epoch: 1 75 | 76 | batch_size_train: 8 77 | batch_size_eval: 4 78 | 79 | num_workers: 4 80 | 81 | seed: 42 82 | output_dir: "output/actionbench/train/Ego4d__KP-Transformer-VTC__internvideo" 83 | 84 | amp: False 85 | resume_ckpt_path: null 86 | 87 | evaluate: False 88 | 89 | train_splits: ["train"] 90 | valid_splits: ["val"] 91 | # test_splits: ["test"] 92 | 93 | device: "cuda" 94 | world_size: 1 95 | dist_url: "env://" 96 | distributed: True 97 | -------------------------------------------------------------------------------- /src/Singularity/utils/config_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | from os.path import join, dirname 5 | from omegaconf import OmegaConf, ListConfig, DictConfig 6 | from .distributed import init_distributed_mode, is_main_process 7 | from .logger import setup_logger 8 | # from utils.distributed import init_distributed_mode, is_main_process 9 | # from utils.logger import setup_logger 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def convert_types(config): 16 | """Convert `'None'` (str) --> `None` (None). Only supports top-level""" 17 | for k, v in config.items(): 18 | if isinstance(v, DictConfig): 19 | setattr(config, k, convert_types(v)) 20 | 21 | # TODO convert types in ListConfig, right now they are ignored 22 | # if isinstance(v, ListConfig): 23 | # new_v = ListConfig() 24 | 25 | if v in ["None", "none"]: 26 | setattr(config, k, None) 27 | return config 28 | 29 | 30 | def setup_config(): 31 | """Combine yaml config and command line config with OmegaConf. 32 | Also converts types, e.g., `'None'` (str) --> `None` (None) 33 | """ 34 | config_path = sys.argv[1] 35 | del sys.argv[1] # not needed 36 | cli_args = sys.argv[1:] 37 | yaml_config = OmegaConf.load(config_path) 38 | cli_config = OmegaConf.from_cli() if len(cli_args) else OmegaConf.create() 39 | # the latter overwrites the former, i.e., cli_config has higher priority. 40 | logger.info(f"Command line configs: {cli_config}") 41 | config = OmegaConf.merge(yaml_config, cli_config) 42 | config = convert_types(config) 43 | if config.debug: 44 | config.wandb.enable = False 45 | return config 46 | 47 | 48 | def setup_evaluate_config(config): 49 | """setup evaluation default settings, e.g., disable wandb""" 50 | assert config.evaluate 51 | config.wandb.enable = False 52 | if config.output_dir is None: 53 | config.output_dir = join(dirname(config.pretrained_path), "eval") 54 | return config 55 | 56 | 57 | def setup_output_dir(output_dir, excludes=["code"]): 58 | """ensure we are not overwriting an existing/non-empty output dir""" 59 | if not os.path.exists(output_dir): 60 | os.makedirs(output_dir, exist_ok=False) 61 | else: 62 | existing_dirs_files = os.listdir(output_dir) # list 63 | remaining = set(existing_dirs_files) - set(excludes) 64 | remaining = [e for e in remaining if "slurm" not in e] 65 | assert len(remaining) == 0, f"remaining dirs or files: {remaining}" 66 | 67 | 68 | def setup_main(): 69 | """ 70 | Setup config, logger, output_dir, etc. 71 | Shared for pretrain and all downstream tasks. 
72 | """ 73 | config = setup_config() 74 | if hasattr(config, "evaluate") and config.evaluate: 75 | config = setup_evaluate_config(config) 76 | init_distributed_mode(config) 77 | 78 | if is_main_process(): 79 | setup_output_dir(config.output_dir, excludes=["code"]) 80 | setup_logger(output=config.output_dir, color=True, name="loopitr") 81 | OmegaConf.save( 82 | config, open(os.path.join(config.output_dir, 'config.yaml'), 'w')) 83 | return config 84 | -------------------------------------------------------------------------------- /src/Singularity/configs/qa_vqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_name: vqa 2 | data_root: ${oc.env:SL_DATA_DIR}/videos_images 3 | anno_root_downstream: ${oc.env:SL_DATA_DIR}/anno_downstream 4 | train_file: # each file should be formatted similar to data/downstream/vqa_train_sample.json 5 | - ['${anno_root_downstream}/vqa_train.json', '${data_root}/coco'] 6 | - ['${anno_root_downstream}/vqa_valminusminival.json', '${data_root}/coco'] 7 | - ['${anno_root_downstream}/vg_qa.json', '${data_root}/vg'] 8 | test_types: [minival, ] # one of [minival, test] 9 | test_file: 10 | minival: ['${anno_root_downstream}/vqa_minival.json', '${data_root}/coco'] 11 | test: ['${anno_root_downstream}/vqa_test.json', '${data_root}/coco'] 12 | stop_key: minival # on of the key in `test_file` 13 | answer_list: ${anno_root_downstream}/vqa_answer_list.json # list of answer words 14 | 15 | text_encoder: bert-base-uncased 16 | text_decoder: bert-base-uncased 17 | bert_config: configs/config_bert.json 18 | vit_type: beit # items in ${vit_zoo} 19 | vit_zoo: # from huggingface 20 | beit: microsoft/beit-base-patch16-224-pt22k-ft22k 21 | vit_name_or_pretrained_path: ${vit_zoo[${vit_type}]} 22 | temporal_vision_encoder: 23 | enable: False 24 | num_layers: 2 25 | update_pooler_embed: False 26 | add_temporal_embed: False # whether to add temporal embed to encoded frames 27 | 28 | image_res: 224 29 | embed_dim: 256 # -- not used 30 | video_input: # input -- not used 31 | num_frames: 1 32 | reader: decord # one of [decord, av] 33 | sample_type: rand 34 | num_frames_test: 1 # num_frames during inference/test 35 | sample_type_test: middle 36 | max_q_len: 25 37 | max_a_len: 6 38 | 39 | batch_size: 40 | image: 128 41 | video: 128 42 | batch_size_test: 43 | image: 64 44 | video: 64 45 | k_test: 128 46 | temp: 0.07 # -- not used 47 | eos: '[SEP]' 48 | 49 | optimizer: 50 | opt: adamW 51 | lr: 1e-5 52 | opt_betas: [0.9, 0.999] # default 53 | weight_decay: 0.02 54 | max_grad_norm: -1 # requires a positive float, use -1 to disable 55 | different_lr: # use a different lr for some modules, e.g., larger lr for new modules 56 | enable: False 57 | module_names: [] 58 | lr: 1e-3 59 | 60 | scheduler: 61 | sched: cosine 62 | epochs: 5 63 | min_lr_multi: 0.1 # min_lr will be `optimizer.lr * min_lr_multi` 64 | warmup_epochs: 0.5 # float 65 | 66 | output_dir: None # output dir 67 | pretrained_path: None # path to pretrained model weights 68 | resume: False # if True, load optimizer and scheduler states as well 69 | evaluate: False 70 | # `eval_frame_ensemble': how do we aggregate scores if `video_input.num_frames_test' > `video_input.num_frames' 71 | # `concat': concat frames before input to multi-modal encoder, i.e., early fusion 72 | # `mean', `max', `lse': mean/max/lse-pool scores after multi-modal encoder, i.e., late fusion, as in ClipBERT 73 | eval_frame_ensemble: concat # [concat, max, mean, lse] 74 | device: cuda 75 | seed: 42 76 | log_freq: 100 77 | 
dist_url: env:// 78 | distributed: True 79 | fp16: True 80 | debug: False 81 | num_workers: 16 82 | 83 | wandb: 84 | enable: False 85 | entity: None # username or teamname to store the runs, see https://docs.wandb.ai/ref/python/init 86 | project: vqa # setup in your command line 87 | 88 | -------------------------------------------------------------------------------- /src/ClipViP/src/optimization/sched.py: -------------------------------------------------------------------------------- 1 | """ 2 | optimizer learning rate scheduling helpers 3 | """ 4 | import math 5 | from math import ceil 6 | from collections import Counter 7 | 8 | 9 | def noam_schedule(step, warmup_step=4000): 10 | if step <= warmup_step: 11 | return step / warmup_step 12 | return (warmup_step ** 0.5) * (step ** -0.5) 13 | 14 | 15 | def warmup_linear(step, warmup_step, tot_step): 16 | if step < warmup_step: 17 | return step / warmup_step 18 | return max(0, (tot_step-step)/(tot_step-warmup_step)) 19 | 20 | def warmup_cosine(step, warmup_step, tot_step): 21 | if step < warmup_step: 22 | return step / warmup_step 23 | progress = (step - warmup_step) / (tot_step - warmup_step) 24 | return 0.5 * (1.0 + math.cos(math.pi * progress)) 25 | 26 | def multi_step_schedule(n_epoch, milestones, step, warmup_step,gamma=0.5): 27 | if step <= warmup_step: 28 | return step / warmup_step 29 | 30 | milestones = list(sorted(milestones)) 31 | for i, m in enumerate(milestones): 32 | if n_epoch < m: 33 | return gamma**i 34 | return gamma**(len(milestones)+1) 35 | 36 | class AutoStep(): 37 | def __init__(self, tolerance, gamma): 38 | self.tolerance = tolerance 39 | self.coeff_mem = 1 40 | self.gamma = gamma 41 | self.best_score = 0. 42 | self.count = 0 43 | 44 | def step(self, score): 45 | if score <= self.best_score: 46 | self.count += 1 47 | else: 48 | self.count = 0 49 | self.best_score = score 50 | if self.count > self.tolerance: 51 | self.count = 0 52 | self.coeff_mem = self.coeff_mem * self.gamma 53 | 54 | def get_lr(self, global_step, learning_rate, num_train_steps, warmup_ratio=0.1): 55 | warmup_steps = int(warmup_ratio * num_train_steps) 56 | if global_step <= warmup_steps: 57 | return learning_rate * global_step / warmup_steps 58 | 59 | return max(self.coeff_mem * learning_rate, 1e-8) 60 | 61 | 62 | def get_lr_sched(global_step, decay, learning_rate, 63 | num_train_steps, warmup_ratio=0.1, 64 | decay_epochs=[], multi_step_epoch=-1): 65 | warmup_steps = int(warmup_ratio*num_train_steps) 66 | if decay == 'linear': 67 | lr_this_step = learning_rate * warmup_linear( 68 | global_step, warmup_steps, num_train_steps) 69 | elif decay == 'cosine': 70 | lr_this_step = learning_rate * warmup_cosine( 71 | global_step, warmup_steps, num_train_steps) 72 | elif decay == 'invsqrt': 73 | lr_this_step = learning_rate * noam_schedule( 74 | global_step, warmup_steps) 75 | elif decay == 'constant': 76 | lr_this_step = learning_rate 77 | elif decay == "multi_step": 78 | assert multi_step_epoch >= 0 79 | lr_this_step = learning_rate * multi_step_schedule( 80 | multi_step_epoch, decay_epochs, global_step, warmup_steps) 81 | if lr_this_step <= 0: 82 | # save guard for possible miscalculation of train steps 83 | lr_this_step = 1e-8 84 | return lr_this_step 85 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/knowledge_patcher/acdybench_ego4d_internvideo_KP-Perceiver-VTC-DVDM__action_antonym.yaml: -------------------------------------------------------------------------------- 1 | # modified 
from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | ## == internvideo backbone == ## 4 | model: 5 | arch: patch_and_fuse_internvideo 6 | model_type: InternVideo-MM-L-14 7 | load_pretrained: True 8 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 9 | pretrained: "pretrained_ckpt/PatchAndFuse/ActionBench/actionbench_ego4d_patch_and_fuse.pth" 10 | 11 | text_perceiver_config: 12 | dim: 768 # latent query dim 13 | k_v_dim: 768 # text_width 14 | depth: 1 15 | dim_head: 64 16 | heads: 8 17 | num_latents: 16 18 | ff_mult: 2 19 | 20 | vision_perceiver_config: 21 | dim: 768 # latent query dim 22 | k_v_dim: 1024 # vision_width 23 | depth: 1 24 | dim_head: 64 25 | heads: 8 26 | num_latents: 16 27 | ff_mult: 2 28 | 29 | objectives: ["video_text_contrastive","video_action_contrastive","action_temporal_matching"] 30 | loss_weighting: [1.0,1.0,0.4] 31 | if_use_attn_guidance: False 32 | if_use_dual_perceiver: False 33 | if_add_temporal_emebdding: True 34 | num_frms: 8 35 | temp_emb_drop_out: 0.1 36 | 37 | # do ATM only on state change salient videos 38 | state_change_filtering_for_FDM: True 39 | 40 | datasets: 41 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 42 | vis_processor: 43 | train: 44 | name: "video_train" 45 | image_size: 224 46 | eval: 47 | name: "internvideo_eval" 48 | image_size: 224 49 | text_processor: 50 | train: 51 | name: "vl_dynamic_ego4d_text" 52 | eval: 53 | name: "vl_dynamic_ego4d_text" 54 | 55 | # IMPORTANT configs: 56 | fps: 5 # if downsampled, use 5 fps 57 | task: "action_antonym" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 58 | neg_sampling_same_clip: 0 # evaluation set to 0 59 | eval_only: True 60 | 61 | # other arguements 62 | k: null # sample a subset of k instances 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: actionbench 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: backbone 79 | eval_task: action_antonym 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/actionbench/eval/knowledge_patcher/ego4d__internvideo_KP-Perceiver-VTC-DVDM__action_antonym" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | # train_splits: ["train"] 93 | # valid_splits: ["val"] 94 | test_splits: ["test"] 95 | 96 | device: "cuda" 97 | world_size: 1 98 | dist_url: "env://" 99 | distributed: True 100 | --------------------------------------------------------------------------------
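The text_perceiver_config / vision_perceiver_config blocks that recur throughout the configs above (dim, k_v_dim, depth, dim_head, heads, num_latents, ff_mult) describe a Perceiver-style resampler: a small set of learned latent queries cross-attends to the frozen backbone's text or vision token features and pools them into num_latents vectors of size dim. The sketch below is only a minimal illustration of how those fields could map onto such a module in PyTorch; it is not the repository's actual Knowledge Patcher implementation, and every class and variable name in it is hypothetical.

# Minimal, illustrative Perceiver-style resampler (NOT the repository's implementation).
# Shows how the recurring *_perceiver_config fields could map onto a latent cross-attention module.
import torch
import torch.nn as nn


class LatentCrossAttention(nn.Module):
    """Latent queries (width `dim`) attend to backbone features (width `k_v_dim`)."""

    def __init__(self, dim, k_v_dim, dim_head=64, heads=8):
        super().__init__()
        inner = dim_head * heads
        self.heads = heads
        self.scale = dim_head ** -0.5
        self.to_q = nn.Linear(dim, inner, bias=False)           # queries from latents
        self.to_kv = nn.Linear(k_v_dim, inner * 2, bias=False)  # keys/values from backbone tokens
        self.to_out = nn.Linear(inner, dim, bias=False)

    def forward(self, latents, context):
        b, n, _ = latents.shape
        q = self.to_q(latents)
        k, v = self.to_kv(context).chunk(2, dim=-1)

        def split_heads(t):  # (b, seq, heads * dim_head) -> (b, heads, seq, dim_head)
            return t.view(b, -1, self.heads, t.shape[-1] // self.heads).transpose(1, 2)

        q, k, v = map(split_heads, (q, k, v))
        attn = (q @ k.transpose(-2, -1) * self.scale).softmax(dim=-1)
        out = (attn @ v).transpose(1, 2).reshape(b, n, -1)
        return self.to_out(out)


class PerceiverResampler(nn.Module):
    """`num_latents` learned queries of size `dim`; `depth` cross-attention + feed-forward blocks."""

    def __init__(self, dim, k_v_dim, depth=1, dim_head=64, heads=8,
                 num_latents=16, ff_mult=2):
        super().__init__()
        self.latents = nn.Parameter(torch.randn(num_latents, dim))
        self.layers = nn.ModuleList([
            nn.ModuleList([
                LatentCrossAttention(dim, k_v_dim, dim_head, heads),
                nn.Sequential(  # feed-forward with hidden width dim * ff_mult
                    nn.LayerNorm(dim),
                    nn.Linear(dim, dim * ff_mult),
                    nn.GELU(),
                    nn.Linear(dim * ff_mult, dim),
                ),
            ])
            for _ in range(depth)
        ])

    def forward(self, context):
        # context: (batch, num_tokens, k_v_dim) features from the frozen backbone
        x = self.latents.unsqueeze(0).expand(context.shape[0], -1, -1)
        for attn, ff in self.layers:
            x = x + attn(x, context)
            x = x + ff(x)
        return x  # (batch, num_latents, dim)


if __name__ == "__main__":
    # Hypothetical usage with the vision_perceiver_config values from the ClipViP configs above:
    # dim=512, k_v_dim=768, depth=1, dim_head=64, heads=8, num_latents=16, ff_mult=2
    resampler = PerceiverResampler(dim=512, k_v_dim=768)
    frame_tokens = torch.randn(2, 197, 768)  # dummy per-frame patch features
    print(resampler(frame_tokens).shape)     # -> torch.Size([2, 16, 512])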