├── src ├── ClipViP │ └── src │ │ ├── __init__.py │ │ ├── utils │ │ ├── __init__.py │ │ ├── misc.py │ │ ├── metrics.py │ │ └── logger.py │ │ ├── datasets │ │ └── __init__.py │ │ ├── modeling │ │ └── __init__.py │ │ ├── optimization │ │ ├── __init__.py │ │ └── sched.py │ │ └── configs │ │ ├── lsmdc_retrieval │ │ ├── lsmdc_retrieval_vip_base_16.json │ │ └── lsmdc_retrieval_vip_base_32.json │ │ ├── didemo_retrieval │ │ ├── didemo_retrieval_vip_base_32.json │ │ └── didemo_retrieval_vip_base_16.json │ │ ├── msrvtt_retrieval │ │ ├── msrvtt_retrieval_vip_base_16.json │ │ └── msrvtt_retrieval_vip_base_32.json │ │ ├── actnet_retrieval │ │ ├── actnet_retrieval_vip_base_16.json │ │ └── actnet_retrieval_vip_base_32.json │ │ ├── pretrain │ │ ├── pretrain_vip_base_16.json │ │ └── pretrain_vip_base_32.json │ │ └── pretrained │ │ └── pretrain_vip_base_32.json ├── Singularity │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ └── model_retrieval.py │ ├── configs │ │ ├── config_bert.json │ │ ├── beit-base-patch16-224-pt22k-ft22k.json │ │ ├── qa_anet.yaml │ │ ├── qa_msrvtt.yaml │ │ └── qa_vqa.yaml │ ├── dataset │ │ ├── dataloader.py │ │ ├── base_dataset.py │ │ └── qa_dataset.py │ └── utils │ │ ├── scheduler.py │ │ └── config_utils.py ├── InternVideo │ ├── __init__.py │ ├── clip_utils │ │ ├── __init__.py │ │ └── utils │ │ │ └── __init__.py │ └── bpe_simple_vocab_16e6.txt.gz ├── demo_video │ └── ssv2_194058__book_falling_like_a_rock.mp4 ├── run_scripts │ ├── train.sh │ ├── eval_downstream_task.sh │ ├── eval_actionbench.sh │ └── inference.sh ├── configs │ ├── datasets │ │ ├── downstream_tasks │ │ │ ├── downstream_tasks_moments_in_time.yaml │ │ │ ├── downstream_tasks_temporal_224x224_5fps.yaml │ │ │ ├── downstream_tasks_retrieval_ssv2_224x224_5fps.yaml │ │ │ └── downstream_tasks_qa_nextqa_224x224_5fps.yaml │ │ └── actionbench │ │ │ ├── actionbench_ssv2_224x224_5fps.yaml │ │ │ ├── actionbench_ssv2_antonyms_224x224_5fps.yaml │ │ │ ├── actionbench_ego4d_224x224_5fps.yaml │ │ │ └── actionbench_ego4d_object_shuffled_224x224_5fps.yaml │ ├── projects │ │ ├── eval │ │ │ ├── actionbench │ │ │ │ ├── knowledge_patcher │ │ │ │ │ ├── README.md │ │ │ │ │ └── acdybench_ego4d_internvideo_KP-Perceiver-VTC-DVDM__action_antonym.yaml │ │ │ │ └── backbone │ │ │ │ │ ├── internvideo │ │ │ │ │ ├── ego4d │ │ │ │ │ │ ├── acdybench_ego4d_internvideo_backbone__action_antonym.yaml │ │ │ │ │ │ ├── acdybench_ego4d_internvideo_backbone__reversed_video.yaml │ │ │ │ │ │ └── acdybench_ego4d_internvideo_backbone__object_shuffle.yaml │ │ │ │ │ └── ssv2 │ │ │ │ │ │ ├── acdybench_ssv2_internvideo_backbone__action_antonym.yaml │ │ │ │ │ │ ├── acdybench_ssv2_internvideo_backbone__reversed_video.yaml │ │ │ │ │ │ └── acdybench_ssv2_internvideo_backbone__object_shuffle.yaml │ │ │ │ │ ├── clipvip │ │ │ │ │ ├── ego4d │ │ │ │ │ │ ├── acdybench_ego4d_clipvip_backbone__action_antonym.yaml │ │ │ │ │ │ ├── acdybench_ego4d_clipvip_backbone__reversed_video.yaml │ │ │ │ │ │ └── acdybench_ego4d_clipvip_backbone__object_shuffle.yaml │ │ │ │ │ └── ssv2 │ │ │ │ │ │ ├── acdybench_ssv2_clipvip_backbone__action_antonym.yaml │ │ │ │ │ │ ├── acdybench_ssv2_clipvip_backbone__object_shuffle.yaml │ │ │ │ │ │ └── acdybench_ssv2_clipvip_backbone__reversed_video.yaml │ │ │ │ │ └── singularity │ │ │ │ │ ├── ego4d │ │ │ │ │ ├── acdybench_ego4d_singularity_backbone__action_antonym.yaml │ │ │ │ │ ├── acdybench_ego4d_singularity_backbone__reversed_video.yaml │ │ │ │ │ └── acdybench_ego4d_singularity_backbone__object_shuffle.yaml │ │ │ │ │ └── ssv2 │ │ │ │ │ ├── 
acdybench_ssv2_singularity_backbone__action_antonym.yaml │ │ │ │ │ ├── acdybench_ssv2_singularity_backbone__object_shuffle.yaml │ │ │ │ │ └── acdybench_ssv2_singularity_backbone__reversed_video.yaml │ │ │ └── downstream_task │ │ │ │ ├── nextqa │ │ │ │ ├── backbone_zero-shot.yaml │ │ │ │ ├── side_tuning.yaml │ │ │ │ └── patch_and_fuse.yaml │ │ │ │ ├── temporal_ssv2 │ │ │ │ ├── backbone_zero-shot.yaml │ │ │ │ ├── side_tuning.yaml │ │ │ │ └── patch_and_fuse.yaml │ │ │ │ ├── ssv2_label │ │ │ │ ├── backbone_zero-shot.yaml │ │ │ │ ├── side_tuning.yaml │ │ │ │ └── patch_and_fuse.yaml │ │ │ │ ├── ssv2_template │ │ │ │ ├── backbone_zero-shot.yaml │ │ │ │ ├── side_tuning.yaml │ │ │ │ └── patch_and_fuse.yaml │ │ │ │ ├── moments_in_time │ │ │ │ ├── backbone_zero-shot.yaml │ │ │ │ ├── side_tuning_zero-shot.yaml │ │ │ │ └── patch_and_fuse_zero-shot.yaml │ │ │ │ └── temporal_kinetics │ │ │ │ ├── backbone_zero-shot.yaml │ │ │ │ ├── side_tuning_zero-shot.yaml │ │ │ │ └── patch_and_fuse_zero-shot.yaml │ │ └── train │ │ │ ├── downstream_tasks │ │ │ └── nextqa │ │ │ │ ├── KP-Perceiver-VTC-DVDM.yaml │ │ │ │ ├── Patch_and_Fuse.yaml │ │ │ │ └── Side_Tuning.yaml │ │ │ └── actionbench │ │ │ ├── ssv2 │ │ │ └── KP-Transformer-VTC.yaml │ │ │ └── ego4d │ │ │ └── KP-Transformer-VTC.yaml │ └── models │ │ ├── patch_and_fuse_intern_video.yaml │ │ ├── patch_and_fuse_clip_vip.yaml │ │ └── patch_and_fuse_singularity.yaml ├── preprocessing │ ├── ego4d │ │ └── downsample_downsize_video_clips.py │ ├── ssv2 │ │ └── downsample_downsize_video_clips.py │ ├── kinetics │ │ └── downsample_downsize_video_clips.py │ └── nextqa │ │ └── downsample_downsize_video_nextqa.py └── _get_model_computational_complexity.py ├── Paxion_overview.png ├── ActionBench_overview.png ├── .gitmodules ├── ActionBench └── src │ ├── ignored_verbs_ssv2.json │ ├── ignored_verbs_ego4d.json │ ├── README.md │ ├── additional_antonyms_mapping_ssv2.json │ ├── get_object_shuffling_ssv2.py │ └── split_train_val_test_ego4d.py ├── .gitignore └── dataset_cards ├── moments_in_time.md ├── downstream_tasks_ssv2.md ├── actionbench_ssv2.md ├── nextqa.md ├── downstream_tasks_temporal.md └── actionbench_ego4d.md /src/ClipViP/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/Singularity/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ClipViP/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ClipViP/src/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ClipViP/src/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/Singularity/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ClipViP/src/optimization/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /src/InternVideo/__init__.py: -------------------------------------------------------------------------------- 1 | from .internvideo import * -------------------------------------------------------------------------------- /src/InternVideo/clip_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | -------------------------------------------------------------------------------- /Paxion_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MikeWangWZHL/Paxion/HEAD/Paxion_overview.png -------------------------------------------------------------------------------- /ActionBench_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MikeWangWZHL/Paxion/HEAD/ActionBench_overview.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/LAVIS"] 2 | path = src/LAVIS 3 | url = https://github.com/MikeWangWZHL/LAVIS.git -------------------------------------------------------------------------------- /src/InternVideo/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MikeWangWZHL/Paxion/HEAD/src/InternVideo/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /src/demo_video/ssv2_194058__book_falling_like_a_rock.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MikeWangWZHL/Paxion/HEAD/src/demo_video/ssv2_194058__book_falling_like_a_rock.mp4 -------------------------------------------------------------------------------- /ActionBench/src/ignored_verbs_ssv2.json: -------------------------------------------------------------------------------- 1 | [ 2 | "hitting", 3 | "moving", 4 | "pass", 5 | "poking", 6 | "pick", 7 | "pretending", 8 | "poke", 9 | "putting" 10 | ] -------------------------------------------------------------------------------- /src/InternVideo/clip_utils/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # from .evl_module import TransformerDecoder 2 | from .clip_vit_only_global import vit_only_global_b32, vit_only_global_b16, vit_only_global_l14, vit_only_global_l14_336 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | datasets/* 2 | ActionBench/ego4d 3 | ActionBench/ssv2 4 | pretrained_ckpt/* 5 | *.pth 6 | *.pt 7 | *.ckpt 8 | checkpoint/ 9 | dummy_dataset/ 10 | src/visualization 11 | output/ 12 | src/testing_outputs 13 | _backup 14 | **/__pycache__/ 15 | **/*.code-workspace 16 | ckpt/ 17 | *.pt 18 | *tar.gz 19 | *.zip 20 | # *.png 21 | # *.jpg -------------------------------------------------------------------------------- /src/Singularity/models/model_retrieval.py: -------------------------------------------------------------------------------- 1 | from .model_retrieval_base import SingularityRetrievalBase 2 | # from models.model_retrieval_base import SingularityRetrievalBase 3 | 4 | 5 | class Singularity(SingularityRetrievalBase): 6 | def 
__init__(self, config=None, tokenizer=None): 7 | super(Singularity, self).__init__( 8 | config=config, tokenizer=tokenizer, pretrain=False 9 | ) 10 | -------------------------------------------------------------------------------- /src/run_scripts/train.sh: -------------------------------------------------------------------------------- 1 | DEVICES=0,1 # comma-separated list of GPU IDs 2 | N_GPU=2 # number of GPUs to use for training 3 | PORT=29501 4 | 5 | # takes in a .yaml config file from configs/projects/train, e.g., 6 | CONFIG="configs/projects/train/actionbench/ssv2/KP-Perceiver-VTC-DVDM.yaml" 7 | CUDA_VISIBLE_DEVICES=${DEVICES} python -m torch.distributed.run \ 8 | --nproc_per_node=${N_GPU} \ 9 | --master_port=${PORT} \ 10 | train.py --cfg-path ${CONFIG} -------------------------------------------------------------------------------- /src/run_scripts/eval_downstream_task.sh: -------------------------------------------------------------------------------- 1 | DEVICES=0 # support one GPU only for downstream evaluation (fast) 2 | N_GPU=1 3 | PORT=29501 4 | 5 | # takes in a .yaml config file from configs/projects/eval/downstream_task, e.g., 6 | CONFIG="configs/projects/eval/downstream_task/ssv2_template/backbone_zero-shot.yaml" 7 | CUDA_VISIBLE_DEVICES=${DEVICES} python -m torch.distributed.run \ 8 | --nproc_per_node=${N_GPU} \ 9 | --master_port=${PORT} \ 10 | evaluate.py --cfg-path ${CONFIG} -------------------------------------------------------------------------------- /dataset_cards/moments_in_time.md: -------------------------------------------------------------------------------- 1 | ## Instruction for Downloading Videos 2 | - Download the videos following the instructions [here](http://moments.csail.mit.edu/) 3 | - Put the downloaded videos to `datasets/Moments_In_Time/videos` 4 | 5 | ## Annotation Details 6 | We subsample ~2k instances from the original validation set for doing the zero-shot action classification. 
7 | - validation size: 1830 8 | - ann_path: `datasets/Moments_In_Time/ann/validationSet_2k.csv` 9 | - format: refer to `datasets/Moments_In_Time/ann/README.md` -------------------------------------------------------------------------------- /src/run_scripts/eval_actionbench.sh: -------------------------------------------------------------------------------- 1 | DEVICES=0,1 # comma-separated list of GPU IDs 2 | N_GPU=2 # number of GPUs to use for training 3 | PORT=29501 4 | 5 | # takes in a .yaml config file from configs/projects/eval/actionbench, e.g., 6 | CONFIG="configs/projects/eval/actionbench/backbone/internvideo/ssv2/actionbench_ssv2_internvideo_backbone__action_antonym.yaml" 7 | CUDA_VISIBLE_DEVICES=${DEVICES} python -m torch.distributed.run \ 8 | --nproc_per_node=${N_GPU} \ 9 | --master_port=${PORT} \ 10 | evaluate.py --cfg-path ${CONFIG} -------------------------------------------------------------------------------- /src/Singularity/configs/config_bert.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "type_vocab_size": 2, 18 | "vocab_size": 30522, 19 | "fusion_layer": 9, 20 | "encoder_width": 768 21 | } 22 | -------------------------------------------------------------------------------- /src/configs/datasets/downstream_tasks/downstream_tasks_moments_in_time.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | downstream_tasks_moment_in_time: # name of the dataset builder 3 | dataset_card: dataset_cards/moments_in_time.md 4 | data_type: videos #extracted features of videos (I3D, VGGish) # [images|videos|features] 5 | 6 | build_info: 7 | # Be careful not to append minus sign (-) before split to avoid itemizing 8 | annotations: 9 | train: 10 | path: datasets/Moments_In_Time/ann 11 | val: 12 | path: datasets/Moments_In_Time/ann 13 | videos: 14 | path: datasets/Moments_In_Time/videos -------------------------------------------------------------------------------- /src/configs/datasets/downstream_tasks/downstream_tasks_temporal_224x224_5fps.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | downstream_tasks_temporal: # name of the dataset builder 3 | dataset_card: dataset_cards/downstream_tasks_temporal.md 4 | data_type: videos #extracted features of videos (I3D, VGGish) # [images|videos|features] 5 | 6 | build_info: 7 | annotations: 8 | val: 9 | path: datasets/Temporal/ann 10 | videos: 11 | path: 12 | kinetics: datasets/Temporal/video_clips/kinetics400/clips_downsampled_5fps_downsized_224x224 13 | ssv2: datasets/SSv2/video_clips/clips_downsampled_5fps_downsized_224x224 -------------------------------------------------------------------------------- /src/configs/datasets/downstream_tasks/downstream_tasks_retrieval_ssv2_224x224_5fps.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | downstream_tasks_retrieval_ssv2_224x224_5fps: # name of the dataset builder 3 | dataset_card: dataset_cards/downstream_task_ssv2.md 4 | data_type: videos # [images|videos|features] 5 | 6 | 
build_info: 7 | # Be careful not to append minus sign (-) before split to avoid itemizing 8 | annotations: 9 | train: 10 | path: datasets/SSv2/ssv2_label_ssv2_template 11 | val: 12 | path: datasets/SSv2/ssv2_label_ssv2_template 13 | videos: 14 | path: datasets/SSv2/video_clips/clips_downsampled_5fps_downsized_224x224 -------------------------------------------------------------------------------- /src/ClipViP/src/utils/misc.py: -------------------------------------------------------------------------------- 1 | """ 2 | modified from UNITER 3 | """ 4 | import json 5 | import random 6 | import sys 7 | 8 | import torch 9 | import numpy as np 10 | 11 | 12 | class NoOp(object): 13 | """ useful for distributed training No-Ops """ 14 | def __getattr__(self, name): 15 | return self.noop 16 | 17 | def noop(self, *args, **kwargs): 18 | return 19 | 20 | 21 | def set_random_seed(seed): 22 | random.seed(seed) 23 | np.random.seed(seed) 24 | torch.manual_seed(seed) 25 | torch.cuda.manual_seed_all(seed) 26 | 27 | 28 | def zero_none_grad(model): 29 | for p in model.parameters(): 30 | if p.grad is None and p.requires_grad: 31 | p.grad = p.data.new(p.size()).zero_() 32 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/knowledge_patcher/README.md: -------------------------------------------------------------------------------- 1 | To construct eval configs for the knowledge patcher on AcdyBench, this folder shows two examples (SSv2 action_antonym and Ego4d action_antonym), both using KP-Perceiver based on InternVideo. 2 | 3 | - To evaluate other methods such as KP-Transformer: replace the "model" section with the corresponding "model" section in `configs/train/acdybench/<dataset>/<method>.yaml`, and set the "model.pretrained" field to the corresponding trained checkpoint path. 4 | - To evaluate other tasks such as reversed_video: replace the "dataset" section with the corresponding "dataset" section in `configs/eval/acdybench/backbone/*/*_<task>.yaml`. 5 | - Set "run.output_dir" to the output location for your own setting. 6 | -------------------------------------------------------------------------------- /src/configs/datasets/downstream_tasks/downstream_tasks_qa_nextqa_224x224_5fps.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | downstream_tasks_qa_nextqa_224x224_5fps: # name of the dataset builder 3 | dataset_card: dataset_cards/downstream_task_nextqa.md 4 | data_type: videos # [images|videos|features] 5 | 6 | build_info: 7 | # Be careful not to append minus sign (-) before split to avoid itemizing 8 | annotations: 9 | train: 10 | path: datasets/NextQA/ann/nextqa_action_antonym 11 | val: 12 | path: datasets/NextQA/ann/nextqa_action_antonym 13 | test: 14 | path: datasets/NextQA/ann/nextqa_action_antonym 15 | videos: 16 | path: datasets/NextQA/video_clips/NExTVideo_downsampled_5fps_downsized_224x224 -------------------------------------------------------------------------------- /src/configs/models/patch_and_fuse_intern_video.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: patch_and_fuse_internvideo 8 | 9 | load_pretrained: True 10 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 11 | 12 | preprocess: 13 | vis_processor: 14 | train: 15 | name: "video_train" 16 | image_size: 224 17 | eval: 18 | name: "internvideo_eval" 19 | image_size: 224 20 | text_processor: 21 | train: 22 | name: "vl_dynamic_ego4d_text" 23 | eval: 24 | name: "vl_dynamic_ego4d_text" 25 | -------------------------------------------------------------------------------- /src/configs/models/patch_and_fuse_clip_vip.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: patch_and_fuse_clipvip 8 | 9 | load_pretrained: True 10 | backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 11 | 12 | 13 | preprocess: 14 | vis_processor: 15 | train: 16 | name: "video_train" 17 | image_size: 224 18 | eval: 19 | name: "internvideo_eval" 20 | image_size: 224 21 | text_processor: 22 | train: 23 | name: "vl_dynamic_ego4d_text" 24 | eval: 25 | name: "vl_dynamic_ego4d_text" 26 | -------------------------------------------------------------------------------- /src/configs/models/patch_and_fuse_singularity.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: patch_and_fuse_singularity 8 | 9 | load_pretrained: True 10 | backbone_config_yaml: "Singularity/configs/pretrained_singularity_temporal_17m.yaml" 11 | 12 | 13 | preprocess: 14 | vis_processor: 15 | train: 16 | name: "video_train" 17 | image_size: 224 18 | eval: 19 | name: "internvideo_eval" 20 | image_size: 224 21 | text_processor: 22 | train: 23 | name: "vl_dynamic_ego4d_text" 24 | eval: 25 | name: "vl_dynamic_ego4d_text" 26 | -------------------------------------------------------------------------------- /src/configs/datasets/actionbench/actionbench_ssv2_224x224_5fps.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | actionbench_ssv2_224x224_5fps: # name of the dataset builder 3 | dataset_card: dataset_cards/actionbench_ssv2.md 4 | data_type: videos # [images|videos|features] 5 | 6 | build_info: 7 | # Be careful not to append minus sign (-) before split to avoid itemizing 8 | annotations: 9 | train: 10 | path: ActionBench/ssv2/original 11 | use_templates_as_labels: false 12 | val: 13 | path: ActionBench/ssv2/original 14 | use_templates_as_labels: false 15 | test: 16 | path: ActionBench/ssv2/original 17 | use_templates_as_labels: true # This needs to be true to be loaded properly 18 | videos: 19 | path: datasets/SSv2/video_clips/clips_downsampled_5fps_downsized_224x224 -------------------------------------------------------------------------------- /dataset_cards/downstream_tasks_ssv2.md: -------------------------------------------------------------------------------- 1 | ## Instruction for Downloading Videos 2 | refer to [actionbench_ssv2.md](./actionbench_ssv2.md) 3 | 4 | ## Downstream Task: SSv2-label and SSv2-template 5 | - paper: https://arxiv.org/abs/2206.03428 6 | - train size: 168913 7 | - val size: 2088 8 | - ann_path: `datasets/SSv2/ssv2_label_ssv2_template` 9 | - format SSv2-label: 10 | ``` 11 | [ 12 | {"video": ["62211.webm"], "caption": "spinning soap that quickly stops spinning"}, 13 | ] 14 | ``` 15 | - format SSv2-template: 16 | ``` 17 | [ 18 | {"video": ["62211.webm", "63095.webm", "174825.webm", "65027.webm", "65677.webm", "37955.webm", "9741.webm", "47588.webm", "31811.webm", "155308.webm", "6483.webm", "106444.webm"], "caption": "Spinning [something] that quickly stops spinning"} 19 | ] 20 | ``` -------------------------------------------------------------------------------- /src/Singularity/configs/beit-base-patch16-224-pt22k-ft22k.json: -------------------------------------------------------------------------------- 1 | { 2 | "note": "this file is a copy of the BEiT model config, not used directly", 3 | "architectures": [ 4 | "BeitForImageClassification" 5 | ], 6 | "url": "https://huggingface.co/microsoft/beit-base-patch16-224-pt22k-ft22k/raw/main/config.json", 7 | "attention_probs_dropout_prob": 0.0, 8 | "drop_path_rate": 0.1, 9 | "hidden_act": "gelu", 10 | "hidden_dropout_prob": 0.0, 11 | "hidden_size": 768, 12 | "image_size": 224, 13 | "initializer_range": 0.02, 14 | "intermediate_size": 3072, 15 | "layer_norm_eps": 1e-12, 16 | "layer_scale_init_value": 0.1, 17 | "model_type": "beit", 18 | "num_attention_heads": 12, 19 | "num_channels": 3, 20 | "num_hidden_layers": 12, 21 | "patch_size": 16, 22 | "torch_dtype": "float32", 23 | "transformers_version":
"4.11.0.dev0", 24 | "use_absolute_position_embeddings": false, 25 | "use_mask_token": false, 26 | "use_mean_pooling": true, 27 | "use_relative_position_bias": true, 28 | "use_shared_relative_position_bias": false, 29 | "vocab_size": 8192 30 | } 31 | -------------------------------------------------------------------------------- /ActionBench/src/ignored_verbs_ego4d.json: -------------------------------------------------------------------------------- 1 | [ 2 | "puts", 3 | "moves", 4 | "looks", 5 | "places", 6 | "takes", 7 | "keeps", 8 | "keep", 9 | "turns", 10 | "adjusts", 11 | "operates", 12 | "move", 13 | "look", 14 | "place", 15 | "take", 16 | "turn", 17 | "adjust", 18 | "operate", 19 | "using", 20 | "Puts", 21 | "carries", 22 | "plays", 23 | "saw", 24 | "uses", 25 | "sits", 26 | "applies", 27 | "reads", 28 | "left", 29 | "Looks", 30 | "gives", 31 | "checks", 32 | "drives", 33 | "cooking", 34 | "inspects", 35 | "watches", 36 | "put", 37 | "hand", 38 | "carry", 39 | "play", 40 | "use", 41 | "apply", 42 | "read", 43 | "Look", 44 | "give", 45 | "interact", 46 | "check", 47 | "drive", 48 | "cook", 49 | "watch", 50 | "pass", 51 | "passes", 52 | "fail", 53 | "fails", 54 | "show", 55 | "shows", 56 | "share", 57 | "shares", 58 | "keep", 59 | "keeps", 60 | "looking", 61 | "found", 62 | "find" 63 | ] -------------------------------------------------------------------------------- /src/configs/datasets/actionbench/actionbench_ssv2_antonyms_224x224_5fps.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | actionbench_ssv2_224x224_5fps: # name of the dataset builder 3 | dataset_card: dataset_cards/actionbench_ssv2.md 4 | data_type: videos # [images|videos|features] 5 | 6 | build_info: 7 | # Be careful not to append minus sign (-) before split to avoid itemizing 8 | annotations: 9 | train: 10 | path: ActionBench/ssv2/shuffled_object_and_action_antonyms 11 | use_templates_as_labels: false 12 | state_change_filtering_json: ActionBench/ssv2/shuffled_object_and_action_antonyms/state_change_heavy_instance_filtering_train.json 13 | val: 14 | path: ActionBench/ssv2/shuffled_object_and_action_antonyms 15 | use_templates_as_labels: false 16 | state_change_filtering_json: ActionBench/ssv2/shuffled_object_and_action_antonyms/state_change_heavy_instance_filtering_val.json 17 | test: 18 | path: ActionBench/ssv2/shuffled_object_and_action_antonyms 19 | use_templates_as_labels: true # This needs to be true to be loaded properly 20 | videos: 21 | path: datasets/SSv2/video_clips/clips_downsampled_5fps_downsized_224x224 -------------------------------------------------------------------------------- /ActionBench/src/README.md: -------------------------------------------------------------------------------- 1 | # Code Description 2 | You need to install `spacy`, `nltk` and `pyinflect` for using these script. 3 | 4 | ## get action antonyms 5 | - `get_action_antonyms_ego4d_ssv2.py`: contains example code for getting altered text sentences with verbs replaced with their antonyms. 
6 | - `get_action_antonyms()` is the main function for finding action antonyms given an original text annotation; 7 | - `get_action_antonyms_ego4d()` shows an example and some comments on processing Ego4d annotations; 8 | - variable `ADDITIONAL_ANTONYYMS_MAPPING` should contain a table of semi-automatically constructed verb-antonym pairs, which can be dataset-specific. (For Ego4d, we first get a list of verbs from the provided taxonomy, ask ChatGPT to generate an antonym for each of them, and then manually clean up the results.) 9 | - variable `IGNORED_VERBS` should contain a list of verbs that do not have a good antonym; this list is manually created; 10 | - `get_action_antonyms_ssv2()` shows a similar example on SSv2 11 | 12 | ## get shuffled objects 13 | - `get_object_shuffling_*`: contains example code for getting altered text sentences with object names replaced by a random object drawn from the dataset taxonomy (Ego4d and SSv2) -------------------------------------------------------------------------------- /src/run_scripts/inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export TOKENIZERS_PARALLELISM=false 3 | 4 | ### usage examples for running inference on different tasks ### 5 | # CONFIG is a .yaml file taken from configs/projects/eval 6 | 7 | ## == inference actionbench == ## 8 | INFERENCE_TYPE="physical_knowledge_bench" 9 | CONFIG="configs/projects/eval/actionbench/backbone/internvideo/ssv2/actionbench_ssv2_internvideo_backbone__action_antonym.yaml" 10 | 11 | ## == inference nextqa == ## 12 | INFERENCE_TYPE="downstream_task_next_qa" 13 | CONFIG="configs/projects/eval/downstream_task/nextqa/backbone_zero-shot.yaml" 14 | 15 | ## == inference ssv2-label == ## 16 | INFERENCE_TYPE="downstream_task_retrieval_v2t_ssv2_label" 17 | CONFIG="configs/projects/eval/downstream_task/ssv2_label/backbone_zero-shot.yaml" 18 | 19 | ## == inference ssv2-template == ## 20 | INFERENCE_TYPE="downstream_task_retrieval_v2t_ssv2_template" 21 | CONFIG="configs/projects/eval/downstream_task/ssv2_template/backbone_zero-shot.yaml" 22 | 23 | ## == inference temporal-ssv2 == ## 24 | INFERENCE_TYPE="downstream_task_retrieval_v2t_temporal_ssv2" 25 | CONFIG="configs/projects/eval/downstream_task/temporal_ssv2/backbone_zero-shot.yaml" 26 | 27 | # GPU settings (required by the command below; values follow eval_downstream_task.sh) 28 | DEVICES=0 # comma-separated list of GPU IDs 29 | N_GPU=1 # number of GPUs to use 30 | PORT=29501 31 | 32 | # run inference 33 | CUDA_VISIBLE_DEVICES=${DEVICES} python -m torch.distributed.run \ 34 | --nproc_per_node=${N_GPU} \ 35 | --master_port=${PORT} \ 36 | inference.py --cfg-path $CONFIG --inference_type $INFERENCE_TYPE -------------------------------------------------------------------------------- /dataset_cards/actionbench_ssv2.md: -------------------------------------------------------------------------------- 1 | # Action Dynamic Benchmark (ActionBench) on SSv2 2 | 3 | ## Instruction for Downloading Videos 4 | - Download the videos from [here](https://developer.qualcomm.com/software/ai-datasets/something-something) 5 | - Put the downloaded `.webm` files into `datasets/SSv2/video_clips/clips` 6 | - Run the preprocessing script (at the root dir of this repo): 7 | ``` 8 | python src/preprocessing/ssv2/downsample_downsize_video_clips.py 9 | ``` 10 | - The resulting preprocessed video clips are stored at `datasets/SSv2/video_clips/clips_downsampled_5fps_downsized_224x224` 11 | 12 | ## Annotation Details 13 | 14 | ### Action Antonym Task & Video Reversal Task & Object Shuffle Task 15 | - train: 162,475 16 | - validation: 23,807 17 | - ann_path: `ActionBench/ssv2/shuffled_object_and_action_antonyms` 18 | - format: 19 | ``` 20 | { 21
| "label": "Spinning cube that quickly stops spinning", 22 | "template": "Spinning something that quickly stops spinning", 23 | "placeholders": [ 24 | "cube" 25 | ], 26 | "template_action_antonym_clip_text": "Spinning something that quickly starts spinning", 27 | "label_action_antonym_clip_text": "Spinning cube that quickly starts spinning", 28 | "id": 74225, 29 | "label_object_shuffled_clip_text": "spinning feeding lid that quickly stops spinning" 30 | } 31 | ``` 32 | -------------------------------------------------------------------------------- /dataset_cards/nextqa.md: -------------------------------------------------------------------------------- 1 | ## Instruction for Downloading Videos 2 | - Download the videos following the instructions [here](https://github.com/doc-doc/NExT-QA) 3 | - Put the downloaded videos to `datasets/NextQA/video_clips/NExTVideo` 4 | - Run preprocessing script (at the root dir of this repo): 5 | ``` 6 | python src/preprocessing/nextqa/downsample_downsize_video_nextqa.py 7 | ``` 8 | - The resulting preprocessed video clips are stored at `datasets/NextQA/video_clips/NExTVideo_downsampled_5fps_downsized_224x224` 9 | 10 | ## Annotation Details 11 | - train size: 34132 12 | - val size: 4996 13 | - action antonym ann_path for patcher DVDM training: `datasets/NextQA/ann/nextqa_action_antonym` 14 | - format: 15 | ``` 16 | video,frame_count,width,height,question,answer,qid,type,a0,a1,a2,a3,a4,action_antonym_choices 17 | 4010069381,369,640,480,how do the two man play the instrument,0,6,CH,roll the handle,tap their feet,strum the string,hit with sticks,pat with hand,"['unwind the handle', 'hit their feet', 'missed with sticks']" 18 | ``` 19 | - original ann_path for finetuning and evaluation: `datasets/NextQA/ann/nextqa` 20 | - format: 21 | ``` 22 | video,frame_count,width,height,question,answer,qid,type,a0,a1,a2,a3,a4 23 | 4010069381,369,640,480,how do the two man play the instrument,0,6,CH,roll the handle,tap their feet,strum the string,hit with sticks,pat with hand 24 | ``` -------------------------------------------------------------------------------- /ActionBench/src/additional_antonyms_mapping_ssv2.json: -------------------------------------------------------------------------------- 1 | { 2 | "approaching": "moving away from", 3 | "attaching": "detaching", 4 | "bending": "straightening", 5 | "burying": "unearthing", 6 | "closing": "opening", 7 | "covering": "uncovering", 8 | "digging": "filling", 9 | "dropping": "picking up", 10 | "failing": "succeeding", 11 | "folding": "unfolding", 12 | "holding": "releasing", 13 | "laying": "picking up", 14 | "letting": "preventing", 15 | "lifting": "lowering", 16 | "towards": "away from", 17 | "opening": "closing", 18 | "picking": "putting", 19 | "piling": "scattering", 20 | "plugging": "removing", 21 | "pouring": "filling", 22 | "trying": "succeeding", 23 | "tearing": "mending", 24 | "close": "open", 25 | "open": "close", 26 | "pour": "fill", 27 | "scoop": "fill", 28 | "spread": "gather", 29 | "sprinkle": "dump", 30 | "squeeze": "expand", 31 | "take": "return", 32 | "throw": "catch", 33 | "pulling": "pushing", 34 | "pushing": "pulling", 35 | "removing": "placing", 36 | "rolling": "halting", 37 | "scooping": "filling", 38 | "showing": "concealing", 39 | "falling": "ascending", 40 | "spilling": "collecting", 41 | "squeezing": "expanding", 42 | "stuffing": "emptying", 43 | "throwing": "catching", 44 | "tilting": "righting", 45 | "tipping": "righting", 46 | "turning": "straightening", 47 | "wiping": "spilling", 48 | 
"twisting": "untwisting", 49 | "uncovering": "covering", 50 | "unfolding": "folding" 51 | } -------------------------------------------------------------------------------- /src/preprocessing/ego4d/downsample_downsize_video_clips.py: -------------------------------------------------------------------------------- 1 | # modified from EgoVLP https://github.com/showlab/EgoVLP/blob/main/utils/video_resize.py 2 | 3 | import os 4 | import time 5 | import sys 6 | import subprocess 7 | from multiprocessing import Pool, Value 8 | 9 | image_size = 224 10 | fps = 5 11 | 12 | original_clips = 'datasets/Ego4D/video_clips/clips' 13 | output_dir = f'datasets/Ego4D/video_clips/clips_downsampled_{fps}fps_downsized_{image_size}x{image_size}' 14 | 15 | os.makedirs(output_dir, exist_ok=True) 16 | 17 | def videos_resize(videoinfos): 18 | global count 19 | 20 | videoidx, videoname = videoinfos 21 | 22 | if os.path.exists(os.path.join(output_dir, videoname)): 23 | print(f'{videoname} already exists.') 24 | return 25 | 26 | inname = original_clips + '/' + videoname 27 | outname = output_dir + '/' + videoname 28 | 29 | # cmd = "ffmpeg -y -i {} -filter:v scale=\"trunc(oh*a/2)*2:256\" -c:a copy {}".format(inname, outname) 30 | cmd = f"ffmpeg -loglevel info -y -i {inname} -filter:v scale={image_size}:{image_size},fps={fps} -c:a copy {outname}" 31 | subprocess.call(cmd, shell=True) 32 | 33 | return 34 | 35 | 36 | if __name__ == "__main__": 37 | 38 | file_list = [] 39 | mp4_list = [item for item in os.listdir(original_clips) if item.endswith('.mp4')] # load mp4 files 40 | 41 | for idx, video in enumerate(mp4_list): 42 | file_list.append([idx, video]) 43 | 44 | print(file_list) 45 | print(len(file_list)) 46 | 47 | pool = Pool(8) 48 | pool.map(videos_resize, tuple(file_list)) -------------------------------------------------------------------------------- /src/configs/datasets/actionbench/actionbench_ego4d_224x224_5fps.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | actionbench_ego4d_224x224_5fps: # name of the dataset builder 3 | dataset_card: dataset_cards/actionbench_ego4d.md 4 | data_type: videos # [images|videos|features] 5 | 6 | build_info: 7 | # Be careful not to append minus sign (-) before split to avoid itemizing 8 | annotations: 9 | train: 10 | url: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/train.jsonl 11 | storage: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/train.jsonl 12 | state_change_filtering_json: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/state_change_heavy_instance_filtering_train.json 13 | val: 14 | url: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/val.jsonl 15 | storage: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/val.jsonl 16 | state_change_filtering_json: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/state_change_heavy_instance_filtering_val.json 17 | test: 18 | url: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/test.jsonl 19 | storage: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/test.jsonl 20 | state_change_filtering_json: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/state_change_heavy_instance_filtering_test.json 21 | videos: 22 | storage: datasets/Ego4D/video_clips/clips_downsampled_5fps_downsized_224x224 -------------------------------------------------------------------------------- 
/dataset_cards/downstream_tasks_temporal.md: -------------------------------------------------------------------------------- 1 | ## Instruction for Downloading Videos 2 | - **SSv2 videos**: refer to [actionbench_ssv2.md](./actionbench_ssv2.md). 3 | - **Kinetics400 videos**: download the subset of Kinetic400 that are required in Temporal-Kinetics 4 | - Install `yt-dlp` following the instructions [here](https://github.com/yt-dlp/yt-dlp.git) 5 | - Download the required videos using our provided script: 6 | ``` 7 | cd datasets/Temporal/ann 8 | bash download_kinetic_videos_yt_dlp.sh 9 | ``` 10 | - Put the downloaded videos into `datasets/Temporal/video_clips/kinetics400/clips` 11 | - Run preprocessing script (at the root dir of this repo): 12 | ``` 13 | python src/preprocessing/kinetics/downsample_downsize_video_clips.py 14 | ``` 15 | - The resulting preprocessed video clips are stored at `datasets/Temporal/video_clips/kinetics400/clips_downsampled_5fps_downsized_224x224` 16 | 17 | ## Annotation Details 18 | - paper: https://arxiv.org/abs/2301.02074 19 | - Temporal-kinetics size: 1309 | 32 action texts 20 | - Temporal-ssv2 size: 864 | 18 action texts 21 | - ann_path: `datasets/Temporal/ann/val-v1.0-2.4k.csv` 22 | - format: 23 | ``` 24 | ,index,video_id,text,dataset 25 | 4153,2561,169724,Approaching [something] with your camera,SSv2 26 | ... 27 | 188,2281,cartwheeling/RUNwB3-Qxqg_000007_000017,cartwheeling,kinetics 28 | ... 29 | ```` 30 | 31 | ## Video directory 32 | - Temporal-kinetics: `datasets/Temporal/video_clips/kinetics400/clips_downsampled_5fps_downsized_224x224` 33 | - Temporal-ssv2: `datasets/ssv2/video_clips/clips_downsampled_5fps_downsized_224x224` -------------------------------------------------------------------------------- /src/configs/datasets/actionbench/actionbench_ego4d_object_shuffled_224x224_5fps.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | actionbench_ego4d_224x224_5fps: # name of the dataset builder 3 | type: "object_shuffled" 4 | dataset_card: dataset_cards/actionbench_ego4d.md 5 | data_type: videos # [images|videos|features] 6 | 7 | build_info: 8 | # Be careful not to append minus sign (-) before split to avoid itemizing 9 | annotations: 10 | train: 11 | url: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/train.jsonl 12 | storage: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/train.jsonl 13 | state_change_filtering_json: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/state_change_heavy_instance_filtering_train.json 14 | val: 15 | url: ActionBench/ego4d/egoclip_subset_action_antonyms_object_shuffled_train_val_test_split/val.jsonl 16 | storage: ActionBench/ego4d/egoclip_subset_action_antonyms_object_shuffled_train_val_test_split/val.jsonl 17 | state_change_filtering_json: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/state_change_heavy_instance_filtering_val.json 18 | test: 19 | url: ActionBench/ego4d/egoclip_subset_action_antonyms_object_shuffled_train_val_test_split/test.jsonl 20 | storage: ActionBench/ego4d/egoclip_subset_action_antonyms_object_shuffled_train_val_test_split/test.jsonl 21 | state_change_filtering_json: ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/state_change_heavy_instance_filtering_test.json 22 | videos: 23 | storage: datasets/Ego4D/video_clips/clips_downsampled_5fps_downsized_224x224 -------------------------------------------------------------------------------- 
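The `downstream_tasks_temporal.md` card above describes a single annotation CSV (`val-v1.0-2.4k.csv`) whose `dataset` column distinguishes the Temporal-SSv2 and Temporal-Kinetics subsets, each paired with its own preprocessed video directory. As a rough illustration of how that CSV might be consumed, here is a short sketch; the helper name, the `.mp4` extension, and the assumption that preprocessed kinetics clips keep their `class/clip_id` sub-paths are inferred rather than taken from the repo's dataset builders.

```python
import os
import pandas as pd

ANN_CSV = "datasets/Temporal/ann/val-v1.0-2.4k.csv"
# Video directories listed in the dataset card / downstream_tasks_temporal yaml.
VIDEO_DIRS = {
    "SSv2": "datasets/SSv2/video_clips/clips_downsampled_5fps_downsized_224x224",
    "kinetics": "datasets/Temporal/video_clips/kinetics400/clips_downsampled_5fps_downsized_224x224",
}

def load_temporal_subset(subset):
    """Return (video_path, action_text) pairs for one subset: 'SSv2' or 'kinetics'."""
    df = pd.read_csv(ANN_CSV, index_col=0)  # the leading unnamed column is just a row index
    df = df[df["dataset"] == subset]
    pairs = []
    for _, row in df.iterrows():
        # Preprocessing re-encodes every clip to .mp4 (see downsample_downsize_video_clips.py),
        # so "169724" -> "169724.mp4"; the exact layout of the kinetics clips is an assumption.
        video_path = os.path.join(VIDEO_DIRS[subset], f"{row['video_id']}.mp4")
        pairs.append((video_path, row["text"]))
    return pairs

ssv2_pairs = load_temporal_subset("SSv2")          # 864 clips, 18 unique action texts per the card
kinetics_pairs = load_temporal_subset("kinetics")  # 1309 clips, 32 unique action texts
```

In the eval configs, these subsets are selected via `subset: ssv2` / `subset: kinetics` under the `downstream_tasks_temporal` dataset with `task: v1.0_2.4k`.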
/src/preprocessing/ssv2/downsample_downsize_video_clips.py: -------------------------------------------------------------------------------- 1 | # modified from EgoVLP https://github.com/showlab/EgoVLP/blob/main/utils/video_resize.py 2 | # Downsamples, downsizes, and converts to mp4 3 | 4 | import os 5 | import subprocess 6 | from multiprocessing import Pool 7 | from tqdm import tqdm 8 | from functools import partial 9 | 10 | image_size = 224 11 | fps = 5 12 | 13 | 14 | original_clips = 'datasets/SSv2/video_clips/clips' 15 | output_dir = f'datasets/SSv2/video_clips/clips_downsampled_{fps}fps_downsized_{image_size}x{image_size}' 16 | 17 | def resize_video(videoname, suppress_stdout=False, suppress_stderr=False): 18 | if os.path.exists(os.path.join(output_dir, videoname)): 19 | print(f'{videoname} already exists.') 20 | return 21 | 22 | inname = original_clips + '/' + videoname 23 | outname = output_dir + '/' + f'{videoname.split(".")[0]}.mp4' 24 | 25 | cmd = f"ffmpeg -loglevel info -y -i {inname} -filter:v scale={image_size}:{image_size},fps={fps} -c:a copy {outname}" 26 | 27 | kwargs = {} 28 | if suppress_stdout: 29 | kwargs['stdout'] = subprocess.DEVNULL 30 | if suppress_stderr: 31 | kwargs['stderr'] = subprocess.DEVNULL 32 | 33 | subprocess.run(cmd, shell=True, **kwargs) 34 | 35 | return 36 | 37 | if __name__ == "__main__": 38 | suppress_stdout = True 39 | suppress_stderr = True 40 | num_proc = 10 41 | 42 | os.makedirs(output_dir, exist_ok=True) 43 | webm_list = [item for item in os.listdir(original_clips) if item.endswith('.webm')] # load webm files 44 | print('Total files to consider:', len(webm_list)) 45 | 46 | resizer = partial(resize_video, suppress_stdout=suppress_stdout, suppress_stderr=suppress_stderr) 47 | for _ in tqdm(Pool(num_proc).imap_unordered(resizer, webm_list), total=len(webm_list)): 48 | pass -------------------------------------------------------------------------------- /src/_get_model_computational_complexity.py: -------------------------------------------------------------------------------- 1 | import torchvision.models as models 2 | import torch 3 | import torch.nn as nn 4 | 5 | from ptflops import get_model_complexity_info 6 | 7 | import InternVideo 8 | from models import ( 9 | PatchAndFuseInternVideo, 10 | KnowledgePatcherInternVideo_Baseline_Simple 11 | ) 12 | def set_up_device(gpu_index): 13 | # single gpu 14 | if torch.cuda.is_available() and gpu_index != -1: 15 | dev = f"cuda:{gpu_index}" 16 | else: 17 | dev = "cpu" 18 | return torch.device(dev) 19 | 20 | class Wrapper(nn.Module): 21 | def __init__(self, module) -> None: 22 | super(Wrapper, self).__init__() 23 | self.net = module 24 | def forward(self, x): 25 | return self.net.encode_video(x) 26 | 27 | with torch.cuda.device(1): 28 | # device = set_up_device(gpu_index=3) 29 | 30 | for model_name in [ 31 | "patch_and_fuse_internvideo", 32 | "patch_and_fuse_internvideo_baseline_simple" 33 | ]: 34 | 35 | print("model_name:",model_name) 36 | 37 | model_type = "InternVideo-MM-L-14" 38 | print("model_type:",model_type) 39 | 40 | # load_model 41 | if model_name == "patch_and_fuse_internvideo": 42 | module = PatchAndFuseInternVideo.from_pretrained(model_type=model_type) 43 | elif model_name == "patch_and_fuse_internvideo_baseline_simple": 44 | module = KnowledgePatcherInternVideo_Baseline_Simple.from_pretrained(model_type=model_type) 45 | 46 | model = Wrapper(module) 47 | 48 | macs, params = get_model_complexity_info(model, (8,3,224,224), as_strings=True, 49 | print_per_layer_stat=True, verbose=True) 50 | 
print('{:<30} {:<8}'.format('Computational complexity: ', macs)) 51 | print('{:<30} {:<8}'.format('Number of parameters: ', params)) -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/nextqa/backbone_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo_mcqa 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | 7 | datasets: 8 | downstream_tasks_qa_nextqa_224x224_5fps: 9 | type: "default" 10 | vis_processor: 11 | train: 12 | name: "video_train" 13 | image_size: 224 14 | eval: 15 | name: "internvideo_eval" 16 | image_size: 224 17 | text_processor: 18 | train: 19 | name: "minimum_text" 20 | eval: 21 | name: "minimum_text" 22 | 23 | # IMPORTANT configs: 24 | task: 5way-multiple-choice-qa # 25 | neg_sampling_same_clip: 0 26 | eval_only: True 27 | 28 | # other arguements 29 | train_k: null # sample a subset of k instances 30 | eval_k: null # sample a subset of k instances, reduce evaluation time 31 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 32 | num_frm: 8 33 | train_frame_height: 224 34 | train_frame_width: 224 35 | eval_frame_height: 224 36 | eval_frame_width: 224 37 | 38 | run: 39 | # use custom runner 40 | runner: runner_base_patch_and_fuse 41 | 42 | # task object name 43 | task: downstream_tasks_multi_choice_qa 44 | 45 | # which module is used for inference ["backbone", "knowledge_patcher"] 46 | eval_module: backbone 47 | eval_task: 5way-multiple-choice-qa 48 | 49 | batch_size_train: 32 50 | batch_size_eval: 8 51 | num_workers: 4 52 | 53 | seed: 42 54 | output_dir: "output/downstream_tasks/NextQA/eval/backbone_zero-shot" 55 | 56 | amp: False 57 | resume_ckpt_path: null 58 | 59 | evaluate: True 60 | 61 | # train_splits: ["train"] 62 | valid_splits: ["val"] 63 | test_splits: ["test"] 64 | 65 | device: "cuda" 66 | world_size: 1 67 | dist_url: "env://" 68 | distributed: True 69 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/temporal_ssv2/backbone_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | 8 | datasets: 9 | downstream_tasks_temporal: 10 | type: "default" 11 | vis_processor: 12 | train: 13 | name: "video_train" 14 | image_size: 224 15 | eval: 16 | name: "internvideo_eval" 17 | image_size: 224 18 | text_processor: 19 | train: 20 | name: "minimum_text" 21 | eval: 22 | name: "minimum_text" 23 | 24 | # IMPORTANT configs: 25 | fps: 5 26 | task: v1.0_2.4k 27 | subset: ssv2 28 | neg_sampling_same_clip: 0 29 | eval_only: True 30 | 31 | # other arguements 32 | train_k: null # sample a subset of k instances 33 | eval_k: null # sample a subset of k instances, reduce evaluation time 34 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 35 | num_frm: 8 36 | train_frame_height: 224 37 | train_frame_width: 224 38 | eval_frame_height: 224 39 | eval_frame_width: 224 40 | 41 | run: 42 | # use custom runner 43 | runner: runner_base_patch_and_fuse 44 | 45 | # task object name 46 | task: downstream_tasks_retrieval 47 | 48 | # which module is used for 
inference ["backbone", "knowledge_patcher"] 49 | eval_module: backbone 50 | eval_task: v1.0_2.4k 51 | 52 | batch_size_train: 32 53 | batch_size_eval: 4 54 | num_workers: 4 55 | 56 | seed: 42 57 | output_dir: "output/downstream_tasks/temporal_ssv2/backbone_zero-shot" 58 | 59 | amp: False 60 | resume_ckpt_path: null 61 | 62 | evaluate: True 63 | 64 | # train_splits: ["train"] 65 | valid_splits: ["val"] 66 | # test_splits: ["test"] 67 | 68 | device: "cuda" 69 | world_size: 1 70 | dist_url: "env://" 71 | distributed: True 72 | -------------------------------------------------------------------------------- /src/preprocessing/kinetics/downsample_downsize_video_clips.py: -------------------------------------------------------------------------------- 1 | # modified from EgoVLP https://github.com/showlab/EgoVLP/blob/main/utils/video_resize.py 2 | # Downsamples, downsizes, and converts to mp4 3 | 4 | import os 5 | import subprocess 6 | from multiprocessing import Pool 7 | from tqdm import tqdm 8 | from functools import partial 9 | 10 | image_size = 224 11 | fps = 5 12 | 13 | # original_clips = '' 14 | # output_dir = f'/clips_downsampled_{fps}fps_downsized_{image_size}x{image_size}' 15 | 16 | original_clips = 'datasets/Temporal/video_clips/kinetics400/clips' 17 | output_dir = f'datasets/Temporal/video_clips/kinetics400/clips_downsampled_{fps}fps_downsized_{image_size}x{image_size}' 18 | 19 | def resize_video(videoname, suppress_stdout=False, suppress_stderr=False): 20 | if os.path.exists(os.path.join(output_dir, videoname)): 21 | print(f'{videoname} already exists.') 22 | return 23 | 24 | inname = original_clips + '/' + videoname 25 | outname = output_dir + '/' + f'{videoname.split(".")[0]}.mp4' 26 | 27 | cmd = f"ffmpeg -loglevel info -y -i {inname} -filter:v scale={image_size}:{image_size},fps={fps} -c:a copy {outname}" 28 | 29 | kwargs = {} 30 | if suppress_stdout: 31 | kwargs['stdout'] = subprocess.DEVNULL 32 | if suppress_stderr: 33 | kwargs['stderr'] = subprocess.DEVNULL 34 | 35 | subprocess.run(cmd, shell=True, **kwargs) 36 | 37 | return 38 | 39 | if __name__ == "__main__": 40 | suppress_stdout = True 41 | suppress_stderr = True 42 | num_proc = 10 43 | 44 | os.makedirs(output_dir, exist_ok=True) 45 | mp4_list = [item for item in os.listdir(original_clips) if item.endswith('.mp4')] # load original mp4 files 46 | print('Total files to consider:', len(mp4_list)) 47 | 48 | resizer = partial(resize_video, suppress_stdout=suppress_stdout, suppress_stderr=suppress_stderr) 49 | for _ in tqdm(Pool(num_proc).imap_unordered(resizer, mp4_list), total=len(mp4_list)): 50 | pass -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/ssv2_label/backbone_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | 8 | datasets: 9 | downstream_tasks_retrieval_ssv2_224x224_5fps: 10 | type: "default" 11 | vis_processor: 12 | train: 13 | name: "video_train" 14 | image_size: 224 15 | eval: 16 | name: "internvideo_eval" 17 | image_size: 224 18 | text_processor: 19 | train: 20 | name: "minimum_text" 21 | eval: 22 | name: "minimum_text" 23 | 24 | # IMPORTANT configs: 25 | fps: 5 # if downsampled, use 5 fps 26 | task: ssv2_label # ssv2_label, ssv2_template 27 | neg_sampling_same_clip: 0 28 | eval_only: True 29 | 30 | # 
other arguements 31 | train_k: null # sample a subset of k instances 32 | eval_k: null # sample a subset of k instances, reduce evaluation time 33 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 34 | num_frm: 8 35 | train_frame_height: 224 36 | train_frame_width: 224 37 | eval_frame_height: 224 38 | eval_frame_width: 224 39 | 40 | run: 41 | # use custom runner 42 | runner: runner_base_patch_and_fuse 43 | 44 | # task object name 45 | task: downstream_tasks_retrieval 46 | 47 | # which module is used for inference ["backbone", "knowledge_patcher"] 48 | eval_module: backbone 49 | eval_task: ssv2_label 50 | 51 | batch_size_train: 32 52 | batch_size_eval: 4 53 | num_workers: 4 54 | 55 | seed: 42 56 | output_dir: "output/downstream_tasks/ssv2_label/backbone_zero-shot" 57 | 58 | amp: False 59 | resume_ckpt_path: null 60 | 61 | evaluate: True 62 | 63 | # train_splits: ["train"] 64 | valid_splits: ["val"] 65 | # test_splits: ["test"] 66 | 67 | device: "cuda" 68 | world_size: 1 69 | dist_url: "env://" 70 | distributed: True 71 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/ssv2_template/backbone_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | 8 | datasets: 9 | downstream_tasks_retrieval_ssv2_224x224_5fps: 10 | type: "default" 11 | vis_processor: 12 | train: 13 | name: "video_train" 14 | image_size: 224 15 | eval: 16 | name: "internvideo_eval" 17 | image_size: 224 18 | text_processor: 19 | train: 20 | name: "minimum_text" 21 | eval: 22 | name: "minimum_text" 23 | 24 | # IMPORTANT configs: 25 | fps: 5 # if downsampled, use 5 fps 26 | task: ssv2_template # ssv2_label, ssv2_template 27 | neg_sampling_same_clip: 0 28 | eval_only: True 29 | 30 | # other arguements 31 | train_k: null # sample a subset of k instances 32 | eval_k: null # sample a subset of k instances, reduce evaluation time 33 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 34 | num_frm: 8 35 | train_frame_height: 224 36 | train_frame_width: 224 37 | eval_frame_height: 224 38 | eval_frame_width: 224 39 | 40 | run: 41 | # use custom runner 42 | runner: runner_base_patch_and_fuse 43 | 44 | # task object name 45 | task: downstream_tasks_retrieval 46 | 47 | # which module is used for inference ["backbone", "knowledge_patcher"] 48 | eval_module: backbone 49 | eval_task: ssv2_template 50 | 51 | batch_size_train: 32 52 | batch_size_eval: 4 53 | num_workers: 4 54 | 55 | seed: 42 56 | output_dir: "output/downstream_tasks/ssv2_template/backbone_zero-shot" 57 | 58 | amp: False 59 | resume_ckpt_path: null 60 | 61 | evaluate: True 62 | 63 | # train_splits: ["train"] 64 | valid_splits: ["val"] 65 | # test_splits: ["test"] 66 | 67 | device: "cuda" 68 | world_size: 1 69 | dist_url: "env://" 70 | distributed: True 71 | -------------------------------------------------------------------------------- /ActionBench/src/get_object_shuffling_ssv2.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections import defaultdict 4 | from nltk.corpus import wordnet 5 | import nltk 6 | import spacy 7 | import pyinflect 8 | from tqdm import tqdm 9 | import random 10 | 11 | def 
get_object_shuffling_ssv2(output_path = None): 12 | ann_path = "../ssv2/antonyms/validation.json" 13 | assert output_path is not None 14 | object_taxonomy = json.load(open("../ssv2/object_taxonomy.json")) 15 | print(len(object_taxonomy)) 16 | 17 | val_annotations = json.load(open(ann_path)) 18 | 19 | output_annotations = [] 20 | for item in tqdm(val_annotations): 21 | orig_objects = item['placeholders'] 22 | 23 | cand_object_taxonomy = object_taxonomy.copy() 24 | for orig in orig_objects: 25 | if orig in cand_object_taxonomy: 26 | cand_object_taxonomy.remove(orig) 27 | 28 | cand_objects = random.sample(cand_object_taxonomy, len(orig_objects)) 29 | 30 | object_shuffled_text = item['label'].lower() 31 | for i, orig in enumerate(orig_objects): 32 | orig = orig.lower() 33 | # Find the index of the first occurrence of the substring 34 | index = object_shuffled_text.find(orig) 35 | assert index != -1 36 | # Replace the first occurrence of the substring with a new string 37 | object_shuffled_text = object_shuffled_text[:index] + cand_objects[i] + object_shuffled_text[index+len(orig):] 38 | 39 | assert object_shuffled_text != item['label'] 40 | item['label_object_shuffled_clip_text'] = object_shuffled_text 41 | 42 | output_annotations.append(item) 43 | 44 | assert len(output_annotations) == len(val_annotations) 45 | with open(output_path, 'w') as o: 46 | json.dump(output_annotations, o, indent=4) 47 | 48 | 49 | 50 | if __name__ == "__main__": 51 | 52 | random.seed(42) 53 | get_object_shuffling_ssv2(output_path = "") -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/moments_in_time/backbone_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | 7 | datasets: 8 | downstream_tasks_moment_in_time: 9 | type: "default" 10 | vis_processor: 11 | train: 12 | name: "video_train" 13 | image_size: 224 14 | eval: 15 | name: "internvideo_eval" 16 | image_size: 224 17 | text_processor: 18 | train: 19 | name: "minimum_text" 20 | eval: 21 | name: "minimum_text" 22 | 23 | # IMPORTANT configs: 24 | task: video_action_retrieval_2k 25 | neg_sampling_same_clip: 0 26 | eval_only: True 27 | 28 | # other arguements 29 | train_k: null # sample a subset of k instances 30 | eval_k: null # sample a subset of 3000 instances, reduce evaluation time 31 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 32 | num_frm: 8 33 | train_frame_height: 224 34 | train_frame_width: 224 35 | eval_frame_height: 224 36 | eval_frame_width: 224 37 | 38 | run: 39 | # use custom runner 40 | runner: runner_base_patch_and_fuse 41 | 42 | # task object name 43 | task: downstream_tasks_retrieval 44 | 45 | # which module is used for inference ["backbone", "knowledge_patcher"] 46 | eval_module: backbone 47 | eval_task: video_action_retrieval_2k 48 | 49 | ## NOTE: uncomment the following to use Backbone Ensemble 50 | # eval_method: ensemble_with_backbone 51 | 52 | batch_size_train: 32 53 | batch_size_eval: 4 54 | num_workers: 4 55 | 56 | seed: 42 57 | output_dir: "output/downstream_tasks/MomentsInTime/eval/backbone_zero-shot" 58 | 59 | amp: False 60 | resume_ckpt_path: null 61 | 62 | evaluate: True 63 | 64 | # train_splits: ["train"] 65 | valid_splits: ["val"] 66 | # test_splits: ["test"] 67 | 68 | device: "cuda" 
69 | world_size: 1 70 | dist_url: "env://" 71 | distributed: True 72 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/temporal_kinetics/backbone_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | 7 | datasets: 8 | downstream_tasks_temporal: 9 | type: "default" 10 | vis_processor: 11 | train: 12 | name: "video_train" 13 | image_size: 224 14 | eval: 15 | name: "internvideo_eval" 16 | image_size: 224 17 | text_processor: 18 | train: 19 | name: "minimum_text" 20 | eval: 21 | name: "minimum_text" 22 | 23 | # IMPORTANT configs: 24 | fps: 5 25 | task: v1.0_2.4k 26 | subset: kinetics 27 | neg_sampling_same_clip: 0 28 | eval_only: True 29 | 30 | # other arguements 31 | train_k: null # sample a subset of k instances 32 | eval_k: null # sample a subset of 3000 instances, reduce evaluation time 33 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 34 | num_frm: 8 35 | train_frame_height: 224 36 | train_frame_width: 224 37 | eval_frame_height: 224 38 | eval_frame_width: 224 39 | 40 | run: 41 | # use custom runner 42 | runner: runner_base_patch_and_fuse 43 | 44 | # task object name 45 | task: downstream_tasks_retrieval 46 | 47 | # which module is used for inference ["backbone", "knowledge_patcher"] 48 | eval_module: backbone 49 | eval_task: v1.0_2.4k 50 | 51 | ## NOTE: uncomment the following to use Backbone Ensemble 52 | # eval_method: ensemble_with_backbone 53 | 54 | batch_size_train: 32 55 | batch_size_eval: 4 56 | num_workers: 4 57 | 58 | seed: 42 59 | output_dir: "output/downstream_tasks/temporal-kinetics/eval/backbone_zero-shot" 60 | 61 | amp: False 62 | resume_ckpt_path: null 63 | 64 | evaluate: True 65 | 66 | # train_splits: ["train"] 67 | valid_splits: ["val"] 68 | # test_splits: ["test"] 69 | 70 | device: "cuda" 71 | world_size: 1 72 | dist_url: "env://" 73 | distributed: True 74 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/internvideo/ego4d/acdybench_ego4d_internvideo_backbone__action_antonym.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_internvideo 5 | model_type: InternVideo-MM-L-14 6 | load_pretrained: True 7 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 8 | 9 | 10 | datasets: 11 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 12 | vis_processor: 13 | train: 14 | name: "video_train" 15 | image_size: 224 16 | eval: 17 | name: "internvideo_eval" 18 | image_size: 224 19 | text_processor: 20 | train: 21 | name: "vl_dynamic_ego4d_text" 22 | eval: 23 | name: "vl_dynamic_ego4d_text" 24 | 25 | # IMPORTANT configs: 26 | fps: 5 # if downsampled, use 5 fps 27 | task: "action_antonym" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 28 | neg_sampling_same_clip: 0 # evaluation set to 0 29 | eval_only: True 30 | 31 | # other arguements 32 | k: null # sample a subset of k instances 33 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 34 | num_frm: 8 35 | 
train_frame_height: 224 36 | train_frame_width: 224 37 | eval_frame_height: 224 38 | eval_frame_width: 224 39 | 40 | run: 41 | # use custom runner 42 | runner: runner_base_patch_and_fuse 43 | 44 | # task object name 45 | task: actionbench 46 | 47 | # which module is used for inference ["backbone", "knowledge_patcher"] 48 | eval_module: backbone 49 | eval_task: action_antonym 50 | 51 | batch_size_train: 32 52 | batch_size_eval: 4 53 | num_workers: 4 54 | 55 | seed: 42 56 | output_dir: "output/actionbench/eval/InternVideo/ego4d__InternVideo_backbone__action_antonym" 57 | 58 | amp: False 59 | resume_ckpt_path: null 60 | 61 | evaluate: True 62 | # train_splits: ["train"] 63 | # valid_splits: ["val"] 64 | test_splits: ["test"] 65 | 66 | device: "cuda" 67 | world_size: 1 68 | dist_url: "env://" 69 | distributed: True 70 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/internvideo/ego4d/acdybench_ego4d_internvideo_backbone__reversed_video.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | 4 | model: 5 | arch: patch_and_fuse_internvideo 6 | model_type: InternVideo-MM-L-14 7 | load_pretrained: True 8 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 9 | 10 | 11 | 12 | datasets: 13 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 14 | vis_processor: 15 | train: 16 | name: "video_train" 17 | image_size: 224 18 | eval: 19 | name: "internvideo_eval" 20 | image_size: 224 21 | text_processor: 22 | train: 23 | name: "vl_dynamic_ego4d_text" 24 | eval: 25 | name: "vl_dynamic_ego4d_text" 26 | 27 | # IMPORTANT configs: 28 | fps: 5 # if downsampled, use 5 fps 29 | task: "reversed_video" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 30 | neg_sampling_same_clip: 0 # evaluation set to 0 31 | eval_only: True 32 | 33 | # other arguements 34 | k: null # sample a subset of k instances 35 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 36 | num_frm: 8 37 | train_frame_height: 224 38 | train_frame_width: 224 39 | eval_frame_height: 224 40 | eval_frame_width: 224 41 | 42 | run: 43 | # use custom runner 44 | runner: runner_base_patch_and_fuse 45 | 46 | # task object name 47 | task: actionbench 48 | 49 | # which module is used for inference ["backbone", "knowledge_patcher"] 50 | eval_module: backbone 51 | eval_task: reversed_video 52 | 53 | batch_size_train: 32 54 | batch_size_eval: 4 55 | num_workers: 4 56 | 57 | seed: 42 58 | output_dir: "output/actionbench/eval/InternVideo/ego4d__InternVideo_backbone__reversed_video" 59 | 60 | amp: False 61 | resume_ckpt_path: null 62 | 63 | evaluate: True 64 | # train_splits: ["train"] 65 | # valid_splits: ["val"] 66 | test_splits: ["test"] 67 | 68 | device: "cuda" 69 | world_size: 1 70 | dist_url: "env://" 71 | distributed: True 72 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/internvideo/ssv2/acdybench_ssv2_internvideo_backbone__action_antonym.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_internvideo 5 | model_type: InternVideo-MM-L-14 6 | load_pretrained: True 7 | backbone_pretrained: 
"pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 8 | 9 | 10 | datasets: 11 | actionbench_ssv2_224x224_5fps: 12 | type: "action_antonyms_and_object_shuffled" 13 | vis_processor: 14 | train: 15 | name: "video_train" 16 | image_size: 224 17 | eval: 18 | name: "internvideo_eval" 19 | image_size: 224 20 | text_processor: 21 | train: 22 | name: "minimum_text" 23 | eval: 24 | name: "minimum_text" 25 | 26 | # IMPORTANT configs: 27 | fps: 5 # if downsampled, use 5 fps 28 | task: "action_antonym" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 29 | neg_sampling_same_clip: 0 30 | eval_only: True 31 | 32 | # other arguements 33 | train_k: null # sample a subset of k instances 34 | eval_k: null # sample a subset of k instances, reduce evaluation time 35 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 36 | num_frm: 8 37 | train_frame_height: 224 38 | train_frame_width: 224 39 | eval_frame_height: 224 40 | eval_frame_width: 224 41 | 42 | run: 43 | # use custom runner 44 | runner: runner_base_patch_and_fuse 45 | 46 | # task object name 47 | task: actionbench 48 | 49 | # which module is used for inference ["backbone", "knowledge_patcher"] 50 | eval_module: backbone 51 | eval_task: action_antonym 52 | 53 | batch_size_train: 32 54 | batch_size_eval: 4 55 | num_workers: 4 56 | 57 | seed: 42 58 | output_dir: "output/actionbench/eval/InternVideo/ssv2__InternVideo_backbone__action_antonym" 59 | 60 | amp: False 61 | resume_ckpt_path: null 62 | 63 | evaluate: True 64 | # train_splits: ["train"] 65 | valid_splits: ["val"] 66 | # test_splits: ["test"] 67 | 68 | device: "cuda" 69 | world_size: 1 70 | dist_url: "env://" 71 | distributed: True 72 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/internvideo/ssv2/acdybench_ssv2_internvideo_backbone__reversed_video.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | 4 | model: 5 | arch: patch_and_fuse_internvideo 6 | model_type: InternVideo-MM-L-14 7 | load_pretrained: True 8 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 9 | 10 | 11 | datasets: 12 | actionbench_ssv2_224x224_5fps: 13 | type: "action_antonyms_and_object_shuffled" 14 | vis_processor: 15 | train: 16 | name: "video_train" 17 | image_size: 224 18 | eval: 19 | name: "internvideo_eval" 20 | image_size: 224 21 | text_processor: 22 | train: 23 | name: "minimum_text" 24 | eval: 25 | name: "minimum_text" 26 | 27 | # IMPORTANT configs: 28 | fps: 5 # if downsampled, use 5 fps 29 | task: "reversed_video" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 30 | neg_sampling_same_clip: 0 31 | eval_only: True 32 | 33 | # other arguements 34 | train_k: null # sample a subset of k instances 35 | eval_k: null # sample a subset of k instances, reduce evaluation time 36 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 37 | num_frm: 8 38 | train_frame_height: 224 39 | train_frame_width: 224 40 | eval_frame_height: 224 41 | eval_frame_width: 224 42 | 43 | run: 44 | # use custom runner 45 | runner: runner_base_patch_and_fuse 46 | 47 | # task object name 48 | task: actionbench 49 | 50 | # which module is used for inference ["backbone", "knowledge_patcher"] 51 | eval_module: backbone 52 | eval_task: reversed_video 53 | 54 | batch_size_train: 32 55 | 
batch_size_eval: 4 56 | num_workers: 4 57 | 58 | seed: 42 59 | output_dir: "output/actionbench/eval/InternVideo/ssv2__InternVideo_backbone__reversed_video" 60 | 61 | amp: False 62 | resume_ckpt_path: null 63 | 64 | evaluate: True 65 | # train_splits: ["train"] 66 | valid_splits: ["val"] 67 | # test_splits: ["test"] 68 | 69 | device: "cuda" 70 | world_size: 1 71 | dist_url: "env://" 72 | distributed: True 73 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/internvideo/ssv2/acdybench_ssv2_internvideo_backbone__object_shuffle.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | 4 | model: 5 | arch: patch_and_fuse_internvideo 6 | model_type: InternVideo-MM-L-14 7 | load_pretrained: True 8 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 9 | 10 | 11 | 12 | datasets: 13 | actionbench_ssv2_224x224_5fps: 14 | type: "action_antonyms_and_object_shuffled" 15 | vis_processor: 16 | train: 17 | name: "video_train" 18 | image_size: 224 19 | eval: 20 | name: "internvideo_eval" 21 | image_size: 224 22 | text_processor: 23 | train: 24 | name: "minimum_text" 25 | eval: 26 | name: "minimum_text" 27 | 28 | # IMPORTANT configs: 29 | fps: 5 # if downsampled, use 5 fps 30 | task: "object_shuffle" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 31 | neg_sampling_same_clip: 0 32 | eval_only: True 33 | 34 | # other arguements 35 | train_k: null # sample a subset of k instances 36 | eval_k: null # sample a subset of k instances, reduce evaluation time 37 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 38 | num_frm: 8 39 | train_frame_height: 224 40 | train_frame_width: 224 41 | eval_frame_height: 224 42 | eval_frame_width: 224 43 | 44 | run: 45 | # use custom runner 46 | runner: runner_base_patch_and_fuse 47 | 48 | # task object name 49 | task: actionbench 50 | 51 | # which module is used for inference ["backbone", "knowledge_patcher"] 52 | eval_module: backbone 53 | eval_task: object_shuffle 54 | 55 | batch_size_train: 32 56 | batch_size_eval: 4 57 | num_workers: 4 58 | 59 | seed: 42 60 | output_dir: "output/actionbench/eval/InternVideo/ssv2__InternVideo_backbone__object_shuffle" 61 | 62 | amp: False 63 | resume_ckpt_path: null 64 | 65 | evaluate: True 66 | # train_splits: ["train"] 67 | valid_splits: ["val"] 68 | # test_splits: ["test"] 69 | 70 | device: "cuda" 71 | world_size: 1 72 | dist_url: "env://" 73 | distributed: True 74 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/internvideo/ego4d/acdybench_ego4d_internvideo_backbone__object_shuffle.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | 4 | model: 5 | arch: patch_and_fuse_internvideo 6 | model_type: InternVideo-MM-L-14 7 | load_pretrained: True 8 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 9 | 10 | 11 | 12 | datasets: 13 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 14 | type: "object_shuffled" 15 | vis_processor: 16 | train: 17 | name: "video_train" 18 | image_size: 224 19 | eval: 20 | name: "internvideo_eval" 21 | image_size: 224 22 | text_processor: 23 | train: 24 
| name: "vl_dynamic_ego4d_text" 25 | eval: 26 | name: "vl_dynamic_ego4d_text" 27 | 28 | # IMPORTANT configs: 29 | fps: 5 # if downsampled, use 5 fps 30 | task: "object_shuffle" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 31 | neg_sampling_same_clip: 0 # evaluation set to 0 32 | eval_only: True 33 | 34 | # other arguements 35 | k: null # sample a subset of k instances 36 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 37 | num_frm: 8 38 | train_frame_height: 224 39 | train_frame_width: 224 40 | eval_frame_height: 224 41 | eval_frame_width: 224 42 | 43 | run: 44 | # use custom runner 45 | runner: runner_base_patch_and_fuse 46 | 47 | # task object name 48 | task: actionbench 49 | 50 | # which module is used for inference ["backbone", "knowledge_patcher"] 51 | eval_module: backbone 52 | eval_task: object_shuffle 53 | 54 | batch_size_train: 32 55 | batch_size_eval: 4 56 | num_workers: 4 57 | 58 | seed: 42 59 | output_dir: "output/actionbench/eval/InternVideo/ego4d__InternVideo_backbone__object_shuffle" 60 | 61 | amp: False 62 | resume_ckpt_path: null 63 | 64 | evaluate: True 65 | # train_splits: ["train"] 66 | # valid_splits: ["val"] 67 | test_splits: ["test"] 68 | 69 | device: "cuda" 70 | world_size: 1 71 | dist_url: "env://" 72 | distributed: True 73 | -------------------------------------------------------------------------------- /src/Singularity/dataset/dataloader.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | from utils.distributed import get_rank, is_dist_avail_and_initialized, is_main_process 4 | import random 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class MetaLoader(object): 11 | """ wraps multiple data loader """ 12 | def __init__(self, name2loader): 13 | """Iterates over multiple dataloaders, it ensures all processes 14 | work on data from the same dataloader. This loader will end when 15 | the shorter dataloader raises StopIteration exception. 
16 | 17 | loaders: Dict, {name: dataloader} 18 | """ 19 | self.name2loader = name2loader 20 | self.name2iter = {name: iter(l) for name, l in name2loader.items()} 21 | name2index = {name: idx for idx, (name, l) in enumerate(name2loader.items())} 22 | index2name = {v: k for k, v in name2index.items()} 23 | 24 | iter_order = [] 25 | for n, l in name2loader.items(): 26 | iter_order.extend([name2index[n]]*len(l)) 27 | 28 | random.shuffle(iter_order) 29 | iter_order = torch.Tensor(iter_order).to(torch.device("cuda")).to(torch.uint8) 30 | 31 | # sync 32 | if is_dist_avail_and_initialized(): 33 | # make sure all processes have the same order so that 34 | # each step they will have data from the same loader 35 | dist.broadcast(iter_order, src=0) 36 | self.iter_order = [index2name[int(e.item())] for e in iter_order.cpu()] 37 | 38 | logger.info(str(self)) 39 | 40 | def __str__(self): 41 | output = [f"MetaLoader has {len(self.name2loader)} dataloaders, {len(self)} batches in total"] 42 | for idx, (name, loader) in enumerate(self.name2loader.items()): 43 | output.append( 44 | f"dataloader index={idx} name={name}, batch-size={loader.batch_size} length(#batches)={len(loader)} " 45 | ) 46 | return "\n".join(output) 47 | 48 | def __len__(self): 49 | return len(self.iter_order) 50 | 51 | def __iter__(self): 52 | """ this iterator will run indefinitely """ 53 | for name in self.iter_order: 54 | _iter = self.name2iter[name] 55 | batch = next(_iter) 56 | yield name, batch 57 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/lsmdc_retrieval/lsmdc_retrieval_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "lsmdc-101k", 5 | "vis_format": "video", 6 | "txt": "clip_data/vis_db/lsmdc/train_101k_frame.jsonl", 7 | "vis": "datasets/lsmdc" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "lsmdc-1k", 13 | "vis_format": "video", 14 | "txt": "clip_data/vis_db/lsmdc/test_1k_frame.jsonl", 15 | "vis": "datasets/lsmdc" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "lsmdc-1k", 21 | "vis_format": "video", 22 | "txt": "clip_data/vis_db/lsmdc/test_1k_frame.jsonl", 23 | "vis": "datasets/lsmdc" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch16", 39 | "clip_config": "openai/clip-vit-base-patch16", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 10, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | 
"save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/lsmdc_retrieval/lsmdc_retrieval_vip_base_16", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/lsmdc_retrieval/lsmdc_retrieval_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "lsmdc-101k", 5 | "vis_format": "video", 6 | "txt": "clip_data/vis_db/lsmdc/train_101k_frame.jsonl", 7 | "vis": "datasets/lsmdc" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "lsmdc-1k", 13 | "vis_format": "video", 14 | "txt": "clip_data/vis_db/lsmdc/test_1k_frame.jsonl", 15 | "vis": "datasets/lsmdc" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "lsmdc-1k", 21 | "vis_format": "video", 22 | "txt": "clip_data/vis_db/lsmdc/test_1k_frame.jsonl", 23 | "vis": "datasets/lsmdc" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/32/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch32", 39 | "clip_config": "openai/clip-vit-base-patch32", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 10, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/lsmdc_retrieval/lsmdc_retrieval_vip_base_32", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/didemo_retrieval/didemo_retrieval_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "didemo-train", 5 | "vis_format": "video", 6 | "txt": "datasets/lfvideo_data/task/didemo/train.jsonl", 7 | "vis": "datasets/didemo/didemo_video_xfps/" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "didemo-val", 13 | "vis_format": "video", 14 | "txt": "datasets/lfvideo_data/task/didemo/val.jsonl", 15 | "vis": "datasets/didemo/didemo_video_xfps/" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "didemo-test", 21 | "vis_format": "video", 22 | "txt": "datasets/lfvideo_data/task/didemo/test.jsonl", 23 | "vis": "datasets/didemo/didemo_video_xfps/" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | 
"input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/32/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch32", 39 | "clip_config": "openai/clip-vit-base-patch32", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 20, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/didemo_retrieval/didemo_retrieval_vip_base_32", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/didemo_retrieval/didemo_retrieval_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "didemo-train", 5 | "vis_format": "video", 6 | "txt": "datasets/lfvideo_data/task/didemo/train.jsonl", 7 | "vis": "datasets/didemo/didemo_video_xfps/" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "didemo-val", 13 | "vis_format": "video", 14 | "txt": "datasets/lfvideo_data/task/didemo/val.jsonl", 15 | "vis": "datasets/didemo/didemo_video_xfps/" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "didemo-test", 21 | "vis_format": "video", 22 | "txt": "datasets/lfvideo_data/task/didemo/test.jsonl", 23 | "vis": "datasets/didemo/didemo_video_xfps/" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 70, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch16", 39 | "clip_config": "openai/clip-vit-base-patch16", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 10, 63 | "lr_mul_prefix": "logit_scale", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 20, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/didemo_retrieval/didemo_retrieval_vip_base_16", 78 
| "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /src/preprocessing/nextqa/downsample_downsize_video_nextqa.py: -------------------------------------------------------------------------------- 1 | # modified from EgoVLP https://github.com/showlab/EgoVLP/blob/main/utils/video_resize.py 2 | # Downsamples, downsizes, and converts to mp4 3 | 4 | import os 5 | import subprocess 6 | from multiprocessing import Pool 7 | from tqdm import tqdm 8 | from functools import partial 9 | from glob import glob 10 | 11 | def resize_video(input_output_path, suppress_stdout=False, suppress_stderr=False): 12 | input_path, output_path = input_output_path 13 | 14 | if os.path.exists(output_path): 15 | print(f'{output_path} already exists.') 16 | return 17 | 18 | cmd = f"ffmpeg -loglevel info -y -i {input_path} -filter:v scale={image_size}:{image_size},fps={fps} -c:a copy {output_path}" 19 | 20 | kwargs = {} 21 | if suppress_stdout: 22 | kwargs['stdout'] = subprocess.DEVNULL 23 | if suppress_stderr: 24 | kwargs['stderr'] = subprocess.DEVNULL 25 | 26 | subprocess.run(cmd, shell=True, **kwargs) 27 | 28 | return 29 | 30 | if __name__ == "__main__": 31 | suppress_stdout = True 32 | suppress_stderr = True 33 | num_proc = 10 34 | 35 | image_size = 224 36 | fps = 5 37 | 38 | original_clips = 'datasets/NextQA/video_clips/NExTVideo' 39 | output_dir = f'datasets/NextQA/video_clips/NExTVideo_downsampled_{fps}fps_downsized_{image_size}x{image_size}' 40 | 41 | os.makedirs(output_dir, exist_ok=True) 42 | 43 | input_output_paths = [] 44 | 45 | input_dirs = glob(os.path.join(original_clips, "*")) 46 | for d in input_dirs: 47 | input_paths = glob(os.path.join(d, "*.mp4")) 48 | input_dir_name = os.path.basename(d) 49 | for ip in input_paths: 50 | video_name = os.path.basename(ip) 51 | os.makedirs(os.path.join(output_dir, input_dir_name), exist_ok=True) 52 | op = os.path.join(output_dir, input_dir_name, video_name) 53 | input_output_paths.append((ip,op)) 54 | 55 | # mp4_list = [item for item in os.listdir(original_clips) if item.endswith('.mp4')] # load original mp4 files 56 | # print('Total files to consider:', len(mp4_list)) 57 | 58 | print('Total files to consider:', len(input_output_paths)) 59 | 60 | 61 | resizer = partial(resize_video, suppress_stdout=suppress_stdout, suppress_stderr=suppress_stderr) 62 | for _ in tqdm(Pool(num_proc).imap_unordered(resizer, input_output_paths), total=len(input_output_paths)): 63 | pass -------------------------------------------------------------------------------- /src/ClipViP/src/configs/msrvtt_retrieval/msrvtt_retrieval_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "msrvtt-9k", 5 | "vis_format": "video", 6 | "txt": "clip_data/vis_db/msrvtt_video_clips/train9k.jsonl", 7 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "msrvtt-1ka", 13 | "vis_format": "video", 14 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 15 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "msrvtt-1ka", 21 | "vis_format": "video", 22 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 23 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | 
"test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch16", 39 | "clip_config": "openai/clip-vit-base-patch16", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 100, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/msrvtt_retrieval/msrvtt_retrieval_vip_base_16", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/msrvtt_retrieval/msrvtt_retrieval_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "msrvtt-9k", 5 | "vis_format": "video", 6 | "txt": "clip_data/vis_db/msrvtt_video_clips/train9k.jsonl", 7 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "msrvtt-1ka", 13 | "vis_format": "video", 14 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 15 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "msrvtt-1ka", 21 | "vis_format": "video", 22 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 23 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/32/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch32", 39 | "clip_config": "openai/clip-vit-base-patch32", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 100, 73 | "min_valid_steps": 1, 74 
| "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/msrvtt_retrieval/msrvtt_retrieval_vip_base_32", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/actnet_retrieval/actnet_retrieval_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "actnet-train", 5 | "vis_format": "frame", 6 | "txt": "clip_data/vis_db/anet_retrieval/train.jsonl", 7 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "actnet-test", 13 | "vis_format": "frame", 14 | "txt": "clip_data/vis_db/anet_retrieval/val1.jsonl", 15 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "actnet-test", 21 | "vis_format": "frame", 22 | "txt": "clip_data/vis_db/anet_retrieval/val1.jsonl", 23 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 32, 29 | "test_n_clips": 1, 30 | "test_num_frms": 32, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 70, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch16", 39 | "clip_config": "openai/clip-vit-base-patch16", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 8, 49 | "test_batch_size": 8, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 20, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/actnet_retrieval/actnet_retrieval_vip_base_16", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/actnet_retrieval/actnet_retrieval_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "actnet-train", 5 | "vis_format": "frame", 6 | "txt": "clip_data/vis_db/anet_retrieval/train.jsonl", 7 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "actnet-test", 13 | "vis_format": "frame", 14 | "txt": "clip_data/vis_db/anet_retrieval/val1.jsonl", 15 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "actnet-test", 21 | "vis_format": "frame", 22 | "txt": 
"clip_data/vis_db/anet_retrieval/val1.jsonl", 23 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 32, 29 | "test_n_clips": 1, 30 | "test_num_frms": 32, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 70, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/32/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch32", 39 | "clip_config": "openai/clip-vit-base-patch32", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 20, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/actnet_retrieval/actnet_retrieval_vip_base_32", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /src/ClipViP/src/utils/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def cal_cossim(feats1, feats2): 4 | sim_matrix = np.dot(feats1, feats2.T) 5 | return sim_matrix 6 | 7 | def np_softmax(X, theta = 1.0, axis = None): 8 | """ 9 | Compute the softmax of each element along an axis of X. 10 | 11 | Parameters 12 | ---------- 13 | X: ND-Array. Probably should be floats. 14 | theta (optional): float parameter, used as a multiplier 15 | prior to exponentiation. Default = 1.0 16 | axis (optional): axis to compute values along. Default is the 17 | first non-singleton axis. 18 | 19 | Returns an array the same size as X. The result will sum to 1 20 | along the specified axis. 
21 | """ 22 | # make X at least 2d 23 | y = np.atleast_2d(X) 24 | # find axis 25 | if axis is None: 26 | axis = next(j[0] for j in enumerate(y.shape) if j[1] > 1) 27 | # multiply y against the theta parameter, 28 | y = y * float(theta) 29 | # subtract the max for numerical stability 30 | y = y - np.expand_dims(np.max(y, axis = axis), axis) 31 | # exponentiate y 32 | y = np.exp(y) 33 | # take the sum along the specified axis 34 | ax_sum = np.expand_dims(np.sum(y, axis = axis), axis) 35 | # finally: divide elementwise 36 | p = y / ax_sum 37 | # flatten if X was 1D 38 | if len(X.shape) == 1: p = p.flatten() 39 | return p 40 | 41 | def compute_metrics(x): 42 | sx = np.sort(-x, axis=1) 43 | d = np.diag(-x) 44 | d = d[:, np.newaxis] 45 | ind = sx - d 46 | ind = np.where(ind == 0) 47 | ind = ind[1] 48 | r1 = float(np.sum(ind == 0)) / len(ind) 49 | r5 = float(np.sum(ind < 5)) / len(ind) 50 | r10 = float(np.sum(ind < 10)) / len(ind) 51 | medr = np.median(ind) + 1 52 | meanr = np.mean(ind) + 1 53 | return r1, r5, r10, medr, meanr 54 | 55 | def compute_metrics_multi(x, t2v_labels_list): 56 | sx = np.sort(-x, axis=1) 57 | t2v_labels_list = np.array(t2v_labels_list) 58 | arg = np.arange(x.shape[0]) 59 | d = -x[arg, t2v_labels_list] 60 | d = d[:, np.newaxis] 61 | ind = sx - d 62 | ind = np.where(ind == 0) 63 | ind = ind[1] 64 | r1 = float(np.sum(ind == 0)) / len(ind) 65 | r5 = float(np.sum(ind < 5)) / len(ind) 66 | r10 = float(np.sum(ind < 10)) / len(ind) 67 | medr = np.median(ind) + 1 68 | meanr = np.mean(ind) + 1 69 | return r1, r5, r10, medr, meanr 70 | 71 | 72 | if __name__ == '__main__': 73 | 74 | sim_matrix = np.random.random((5,5)) 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /src/Singularity/dataset/base_dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from dataset.utils import load_image_from_path 3 | import random 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class ImageVideoBaseDataset(Dataset): 10 | """Base class that implements the image and video loading methods""" 11 | media_type = "video" 12 | 13 | def __init__(self): 14 | assert self.media_type in ["image", "video"] 15 | self.anno_list = None # list(dict), each dict contains {"image": str, # image or video path} 16 | self.transform = None 17 | self.video_reader = None 18 | self.num_tries = None 19 | 20 | def __getitem__(self, index): 21 | raise NotImplementedError 22 | 23 | def __len__(self): 24 | raise NotImplementedError 25 | 26 | def load_and_transform_media_data(self, index): 27 | if self.media_type == "image": 28 | return self.load_and_transform_media_data_image(index) 29 | else: 30 | return self.load_and_transform_media_data_video(index) 31 | 32 | def load_and_transform_media_data_image(self, index): 33 | ann = self.anno_list[index] 34 | data_path = ann["image"] 35 | image = load_image_from_path(data_path) 36 | image = self.transform(image) 37 | return image, index 38 | 39 | def load_and_transform_media_data_video(self, index): 40 | for i in range(self.num_tries): 41 | ann = self.anno_list[index] 42 | data_path = ann["image"] 43 | try: 44 | max_num_frames = self.max_num_frames \ 45 | if hasattr(self, "max_num_frames") else -1 46 | frames, frame_indices, video_duration = self.video_reader( 47 | data_path, self.num_frames, self.sample_type, 48 | max_num_frames=max_num_frames 49 | ) 50 | except Exception as e: 51 | index = random.randint(0, len(self) - 1) 52 | 
logger.warning( 53 | f"Caught exception {e} when loading video {data_path}, " 54 | f"randomly sample a new video as replacement") 55 | continue 56 | 57 | frames = self.transform(frames) 58 | return frames, index 59 | else: 60 | raise RuntimeError( 61 | f"Failed to fetch video after {self.num_tries} tries. " 62 | f"This might indicate that you have many corrupted videos." 63 | ) 64 | -------------------------------------------------------------------------------- /src/Singularity/dataset/qa_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataset.base_dataset import ImageVideoBaseDataset 3 | from dataset.utils import pre_text, load_anno 4 | from dataset.video_utils import VIDEO_READER_FUNCS 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class ImageQADataset(ImageVideoBaseDataset): 11 | media_type = "image" 12 | 13 | def __init__(self, ann_file, transform, eos="[SEP]", mode="train", answer_list=None): 14 | super(ImageQADataset, self).__init__() 15 | assert mode in ["train", "eval"] 16 | self.mode = mode 17 | self.transform = transform 18 | self.eos = eos 19 | 20 | self.anno_list = load_anno(ann_file) 21 | 22 | if mode == "eval": 23 | self.answer_list = json.load(open(answer_list, "r")) 24 | 25 | def __len__(self): 26 | return len(self.anno_list) 27 | 28 | def get_answers_with_weights(self, raw_answers): 29 | if isinstance(raw_answers, str): 30 | raw_answers = [raw_answers] 31 | answer_weight = {} 32 | for answer in raw_answers: 33 | if answer in answer_weight.keys(): 34 | answer_weight[answer] += 1/len(raw_answers) 35 | else: 36 | answer_weight[answer] = 1/len(raw_answers) 37 | 38 | answers = list(answer_weight.keys()) 39 | weights = [answer_weight[a] for a in answers] 40 | answers = [answer + " " + self.eos for answer in answers] 41 | return answers, weights 42 | 43 | def __getitem__(self, index): 44 | ann = self.anno_list[index] 45 | image, index = self.load_and_transform_media_data(index) 46 | 47 | question = pre_text(ann["question"]) 48 | if self.mode == "train": 49 | answers, weights = self.get_answers_with_weights(ann["answer"]) 50 | return image, question, answers, weights 51 | else: # self.mode == "eval": 52 | question_id = ann["question_id"] 53 | return image, question, question_id 54 | 55 | 56 | class VideoQADataset(ImageQADataset): 57 | media_type = "video" 58 | 59 | def __init__( 60 | self, ann_file, transform, eos="[SEP]", mode="train", answer_list=None, 61 | num_frames=4, video_reader_type="decord", sample_type="rand", num_tries=1 62 | ): 63 | super(VideoQADataset, self).__init__( 64 | ann_file, transform, eos, mode, answer_list) 65 | self.num_frames = num_frames 66 | self.video_reader_type = video_reader_type 67 | self.video_reader = VIDEO_READER_FUNCS[video_reader_type] 68 | self.sample_type = sample_type 69 | self.num_tries = num_tries 70 | -------------------------------------------------------------------------------- /src/Singularity/utils/scheduler.py: -------------------------------------------------------------------------------- 1 | """ Scheduler Factory 2 | Hacked together by / Copyright 2020 Ross Wightman 3 | """ 4 | from torch.optim import Optimizer 5 | import math 6 | from torch.optim.lr_scheduler import LambdaLR 7 | 8 | 9 | def create_scheduler(args, optimizer): 10 | lr_scheduler = None 11 | if args.sched == 'cosine': 12 | lr_scheduler = get_cosine_schedule_with_warmup( 13 | optimizer, 14 | num_warmup_steps=args.num_warmup_steps, 15 | 
num_training_steps=args.num_training_steps, 16 | num_cycles=0.5, 17 | min_lr_multi=args.min_lr_multi 18 | ) 19 | return lr_scheduler 20 | 21 | 22 | def get_cosine_schedule_with_warmup( 23 | optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, 24 | num_cycles: float = 0.5, min_lr_multi: float = 0., last_epoch: int = -1 25 | ): 26 | """ 27 | Modified from https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/optimization.py 28 | 29 | Create a schedule with a learning rate that decreases following the values of the cosine function between the 30 | initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the 31 | initial lr set in the optimizer. 32 | Args: 33 | optimizer ([`~torch.optim.Optimizer`]): 34 | The optimizer for which to schedule the learning rate. 35 | num_warmup_steps (`int`): 36 | The number of steps for the warmup phase. 37 | num_training_steps (`int`): 38 | The total number of training steps. 39 | num_cycles (`float`, *optional*, defaults to 0.5): 40 | The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 41 | following a half-cosine). 42 | min_lr_multi (`float`, *optional*, defaults to 0): 43 | The minimum learning rate multiplier. Thus the minimum learning rate is base_lr * min_lr_multi. 44 | last_epoch (`int`, *optional*, defaults to -1): 45 | The index of the last epoch when resuming training. 46 | Return: 47 | `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. 48 | """ 49 | 50 | def lr_lambda(current_step): 51 | if current_step < num_warmup_steps: 52 | return max(min_lr_multi, float(current_step) / float(max(1, num_warmup_steps))) 53 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) 54 | return max(min_lr_multi, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) 55 | 56 | return LambdaLR(optimizer, lr_lambda, last_epoch) 57 | -------------------------------------------------------------------------------- /dataset_cards/actionbench_ego4d.md: -------------------------------------------------------------------------------- 1 | # Action Dynamic Benchmark (ActionBench) on Ego4d 2 | 3 | ## Instruction for Downloading Videos 4 | - Set up Ego4d CLI following [here](https://ego4d-data.org/docs/start-here/) 5 | - Download the Moment clips using the following command: 6 | ``` 7 | ego4d \ 8 | --output_directory="Ego4d" \ 9 | --datasets clips annotations \ 10 | --benchmarks "EM" \ 11 | --metadata 12 | ``` 13 | - Put the downloaded `clips/` folder into `datasets/Ego4D/video_clips/` as `datasets/Ego4D/video_clips/clips` 14 | - Run preprocessing on the video clips (at the root dir of this repo): 15 | ``` 16 | python src/preprocessing/ego4d/downsample_downsize_video_clips.py 17 | ``` 18 | - The processed video clips will be stored at `datasets/Ego4D/video_clips/clips_downsampled_5fps_downsized_224x224` 19 | 20 | 21 | ## Annotation Details 22 | 23 | ### Annotation for Action Antonym Task & Video Reversal Task 24 | - train size: 274,946 25 | - val size: 34,368 26 | - test size: 34,369 27 | 28 | - ann_path: `ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/{split}.jsonl`. The original annotation is based on a subset of [EgoClip](https://github.com/showlab/EgoVLP). 
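  Each line of these `.jsonl` files holds one JSON object (one clip per line). A minimal loading sketch, assuming the standard jsonl convention and the `val` split of the path above:
  ```
  import json

  ann_path = "ActionBench/ego4d/egoclip_subset_action_antonyms_train_val_test_split/val.jsonl"
  with open(ann_path) as f:
      annotations = [json.loads(line) for line in f]  # list of dicts with the fields shown under `format` below
  print(len(annotations))  # expected to match the split sizes listed above
  ```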
29 | - format: 30 | ``` 31 | { 32 | 'video_uid': '002d2729-df71-438d-8396-5895b349e8fd', 33 | 'video_dur': 3571.4333333333334, 34 | 'narration_source': 'narration_pass_1', 35 | 'narration_ind': 229, 36 | 'narration_time': 592.6903, 37 | 'clip_start': 592.3519665973915, 38 | 'clip_end': 593.0286286452686, 39 | 'clip_text': '#C C picks up the knife from the chopping board with her right hand.', 40 | 'action_antonym_clip_text': '#C C drops down the knife from the chopping board with her right hand.', 41 | 'tag_verb': '[17, 93]', 42 | 'tag_noun': '[321, 268, 573, 105]', 43 | 'Unnamed: 10': nan, 44 | 'clip_uid': '116ec16b-0d76-4e71-b02c-72cb37ebd5c5', 45 | 'narration_relative_time': 0.6902999999999793, 46 | 'clip_relative_start': 0.351966597391538, 47 | 'clip_relative_end': 1.0286286452685545, 48 | 'clip_fps': 30.0} 49 | ``` 50 | 51 | ### Annotation for Object Shuffle 52 | A subset from above by filtering out clips with no object in the clip text. 53 | - val size: 31974 54 | - test size: 31925 55 | - ann_path: `ActionBench/ego4d/egoclip_subset_action_antonyms_object_shuffled_train_val_test_split/{split}.jsonl` 56 | - format: additional fields: 57 | ``` 58 | { 59 | ... 60 | 'object_shuffled_clip_text':'#C C picks up the banana from the chopping board with her right hand.', 61 | } 62 | ``` 63 | 64 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/moments_in_time/side_tuning_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | pretrained: "/checkpoint_0.pth" 7 | 8 | text_perceiver_config: 9 | dim: 768 # latent query dim 10 | k_v_dim: 768 # text_width 11 | depth: 1 12 | dim_head: 64 13 | heads: 8 14 | num_latents: 16 15 | ff_mult: 2 16 | 17 | vision_perceiver_config: 18 | dim: 768 # latent query dim 19 | k_v_dim: 1024 # vision_width 20 | depth: 1 21 | dim_head: 64 22 | heads: 8 23 | num_latents: 16 24 | ff_mult: 2 25 | 26 | objectives: ["video_text_contrastive"] 27 | loss_weighting: [1.0] 28 | if_use_attn_guidance: False 29 | if_use_dual_perceiver: False 30 | if_add_temporal_emebdding: True 31 | num_frms: 8 32 | temp_emb_drop_out: 0.1 33 | if_as_knowledge_fuser: True 34 | knowledge_fuser_type: "side_tuning" 35 | train_knowledge_fuser_jointly: True 36 | 37 | datasets: 38 | downstream_tasks_moment_in_time: 39 | type: "default" 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "minimum_text" 50 | eval: 51 | name: "minimum_text" 52 | 53 | # IMPORTANT configs: 54 | task: video_action_retrieval_2k 55 | neg_sampling_same_clip: 0 56 | eval_only: True 57 | 58 | # other arguements 59 | train_k: null # sample a subset of k instances 60 | eval_k: null # sample a subset of 3000 instances, reduce evaluation time 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | run: 69 | # use custom runner 70 | runner: runner_base_patch_and_fuse 71 | 72 | # task object name 73 | task: downstream_tasks_retrieval 74 | 75 | # which module is used for inference ["backbone", "knowledge_patcher"] 76 | eval_module: 
knowledge_patcher 77 | eval_task: video_action_retrieval_2k 78 | 79 | batch_size_train: 32 80 | batch_size_eval: 4 81 | num_workers: 4 82 | 83 | seed: 42 84 | output_dir: "output/downstream_tasks/MomentsInTime/eval/side_tuning_zero-shot" 85 | 86 | amp: False 87 | resume_ckpt_path: null 88 | 89 | evaluate: True 90 | 91 | # train_splits: ["train"] 92 | valid_splits: ["val"] 93 | # test_splits: ["test"] 94 | 95 | device: "cuda" 96 | world_size: 1 97 | dist_url: "env://" 98 | distributed: True 99 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/nextqa/side_tuning.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo_mcqa 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | pretrained: "/checkpoint_best.pth" # set trained patch_and_fuse on nextqa checkpoint path 7 | 8 | text_perceiver_config: 9 | dim: 768 # latent query dim 10 | k_v_dim: 768 # text_width 11 | depth: 1 12 | dim_head: 64 13 | heads: 8 14 | num_latents: 16 15 | ff_mult: 2 16 | 17 | vision_perceiver_config: 18 | dim: 768 # latent query dim 19 | k_v_dim: 1024 # vision_width 20 | depth: 1 21 | dim_head: 64 22 | heads: 8 23 | num_latents: 16 24 | ff_mult: 2 25 | 26 | objectives: ["mcqa_loss"] 27 | loss_weighting: [1.0] 28 | if_use_attn_guidance: False 29 | if_use_dual_perceiver: False 30 | if_add_temporal_emebdding: False 31 | num_frms: 8 32 | temp_emb_drop_out: 0.0 33 | knowledge_fuser_type: "side_tuning" 34 | if_as_knowledge_fuser: True 35 | train_knowledge_fuser_jointly: True 36 | 37 | datasets: 38 | downstream_tasks_qa_nextqa_224x224_5fps: 39 | type: "default" 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "minimum_text" 50 | eval: 51 | name: "minimum_text" 52 | 53 | # IMPORTANT configs: 54 | task: 5way-multiple-choice-qa # 55 | neg_sampling_same_clip: 0 56 | eval_only: True 57 | 58 | # other arguements 59 | train_k: null # sample a subset of k instances 60 | eval_k: null # sample a subset of k instances, reduce evaluation time 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | run: 69 | # use custom runner 70 | runner: runner_base_patch_and_fuse 71 | 72 | # task object name 73 | task: downstream_tasks_multi_choice_qa 74 | 75 | # which module is used for inference ["backbone", "knowledge_patcher"] 76 | eval_module: knowledge_patcher 77 | eval_task: 5way-multiple-choice-qa 78 | 79 | batch_size_train: 32 80 | batch_size_eval: 8 81 | num_workers: 4 82 | 83 | seed: 42 84 | output_dir: "output/downstream_tasks/NextQA/eval/side_tuning" 85 | 86 | amp: False 87 | resume_ckpt_path: null 88 | 89 | evaluate: True 90 | 91 | # train_splits: ["train"] 92 | valid_splits: ["val"] 93 | test_splits: ["test"] 94 | 95 | device: "cuda" 96 | world_size: 1 97 | dist_url: "env://" 98 | distributed: True 99 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/nextqa/patch_and_fuse.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo_mcqa 3 | 
model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | pretrained: "pretrained_ckpt/PatchAndFuse/downstream_tasks/nextqa_patch_and_fuse.pth" # set trained patch_and_fuse on nextqa checkpoint path 7 | 8 | text_perceiver_config: 9 | dim: 768 # latent query dim 10 | k_v_dim: 768 # text_width 11 | depth: 1 12 | dim_head: 64 13 | heads: 8 14 | num_latents: 16 15 | ff_mult: 2 16 | 17 | vision_perceiver_config: 18 | dim: 768 # latent query dim 19 | k_v_dim: 1024 # vision_width 20 | depth: 1 21 | dim_head: 64 22 | heads: 8 23 | num_latents: 16 24 | ff_mult: 2 25 | 26 | objectives: ["mcqa_loss"] 27 | loss_weighting: [1.0] 28 | if_use_attn_guidance: False 29 | if_use_dual_perceiver: False 30 | if_add_temporal_emebdding: False 31 | num_frms: 8 32 | temp_emb_drop_out: 0.0 33 | knowledge_fuser_type: "xattn" 34 | if_as_knowledge_fuser: True 35 | train_knowledge_fuser_jointly: True 36 | 37 | datasets: 38 | downstream_tasks_qa_nextqa_224x224_5fps: 39 | type: "default" 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "minimum_text" 50 | eval: 51 | name: "minimum_text" 52 | 53 | # IMPORTANT configs: 54 | task: 5way-multiple-choice-qa # 55 | neg_sampling_same_clip: 0 56 | eval_only: True 57 | 58 | # other arguements 59 | train_k: null # sample a subset of k instances 60 | eval_k: null # sample a subset of k instances, reduce evaluation time 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | run: 69 | # use custom runner 70 | runner: runner_base_patch_and_fuse 71 | 72 | # task object name 73 | task: downstream_tasks_multi_choice_qa 74 | 75 | # which module is used for inference ["backbone", "knowledge_patcher"] 76 | eval_module: knowledge_patcher 77 | eval_task: 5way-multiple-choice-qa 78 | 79 | batch_size_train: 32 80 | batch_size_eval: 8 81 | num_workers: 4 82 | 83 | seed: 42 84 | output_dir: "output/downstream_tasks/NextQA/eval/patch_and_fuse" 85 | 86 | amp: False 87 | resume_ckpt_path: null 88 | 89 | evaluate: True 90 | 91 | # train_splits: ["train"] 92 | valid_splits: ["val"] 93 | test_splits: ["test"] 94 | 95 | device: "cuda" 96 | world_size: 1 97 | dist_url: "env://" 98 | distributed: True 99 | -------------------------------------------------------------------------------- /src/configs/projects/train/downstream_tasks/nextqa/KP-Perceiver-VTC-DVDM.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo_mcqa 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/models/InternVideo-MM-L-14.ckpt" 6 | 7 | text_perceiver_config: 8 | dim: 768 # latent query dim 9 | k_v_dim: 768 # text_width 10 | depth: 1 11 | dim_head: 64 12 | heads: 8 13 | num_latents: 16 14 | ff_mult: 2 15 | 16 | vision_perceiver_config: 17 | dim: 768 # latent query dim 18 | k_v_dim: 1024 # vision_width 19 | depth: 1 20 | dim_head: 64 21 | heads: 8 22 | num_latents: 16 23 | ff_mult: 2 24 | 25 | objectives: ["mcqa_loss", "video_action_contrastive"] 26 | loss_weighting: [1.0, 1.0] 27 | if_use_attn_guidance: False 28 | if_use_dual_perceiver: False 29 | if_add_temporal_emebdding: False 30 | num_frms: 8 31 | 
temp_emb_drop_out: 0.0 32 | 33 | datasets: 34 | downstream_tasks_qa_nextqa_224x224_5fps: 35 | type: "default" 36 | vis_processor: 37 | train: 38 | name: "video_train" 39 | image_size: 224 40 | eval: 41 | name: "internvideo_eval" 42 | image_size: 224 43 | text_processor: 44 | train: 45 | name: "minimum_text" 46 | eval: 47 | name: "minimum_text" 48 | 49 | # IMPORTANT configs: 50 | task: 5way-multiple-choice-qa 51 | neg_sampling_same_clip: 0 52 | eval_only: False 53 | 54 | # other arguements 55 | train_k: null # sample a subset of k instances 56 | eval_k: null # sample a subset of 3000 instances, reduce evaluation time 57 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 58 | num_frm: 8 59 | train_frame_height: 224 60 | train_frame_width: 224 61 | eval_frame_height: 224 62 | eval_frame_width: 224 63 | 64 | 65 | run: 66 | # use custom runner 67 | runner: runner_base_patch_and_fuse 68 | 69 | # task object name 70 | task: actionbench 71 | 72 | # which module is used for inference ["backbone", "knowledge_patcher"] 73 | eval_module: knowledge_patcher 74 | eval_task: video_text_matching 75 | 76 | # optimizer 77 | lr_sched: "linear_warmup_cosine_lr" 78 | init_lr: 1e-5 79 | min_lr: 0 80 | weight_decay: 0.05 81 | max_epoch: 1 82 | 83 | batch_size_train: 16 84 | batch_size_eval: 4 85 | 86 | num_workers: 4 87 | 88 | seed: 42 89 | output_dir: "output/downstream_tasks/NextQA/train/KP-Perceiver-VTC-DVDM_internvideo" 90 | 91 | amp: False 92 | resume_ckpt_path: null 93 | 94 | evaluate: False 95 | 96 | train_splits: ["train"] 97 | valid_splits: ["val"] 98 | # test_splits: ["test"] 99 | 100 | device: "cuda" 101 | world_size: 1 102 | dist_url: "env://" 103 | distributed: True -------------------------------------------------------------------------------- /src/ClipViP/src/configs/pretrain/pretrain_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "hdvila", 5 | "vis_format": "videoframe", 6 | "txt": "datasets/hdvila/hdvila_subtitles_92m_db", 7 | "vis": "youtube_data/ytt180m/video_clips_3fps", 8 | "vid_cap_path": "datasets/hdvila/hdvila_captions_db", 9 | "vid_txt": "subtitle", 10 | "img_dir": "", 11 | "cap_path": "", 12 | "img_source": "", 13 | "img_ratio": 0 14 | } 15 | ], 16 | "val_datasets": [ 17 | { 18 | "name": "msrvtt", 19 | "vis_format": "video", 20 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 21 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 22 | }, 23 | { 24 | "name": "how2", 25 | "vis_format": "video", 26 | "txt": "clip_data/vis_db/pretrain_data/test_howto_1k.jsonl", 27 | "vis": "youtube_data/ytt180m/video_clips_3fps" 28 | }, 29 | { 30 | "name": "ours", 31 | "vis_format": "video", 32 | "txt": "clip_data/vis_db/pretrain_data/test_full_1k.jsonl", 33 | "vis": "youtube_data/ytt180m/video_clips_3fps" 34 | } 35 | ], 36 | 37 | "train_n_clips": 1, 38 | "train_num_frms": 12, 39 | "test_n_clips": 1, 40 | "test_num_frms": 12, 41 | "sample_rate": 0, 42 | "sample_jitter": 1, 43 | "video_res": [240, 320], 44 | "input_res": [224, 224], 45 | "max_txt_len": 70, 46 | 47 | "e2e_weights_path": null, 48 | "clip_weights": "openai/clip-vit-base-patch16", 49 | "clip_config": "openai/clip-vit-base-patch16", 50 | "clip_vision_additional_config": { 51 | "type": "ViP", 52 | "temporal_size": 12, 53 | "if_use_temporal_embed": 1, 54 | "logit_scale_init_value": 4.60, 55 | "add_cls_num": 3 56 | }, 57 | 58 | "train_batch_size": 16, 59 | "test_batch_size": 16, 60 | 
"max_n_example_per_group": 1, 61 | "gradient_accumulation_steps": 1, 62 | "n_workers": 8, 63 | "pin_mem": 1, 64 | "fp16": 1, 65 | "amp_level": "O2", 66 | "seed": 42, 67 | 68 | "optim": "adamw", 69 | "betas": [0.9, 0.98], 70 | "learning_rate": 5e-6, 71 | "weight_decay": 0.05, 72 | "lr_mul": 1, 73 | "lr_mul_prefix": "", 74 | "loss_config": { 75 | "loss_name": "NCELearnableTempLoss_vsc_fc", 76 | "if_gather": 1 77 | }, 78 | "warmup_ratio": 0.01, 79 | "decay": "cosine", 80 | "grad_norm": 5.0, 81 | 82 | "num_train_epochs": 5, 83 | "min_valid_steps": 1, 84 | "num_valid": 100, 85 | "only_valid_steps": 1000, 86 | "save_steps_ratio": 0.01, 87 | "output_dir": "vidclip_data/output/pretrain/pretrain_vip_base_16/", 88 | "if_tb_log": 1, 89 | "if_model_saver": 1, 90 | "if_log2file": 1, 91 | "dummy_data": 0 92 | } 93 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/pretrain/pretrain_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "hdvila", 5 | "vis_format": "videoframe", 6 | "txt": "datasets/hdvila/hdvila_subtitles_92m_db", 7 | "vis": "youtube_data/ytt180m/video_clips_3fps", 8 | "vid_cap_path": "datasets/hdvila/hdvila_captions_db", 9 | "vid_txt": "subtitle", 10 | "img_dir": "", 11 | "cap_path": "", 12 | "img_source": "", 13 | "img_ratio": 0 14 | } 15 | ], 16 | "val_datasets": [ 17 | { 18 | "name": "msrvtt", 19 | "vis_format": "video", 20 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 21 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 22 | }, 23 | { 24 | "name": "how2", 25 | "vis_format": "video", 26 | "txt": "clip_data/vis_db/pretrain_data/test_howto_1k.jsonl", 27 | "vis": "youtube_data/ytt180m/video_clips_3fps" 28 | }, 29 | { 30 | "name": "ours", 31 | "vis_format": "video", 32 | "txt": "clip_data/vis_db/pretrain_data/test_full_1k.jsonl", 33 | "vis": "youtube_data/ytt180m/video_clips_3fps" 34 | } 35 | ], 36 | 37 | "train_n_clips": 1, 38 | "train_num_frms": 12, 39 | "test_n_clips": 1, 40 | "test_num_frms": 12, 41 | "sample_rate": 0, 42 | "sample_jitter": 1, 43 | "video_res": [240, 320], 44 | "input_res": [224, 224], 45 | "max_txt_len": 70, 46 | 47 | "e2e_weights_path": null, 48 | "clip_weights": "openai/clip-vit-base-patch32", 49 | "clip_config": "openai/clip-vit-base-patch32", 50 | "clip_vision_additional_config": { 51 | "type": "ViP", 52 | "temporal_size": 12, 53 | "if_use_temporal_embed": 1, 54 | "logit_scale_init_value": 4.60, 55 | "add_cls_num": 3 56 | }, 57 | 58 | "train_batch_size": 32, 59 | "test_batch_size": 32, 60 | "max_n_example_per_group": 1, 61 | "gradient_accumulation_steps": 1, 62 | "n_workers": 8, 63 | "pin_mem": 1, 64 | "fp16": 1, 65 | "amp_level": "O2", 66 | "seed": 42, 67 | 68 | "optim": "adamw", 69 | "betas": [0.9, 0.98], 70 | "learning_rate": 5e-6, 71 | "weight_decay": 0.05, 72 | "lr_mul": 1, 73 | "lr_mul_prefix": "", 74 | "loss_config": { 75 | "loss_name": "NCELearnableTempLoss_vsc_fc", 76 | "if_gather": 1 77 | }, 78 | "warmup_ratio": 0.01, 79 | "decay": "cosine", 80 | "grad_norm": 5.0, 81 | 82 | "num_train_epochs": 5, 83 | "min_valid_steps": 1, 84 | "num_valid": 100, 85 | "only_valid_steps": 1000, 86 | "save_steps_ratio": 0.01, 87 | "output_dir": "vidclip_data/output/pretrain/pretrain_vip_base_32/", 88 | "if_tb_log": 1, 89 | "if_model_saver": 1, 90 | "if_log2file": 1, 91 | "dummy_data": 0 92 | } 93 | -------------------------------------------------------------------------------- 
/src/configs/projects/eval/downstream_task/temporal_kinetics/side_tuning_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | pretrained: "/checkpoint_0.pth" 7 | 8 | text_perceiver_config: 9 | dim: 768 # latent query dim 10 | k_v_dim: 768 # text_width 11 | depth: 1 12 | dim_head: 64 13 | heads: 8 14 | num_latents: 16 15 | ff_mult: 2 16 | 17 | vision_perceiver_config: 18 | dim: 768 # latent query dim 19 | k_v_dim: 1024 # vision_width 20 | depth: 1 21 | dim_head: 64 22 | heads: 8 23 | num_latents: 16 24 | ff_mult: 2 25 | 26 | objectives: ["video_text_contrastive"] 27 | loss_weighting: [1.0] 28 | if_use_attn_guidance: False 29 | if_use_dual_perceiver: False 30 | if_add_temporal_emebdding: True 31 | num_frms: 8 32 | temp_emb_drop_out: 0.1 33 | if_as_knowledge_fuser: True 34 | knowledge_fuser_type: "side_tuning" 35 | train_knowledge_fuser_jointly: True 36 | 37 | datasets: 38 | downstream_tasks_temporal: 39 | type: "default" 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "minimum_text" 50 | eval: 51 | name: "minimum_text" 52 | 53 | # IMPORTANT configs: 54 | fps: 5 55 | task: v1.0_2.4k 56 | subset: kinetics 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of 3000 instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: downstream_tasks_retrieval 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: knowledge_patcher 79 | eval_task: v1.0_2.4k 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/downstream_tasks/temporal-kinetics/eval/side_tuning_zero-shot" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | 93 | # train_splits: ["train"] 94 | valid_splits: ["val"] 95 | # test_splits: ["test"] 96 | 97 | device: "cuda" 98 | world_size: 1 99 | dist_url: "env://" 100 | distributed: True 101 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/clipvip/ego4d/acdybench_ego4d_clipvip_backbone__action_antonym.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_clipvip 5 | model_type: pretrain_vip_base_32 6 | load_pretrained: True 7 | backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 8 | 9 | text_perceiver_config: 10 | dim: 512 # latent query dim 11 | k_v_dim: 512 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 512 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | 
num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "vl_dynamic_ego4d_text" 50 | eval: 51 | name: "vl_dynamic_ego4d_text" 52 | 53 | # IMPORTANT configs: 54 | fps: 5 # if downsampled, use 5 fps 55 | task: "action_antonym" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 56 | neg_sampling_same_clip: 0 # evaluation set to 0 57 | eval_only: True 58 | 59 | # other arguements 60 | k: null # sample a subset of k instances 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | run: 69 | # use custom runner 70 | runner: runner_base_patch_and_fuse 71 | 72 | # task object name 73 | task: actionbench 74 | 75 | # which module is used for inference ["backbone", "knowledge_patcher"] 76 | eval_module: backbone 77 | eval_task: action_antonym 78 | 79 | batch_size_train: 32 80 | batch_size_eval: 4 81 | num_workers: 4 82 | 83 | seed: 42 84 | output_dir: "output/actionbench/eval/ClipViP/ego4d__clipvip_backbone__action_antonym" 85 | 86 | amp: False 87 | resume_ckpt_path: null 88 | 89 | evaluate: True 90 | # train_splits: ["train"] 91 | # valid_splits: ["val"] 92 | test_splits: ["test"] 93 | 94 | device: "cuda" 95 | world_size: 1 96 | dist_url: "env://" 97 | distributed: True 98 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/clipvip/ego4d/acdybench_ego4d_clipvip_backbone__reversed_video.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_clipvip 5 | model_type: pretrain_vip_base_32 6 | load_pretrained: True 7 | backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 8 | 9 | text_perceiver_config: 10 | dim: 512 # latent query dim 11 | k_v_dim: 512 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 512 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 
| train: 49 | name: "vl_dynamic_ego4d_text" 50 | eval: 51 | name: "vl_dynamic_ego4d_text" 52 | 53 | # IMPORTANT configs: 54 | fps: 5 # if downsampled, use 5 fps 55 | task: "reversed_video" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 56 | neg_sampling_same_clip: 0 # evaluation set to 0 57 | eval_only: True 58 | 59 | # other arguements 60 | k: null # sample a subset of k instances 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | run: 69 | # use custom runner 70 | runner: runner_base_patch_and_fuse 71 | 72 | # task object name 73 | task: actionbench 74 | 75 | # which module is used for inference ["backbone", "knowledge_patcher"] 76 | eval_module: backbone 77 | eval_task: reversed_video 78 | 79 | batch_size_train: 32 80 | batch_size_eval: 4 81 | num_workers: 4 82 | 83 | seed: 42 84 | output_dir: "output/actionbench/eval/ClipViP/ego4d__clipvip_backbone__reversed_video" 85 | 86 | amp: False 87 | resume_ckpt_path: null 88 | 89 | evaluate: True 90 | # train_splits: ["train"] 91 | # valid_splits: ["val"] 92 | test_splits: ["test"] 93 | 94 | device: "cuda" 95 | world_size: 1 96 | dist_url: "env://" 97 | distributed: True 98 | -------------------------------------------------------------------------------- /src/ClipViP/src/utils/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | references: UNITER 3 | """ 4 | 5 | import logging 6 | from tensorboardX import SummaryWriter 7 | import os 8 | 9 | _LOG_FMT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' 10 | _DATE_FMT = '%m/%d/%Y %H:%M:%S' 11 | logging.basicConfig(format=_LOG_FMT, datefmt=_DATE_FMT, level=logging.INFO) 12 | LOGGER = logging.getLogger('__main__') # this is the global logger 13 | 14 | 15 | def add_log_to_file(log_path): 16 | fh = logging.FileHandler(log_path) 17 | formatter = logging.Formatter(_LOG_FMT, datefmt=_DATE_FMT) 18 | fh.setFormatter(formatter) 19 | LOGGER.addHandler(fh) 20 | 21 | 22 | class TensorboardLogger(object): 23 | def __init__(self): 24 | self._logger = None 25 | self._global_step = 0 26 | 27 | def create(self, path): 28 | if "AZUREML_TB_PATH" in os.environ: 29 | self._logger = SummaryWriter(os.environ["AZUREML_TB_PATH"]) 30 | else: 31 | self._logger = SummaryWriter(path) 32 | 33 | def noop(self, *args, **kwargs): 34 | return 35 | 36 | def step(self): 37 | self._global_step += 1 38 | 39 | @property 40 | def global_step(self): 41 | return self._global_step 42 | 43 | @global_step.setter 44 | def global_step(self, step): 45 | self._global_step = step 46 | 47 | def log_scalar_dict(self, log_dict, prefix=''): 48 | """ log a dictionary of scalar values""" 49 | if self._logger is None: 50 | return 51 | if prefix: 52 | prefix = f'{prefix}_' 53 | for name, value in log_dict.items(): 54 | if isinstance(value, dict): 55 | self.log_scalar_dict(value, self._global_step, 56 | prefix=f'{prefix}{name}') 57 | else: 58 | self._logger.add_scalar(f'{prefix}{name}', value, 59 | self._global_step) 60 | 61 | def __getattr__(self, name): 62 | if self._logger is None: 63 | return self.noop 64 | return self._logger.__getattribute__(name) 65 | 66 | 67 | TB_LOGGER = TensorboardLogger() 68 | 69 | 70 | class RunningMeter(object): 71 | """ running meteor of a scalar value 72 | (useful for monitoring training loss) 73 | """ 74 | def __init__(self, name, val=None, smooth=0.99): 75 | 
self._name = name 76 | self._sm = smooth 77 | self._val = val 78 | 79 | def __call__(self, value): 80 | self._val = (value if self._val is None 81 | else value*(1-self._sm) + self._val*self._sm) 82 | 83 | def __str__(self): 84 | return f'{self._name}: {self._val:.4f}' 85 | 86 | @property 87 | def val(self): 88 | return self._val 89 | 90 | @property 91 | def name(self): 92 | return self._name 93 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/singularity/ego4d/acdybench_ego4d_singularity_backbone__action_antonym.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_singularity 5 | model_type: singularity_temporal_17m 6 | load_pretrained: True 7 | backbone_config_yaml: "Singularity/configs/pretrained_singularity_temporal_17m.yaml" 8 | 9 | text_perceiver_config: 10 | dim: 256 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 256 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "vl_dynamic_ego4d_text" 50 | eval: 51 | name: "vl_dynamic_ego4d_text" 52 | 53 | # IMPORTANT configs: 54 | fps: 5 # if downsampled, use 5 fps 55 | task: "action_antonym" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 56 | neg_sampling_same_clip: 0 # evaluation set to 0 57 | eval_only: True 58 | 59 | # other arguements 60 | k: null # sample a subset of k instances 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | run: 69 | # use custom runner 70 | runner: runner_base_patch_and_fuse 71 | 72 | # task object name 73 | task: actionbench 74 | 75 | # which module is used for inference ["backbone", "knowledge_patcher"] 76 | eval_module: backbone 77 | eval_task: action_antonym 78 | 79 | batch_size_train: 32 80 | batch_size_eval: 4 81 | num_workers: 4 82 | 83 | seed: 42 84 | output_dir: "output/actionbench/eval/Singularity/ego4d__Singularity_backbone__action_antonym" 85 | 86 | amp: False 87 | resume_ckpt_path: null 88 | 89 | evaluate: True 90 | # train_splits: ["train"] 91 | # valid_splits: ["val"] 92 | test_splits: ["test"] 93 | 94 | device: "cuda" 95 | world_size: 1 96 | dist_url: "env://" 97 | distributed: True 98 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/singularity/ego4d/acdybench_ego4d_singularity_backbone__reversed_video.yaml: 
-------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_singularity 5 | model_type: singularity_temporal_17m 6 | load_pretrained: True 7 | backbone_config_yaml: "Singularity/configs/pretrained_singularity_temporal_17m.yaml" 8 | 9 | text_perceiver_config: 10 | dim: 256 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 256 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "vl_dynamic_ego4d_text" 50 | eval: 51 | name: "vl_dynamic_ego4d_text" 52 | 53 | # IMPORTANT configs: 54 | fps: 5 # if downsampled, use 5 fps 55 | task: "reversed_video" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 56 | neg_sampling_same_clip: 0 # evaluation set to 0 57 | eval_only: True 58 | 59 | # other arguements 60 | k: null # sample a subset of k instances 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | run: 69 | # use custom runner 70 | runner: runner_base_patch_and_fuse 71 | 72 | # task object name 73 | task: actionbench 74 | 75 | # which module is used for inference ["backbone", "knowledge_patcher"] 76 | eval_module: backbone 77 | eval_task: reversed_video 78 | 79 | batch_size_train: 32 80 | batch_size_eval: 4 81 | num_workers: 4 82 | 83 | seed: 42 84 | output_dir: "output/actionbench/eval/Singularity/ego4d__Singularity_backbone__reversed_video" 85 | 86 | amp: False 87 | resume_ckpt_path: null 88 | 89 | evaluate: True 90 | # train_splits: ["train"] 91 | # valid_splits: ["val"] 92 | test_splits: ["test"] 93 | 94 | device: "cuda" 95 | world_size: 1 96 | dist_url: "env://" 97 | distributed: True 98 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/temporal_ssv2/side_tuning.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | pretrained: "/checkpoint_best.pth" #TODO: set trained patch_and_fuse checkpoint path 8 | 9 | text_perceiver_config: 10 | dim: 768 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 768 # latent query dim 20 | k_v_dim: 1024 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | 
heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: True 32 | num_frms: 8 33 | temp_emb_drop_out: 0.1 34 | if_as_knowledge_fuser: True 35 | knowledge_fuser_type: "side_tuning" 36 | train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | downstream_tasks_temporal: 40 | type: "default" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 56 | task: v1.0_2.4k 57 | subset: ssv2 58 | neg_sampling_same_clip: 0 59 | eval_only: True 60 | 61 | # other arguements 62 | train_k: null # sample a subset of k instances 63 | eval_k: null # sample a subset of k instances, reduce evaluation time 64 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 65 | num_frm: 8 66 | train_frame_height: 224 67 | train_frame_width: 224 68 | eval_frame_height: 224 69 | eval_frame_width: 224 70 | 71 | run: 72 | # use custom runner 73 | runner: runner_base_patch_and_fuse 74 | 75 | # task object name 76 | task: downstream_tasks_retrieval 77 | 78 | # which module is used for inference ["backbone", "knowledge_patcher"] 79 | eval_module: knowledge_patcher 80 | eval_task: v1.0_2.4k 81 | 82 | batch_size_train: 32 83 | batch_size_eval: 4 84 | num_workers: 4 85 | 86 | seed: 42 87 | output_dir: "output/downstream_tasks/temporal_ssv2/side_tuning" 88 | 89 | amp: False 90 | resume_ckpt_path: null 91 | 92 | evaluate: True 93 | 94 | # train_splits: ["train"] 95 | valid_splits: ["val"] 96 | # test_splits: ["test"] 97 | 98 | device: "cuda" 99 | world_size: 1 100 | dist_url: "env://" 101 | distributed: True 102 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/clipvip/ego4d/acdybench_ego4d_clipvip_backbone__object_shuffle.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_clipvip 5 | model_type: pretrain_vip_base_32 6 | load_pretrained: True 7 | backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 8 | 9 | text_perceiver_config: 10 | dim: 512 # latent query dim 11 | k_v_dim: 512 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 512 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 40 | type: "object_shuffled" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "vl_dynamic_ego4d_text" 51 | eval: 52 
| name: "vl_dynamic_ego4d_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: "object_shuffle" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 57 | neg_sampling_same_clip: 0 # evaluation set to 0 58 | eval_only: True 59 | 60 | # other arguements 61 | k: null # sample a subset of k instances 62 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 63 | num_frm: 8 64 | train_frame_height: 224 65 | train_frame_width: 224 66 | eval_frame_height: 224 67 | eval_frame_width: 224 68 | 69 | run: 70 | # use custom runner 71 | runner: runner_base_patch_and_fuse 72 | 73 | # task object name 74 | task: actionbench 75 | 76 | # which module is used for inference ["backbone", "knowledge_patcher"] 77 | eval_module: backbone 78 | eval_task: object_shuffle 79 | 80 | batch_size_train: 32 81 | batch_size_eval: 4 82 | num_workers: 4 83 | 84 | seed: 42 85 | output_dir: "output/actionbench/eval/ClipViP/ego4d__clipvip_backbone__object_shuffle" 86 | 87 | amp: False 88 | resume_ckpt_path: null 89 | 90 | evaluate: True 91 | # train_splits: ["train"] 92 | # valid_splits: ["val"] 93 | test_splits: ["test"] 94 | 95 | device: "cuda" 96 | world_size: 1 97 | dist_url: "env://" 98 | distributed: True 99 | -------------------------------------------------------------------------------- /src/ClipViP/src/configs/pretrained/pretrain_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "hdvila", 5 | "vis_format": "videoframe", 6 | "txt": "datasets/hdvila/hdvila_subtitles_92m_db", 7 | "vis": "youtube_data/ytt180m/video_clips_3fps", 8 | "vid_cap_path": "datasets/hdvila/hdvila_captions_db", 9 | "vid_txt": "subtitle", 10 | "img_dir": "", 11 | "cap_path": "", 12 | "img_source": "", 13 | "img_ratio": 0 14 | } 15 | ], 16 | "val_datasets": [ 17 | { 18 | "name": "msrvtt", 19 | "vis_format": "video", 20 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 21 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 22 | }, 23 | { 24 | "name": "how2", 25 | "vis_format": "video", 26 | "txt": "clip_data/vis_db/pretrain_data/test_howto_1k.jsonl", 27 | "vis": "youtube_data/ytt180m/video_clips_3fps" 28 | }, 29 | { 30 | "name": "ours", 31 | "vis_format": "video", 32 | "txt": "clip_data/vis_db/pretrain_data/test_full_1k.jsonl", 33 | "vis": "youtube_data/ytt180m/video_clips_3fps" 34 | } 35 | ], 36 | 37 | "train_n_clips": 1, 38 | "train_num_frms": 12, 39 | "test_n_clips": 1, 40 | "test_num_frms": 12, 41 | "sample_rate": 0, 42 | "sample_jitter": 1, 43 | "video_res": [240, 320], 44 | "input_res": [224, 224], 45 | "max_txt_len": 70, 46 | 47 | "e2e_weights_path": "pretrained_ckpt/ClipViP/pretrain_clipvip_base_32.pt", 48 | "clip_weights": "openai/clip-vit-base-patch32", 49 | "clip_config": "openai/clip-vit-base-patch32", 50 | "clip_vision_additional_config": { 51 | "type": "ViP", 52 | "temporal_size": 12, 53 | "if_use_temporal_embed": 1, 54 | "logit_scale_init_value": 4.60, 55 | "add_cls_num": 3 56 | }, 57 | 58 | "train_batch_size": 32, 59 | "test_batch_size": 32, 60 | "max_n_example_per_group": 1, 61 | "gradient_accumulation_steps": 1, 62 | "n_workers": 8, 63 | "pin_mem": 1, 64 | "fp16": 1, 65 | "amp_level": "O2", 66 | "seed": 42, 67 | 68 | "optim": "adamw", 69 | "betas": [0.9, 0.98], 70 | "learning_rate": 5e-6, 71 | "weight_decay": 0.05, 72 | "lr_mul": 1, 73 | "lr_mul_prefix": "", 74 | "loss_config": { 75 | "loss_name": 
"NCELearnableTempLoss_vsc_fc", 76 | "if_gather": 1 77 | }, 78 | "warmup_ratio": 0.01, 79 | "decay": "cosine", 80 | "grad_norm": 5.0, 81 | 82 | "num_train_epochs": 5, 83 | "min_valid_steps": 1, 84 | "num_valid": 100, 85 | "only_valid_steps": 1000, 86 | "save_steps_ratio": 0.01, 87 | "output_dir": "vidclip_data/output/pretrain/pretrain_vip_base_32/", 88 | "if_tb_log": 1, 89 | "if_model_saver": 1, 90 | "if_log2file": 1, 91 | "dummy_data": 0 92 | } 93 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/temporal_ssv2/patch_and_fuse.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | pretrained: "pretrained_ckpt/PatchAndFuse/downstream_tasks/ssv2_template_patch_and_fuse.pth" #TODO: set trained patch_and_fuse checkpoint path 8 | 9 | text_perceiver_config: 10 | dim: 768 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 768 # latent query dim 20 | k_v_dim: 1024 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: True 32 | num_frms: 8 33 | temp_emb_drop_out: 0.1 34 | if_as_knowledge_fuser: True 35 | knowledge_fuser_type: "xattn" 36 | train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | downstream_tasks_temporal: 40 | type: "default" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 56 | task: v1.0_2.4k 57 | subset: ssv2 58 | neg_sampling_same_clip: 0 59 | eval_only: True 60 | 61 | # other arguements 62 | train_k: null # sample a subset of k instances 63 | eval_k: null # sample a subset of k instances, reduce evaluation time 64 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 65 | num_frm: 8 66 | train_frame_height: 224 67 | train_frame_width: 224 68 | eval_frame_height: 224 69 | eval_frame_width: 224 70 | 71 | run: 72 | # use custom runner 73 | runner: runner_base_patch_and_fuse 74 | 75 | # task object name 76 | task: downstream_tasks_retrieval 77 | 78 | # which module is used for inference ["backbone", "knowledge_patcher"] 79 | eval_module: knowledge_patcher 80 | eval_task: v1.0_2.4k 81 | 82 | batch_size_train: 32 83 | batch_size_eval: 4 84 | num_workers: 4 85 | 86 | seed: 42 87 | output_dir: "output/downstream_tasks/temporal_ssv2/patch_and_fuse" 88 | 89 | amp: False 90 | resume_ckpt_path: null 91 | 92 | evaluate: True 93 | 94 | # train_splits: ["train"] 95 | valid_splits: ["val"] 96 | # test_splits: ["test"] 97 | 98 | device: "cuda" 99 | world_size: 1 100 | dist_url: "env://" 101 | distributed: True 102 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/singularity/ego4d/acdybench_ego4d_singularity_backbone__object_shuffle.yaml: 
-------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_singularity 5 | model_type: singularity_temporal_17m 6 | load_pretrained: True 7 | backbone_config_yaml: "Singularity/configs/pretrained_singularity_temporal_17m.yaml" 8 | 9 | text_perceiver_config: 10 | dim: 256 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 256 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 40 | type: "object_shuffled" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "vl_dynamic_ego4d_text" 51 | eval: 52 | name: "vl_dynamic_ego4d_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: "object_shuffle" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 57 | neg_sampling_same_clip: 0 # evaluation set to 0 58 | eval_only: True 59 | 60 | # other arguements 61 | k: null # sample a subset of k instances 62 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 63 | num_frm: 8 64 | train_frame_height: 224 65 | train_frame_width: 224 66 | eval_frame_height: 224 67 | eval_frame_width: 224 68 | 69 | run: 70 | # use custom runner 71 | runner: runner_base_patch_and_fuse 72 | 73 | # task object name 74 | task: actionbench 75 | 76 | # which module is used for inference ["backbone", "knowledge_patcher"] 77 | eval_module: backbone 78 | eval_task: object_shuffle 79 | 80 | batch_size_train: 32 81 | batch_size_eval: 4 82 | num_workers: 4 83 | 84 | seed: 42 85 | output_dir: "output/actionbench/eval/Singularity/ego4d__Singularity_backbone__object_shuffle" 86 | 87 | amp: False 88 | resume_ckpt_path: null 89 | 90 | evaluate: True 91 | # train_splits: ["train"] 92 | # valid_splits: ["val"] 93 | test_splits: ["test"] 94 | 95 | device: "cuda" 96 | world_size: 1 97 | dist_url: "env://" 98 | distributed: True 99 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/ssv2_label/side_tuning.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | pretrained: "/checkpoint_best.pth" #TODO: set trained patch_and_fuse checkpoint path 8 | 9 | text_perceiver_config: 10 | dim: 768 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 768 # latent query dim 20 | k_v_dim: 1024 # vision_width 21 | depth: 
1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: True 32 | num_frms: 8 33 | temp_emb_drop_out: 0.1 34 | if_as_knowledge_fuser: True 35 | knowledge_fuser_type: "side_tuning" 36 | train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | downstream_tasks_retrieval_ssv2_224x224_5fps: 40 | type: "default" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: ssv2_label # ssv2_label, ssv2_template 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of k instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: downstream_tasks_retrieval 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: knowledge_patcher 79 | eval_task: ssv2_label 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/downstream_tasks/ssv2_label/side_tuning" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | 93 | # train_splits: ["train"] 94 | valid_splits: ["val"] 95 | # test_splits: ["test"] 96 | 97 | device: "cuda" 98 | world_size: 1 99 | dist_url: "env://" 100 | distributed: True 101 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/ssv2_label/patch_and_fuse.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | pretrained: "pretrained_ckpt/PatchAndFuse/downstream_tasks/ssv2_label_patch_and_fuse.pth" #TODO: set trained patch_and_fuse checkpoint path 8 | 9 | text_perceiver_config: 10 | dim: 768 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 768 # latent query dim 20 | k_v_dim: 1024 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: True 32 | num_frms: 8 33 | temp_emb_drop_out: 0.1 34 | if_as_knowledge_fuser: True 35 | knowledge_fuser_type: "xattn" 36 | train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | downstream_tasks_retrieval_ssv2_224x224_5fps: 40 | type: "default" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: 
"minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: ssv2_label # ssv2_label, ssv2_template 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of k instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: downstream_tasks_retrieval 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: knowledge_patcher 79 | eval_task: ssv2_label 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/downstream_tasks/ssv2_label/patch_and_fuse" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | 93 | # train_splits: ["train"] 94 | valid_splits: ["val"] 95 | # test_splits: ["test"] 96 | 97 | device: "cuda" 98 | world_size: 1 99 | dist_url: "env://" 100 | distributed: True 101 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/ssv2_template/side_tuning.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | pretrained: "/checkpoint_best.pth" #TODO: set trained patch_and_fuse checkpoint path 8 | 9 | text_perceiver_config: 10 | dim: 768 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 768 # latent query dim 20 | k_v_dim: 1024 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: True 32 | num_frms: 8 33 | temp_emb_drop_out: 0.1 34 | if_as_knowledge_fuser: True 35 | knowledge_fuser_type: "side_tuning" 36 | train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | downstream_tasks_retrieval_ssv2_224x224_5fps: 40 | type: "default" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: ssv2_template # ssv2_label, ssv2_template 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of k instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: 
downstream_tasks_retrieval 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: knowledge_patcher 79 | eval_task: ssv2_template 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/downstream_tasks/ssv2_template/side_tuning" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | 93 | # train_splits: ["train"] 94 | valid_splits: ["val"] 95 | # test_splits: ["test"] 96 | 97 | device: "cuda" 98 | world_size: 1 99 | dist_url: "env://" 100 | distributed: True 101 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/ssv2_template/patch_and_fuse.yaml: -------------------------------------------------------------------------------- 1 | 2 | model: 3 | arch: patch_and_fuse_internvideo 4 | model_type: InternVideo-MM-L-14 5 | load_pretrained: True 6 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 7 | pretrained: "pretrained_ckpt/PatchAndFuse/downstream_tasks/ssv2_template_patch_and_fuse.pth" #TODO: set trained patch_and_fuse checkpoint path 8 | 9 | text_perceiver_config: 10 | dim: 768 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 768 # latent query dim 20 | k_v_dim: 1024 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: True 32 | num_frms: 8 33 | temp_emb_drop_out: 0.1 34 | if_as_knowledge_fuser: True 35 | knowledge_fuser_type: "xattn" 36 | train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | downstream_tasks_retrieval_ssv2_224x224_5fps: 40 | type: "default" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: ssv2_template # ssv2_label, ssv2_template 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of k instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: downstream_tasks_retrieval 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: knowledge_patcher 79 | eval_task: ssv2_template 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/downstream_tasks/ssv2_template/patch_and_fuse" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | 93 | # train_splits: ["train"] 94 | valid_splits: ["val"] 95 | # test_splits: ["test"] 96 | 97 | device: "cuda" 98 | world_size: 1 99 | dist_url: "env://" 100 | distributed: True 101 | 
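The `ssv2_template` eval configs above illustrate the pattern used across the downstream tasks: `side_tuning.yaml` and `patch_and_fuse.yaml` share the same backbone, perceiver, and dataset settings and differ essentially in `knowledge_fuser_type` (`"side_tuning"` vs `"xattn"`), the `pretrained` checkpoint they load, and the `output_dir`. A minimal sketch of loading one of these YAML files to check exactly those fields is shown below; the use of PyYAML's `yaml.safe_load` and the repo-relative path are illustrative assumptions rather than the project's own config loader.

```
import yaml  # PyYAML, assumed available in the environment

# Illustrative only: load one eval config and print the fields that
# distinguish the side_tuning and patch_and_fuse variants.
CFG = "src/configs/projects/eval/downstream_task/ssv2_template/patch_and_fuse.yaml"  # assumed path

with open(CFG) as f:
    cfg = yaml.safe_load(f)

model, run = cfg["model"], cfg["run"]
print(model["knowledge_fuser_type"])          # xattn (side_tuning.yaml: side_tuning)
print(model["pretrained"])                    # trained Patch-and-Fuse checkpoint to evaluate
print(run["eval_module"], run["eval_task"])   # knowledge_patcher ssv2_template
print(run["output_dir"])                      # output/downstream_tasks/ssv2_template/patch_and_fuse
```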
-------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/clipvip/ssv2/acdybench_ssv2_clipvip_backbone__action_antonym.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_clipvip 5 | model_type: pretrain_vip_base_32 6 | load_pretrained: True 7 | backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 8 | 9 | text_perceiver_config: 10 | dim: 512 # latent query dim 11 | k_v_dim: 512 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 512 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ssv2_224x224_5fps: 40 | type: "action_antonyms_and_object_shuffled" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: "action_antonym" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of k instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: actionbench 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: backbone 79 | eval_task: action_antonym 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/actionbench/eval/ClipViP/ssv2__clipvip_backbone__action_antonym" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | # train_splits: ["train"] 93 | valid_splits: ["val"] 94 | # test_splits: ["test"] 95 | 96 | device: "cuda" 97 | world_size: 1 98 | dist_url: "env://" 99 | distributed: True 100 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/clipvip/ssv2/acdybench_ssv2_clipvip_backbone__object_shuffle.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_clipvip 5 | model_type: pretrain_vip_base_32 6 | load_pretrained: True 7 | backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 8 | 9 | text_perceiver_config: 10 | dim: 512 # latent query dim 11 | k_v_dim: 512 # 
text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 512 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ssv2_224x224_5fps: 40 | type: "action_antonyms_and_object_shuffled" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: "object_shuffle" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of k instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: actionbench 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: backbone 79 | eval_task: object_shuffle 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/actionbench/eval/ClipViP/ssv2__clipvip_backbone__object_shuffle" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | # train_splits: ["train"] 93 | valid_splits: ["val"] 94 | # test_splits: ["test"] 95 | 96 | device: "cuda" 97 | world_size: 1 98 | dist_url: "env://" 99 | distributed: True 100 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/moments_in_time/patch_and_fuse_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | pretrained: "pretrained_ckpt/PatchAndFuse/downstream_tasks/ssv2_label_patch_and_fuse.pth" 7 | 8 | text_perceiver_config: 9 | dim: 768 # latent query dim 10 | k_v_dim: 768 # text_width 11 | depth: 1 12 | dim_head: 64 13 | heads: 8 14 | num_latents: 16 15 | ff_mult: 2 16 | 17 | vision_perceiver_config: 18 | dim: 768 # latent query dim 19 | k_v_dim: 1024 # vision_width 20 | depth: 1 21 | dim_head: 64 22 | heads: 8 23 | num_latents: 16 24 | ff_mult: 2 25 | 26 | objectives: ["video_text_contrastive"] 27 | loss_weighting: [1.0] 28 | if_use_attn_guidance: False 29 | if_use_dual_perceiver: False 30 | if_add_temporal_emebdding: True 31 | num_frms: 8 32 | temp_emb_drop_out: 0.1 33 | if_as_knowledge_fuser: True 34 | knowledge_fuser_type: "xattn" 35 | train_knowledge_fuser_jointly: True 36 | 37 | datasets: 38 | 
downstream_tasks_moment_in_time: 39 | type: "default" 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "minimum_text" 50 | eval: 51 | name: "minimum_text" 52 | 53 | # IMPORTANT configs: 54 | task: video_action_retrieval_2k 55 | neg_sampling_same_clip: 0 56 | eval_only: True 57 | 58 | # other arguements 59 | train_k: null # sample a subset of k instances 60 | eval_k: null # sample a subset of 3000 instances, reduce evaluation time 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | run: 69 | # use custom runner 70 | runner: runner_base_patch_and_fuse 71 | 72 | # task object name 73 | task: downstream_tasks_retrieval 74 | 75 | # which module is used for inference ["backbone", "knowledge_patcher"] 76 | eval_module: knowledge_patcher 77 | eval_task: video_action_retrieval_2k 78 | 79 | ## NOTE: uncomment the following to use Backbone Ensemble 80 | # eval_method: ensemble_with_backbone 81 | 82 | batch_size_train: 32 83 | batch_size_eval: 4 84 | num_workers: 4 85 | 86 | seed: 42 87 | output_dir: "output/downstream_tasks/MomentsInTime/eval/patch_and_fuse_zero-shot" 88 | 89 | amp: False 90 | resume_ckpt_path: null 91 | 92 | evaluate: True 93 | 94 | # train_splits: ["train"] 95 | valid_splits: ["val"] 96 | # test_splits: ["test"] 97 | 98 | device: "cuda" 99 | world_size: 1 100 | dist_url: "env://" 101 | distributed: True 102 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/clipvip/ssv2/acdybench_ssv2_clipvip_backbone__reversed_video.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_clipvip 5 | model_type: pretrain_vip_base_32 6 | load_pretrained: True 7 | backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 8 | 9 | text_perceiver_config: 10 | dim: 512 # latent query dim 11 | k_v_dim: 512 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 512 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | 39 | datasets: 40 | actionbench_ssv2_224x224_5fps: 41 | type: "action_antonyms_and_object_shuffled" 42 | vis_processor: 43 | train: 44 | name: "video_train" 45 | image_size: 224 46 | eval: 47 | name: "internvideo_eval" 48 | image_size: 224 49 | text_processor: 50 | train: 51 | name: "minimum_text" 52 | eval: 53 | name: "minimum_text" 54 | 55 | # IMPORTANT configs: 56 | fps: 5 # if downsampled, use 5 fps 57 | task: "reversed_video" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 58 | neg_sampling_same_clip: 0 59 | eval_only: True 60 | 61 | # other arguements 62 | train_k: null # sample a subset of 
k instances 63 | eval_k: null # sample a subset of k instances, reduce evaluation time 64 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 65 | num_frm: 8 66 | train_frame_height: 224 67 | train_frame_width: 224 68 | eval_frame_height: 224 69 | eval_frame_width: 224 70 | 71 | run: 72 | # use custom runner 73 | runner: runner_base_patch_and_fuse 74 | 75 | # task object name 76 | task: actionbench 77 | 78 | # which module is used for inference ["backbone", "knowledge_patcher"] 79 | eval_module: backbone 80 | eval_task: reversed_video 81 | 82 | batch_size_train: 32 83 | batch_size_eval: 4 84 | num_workers: 4 85 | 86 | seed: 42 87 | output_dir: "output/actionbench/eval/ClipViP/ssv2__clipvip_backbone__reversed_video" 88 | 89 | amp: False 90 | resume_ckpt_path: null 91 | 92 | evaluate: True 93 | # train_splits: ["train"] 94 | valid_splits: ["val"] 95 | # test_splits: ["test"] 96 | 97 | device: "cuda" 98 | world_size: 1 99 | dist_url: "env://" 100 | distributed: True 101 | -------------------------------------------------------------------------------- /src/configs/projects/eval/downstream_task/temporal_kinetics/patch_and_fuse_zero-shot.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 6 | pretrained: "pretrained_ckpt/PatchAndFuse/downstream_tasks/ssv2_label_patch_and_fuse.pth" 7 | 8 | text_perceiver_config: 9 | dim: 768 # latent query dim 10 | k_v_dim: 768 # text_width 11 | depth: 1 12 | dim_head: 64 13 | heads: 8 14 | num_latents: 16 15 | ff_mult: 2 16 | 17 | vision_perceiver_config: 18 | dim: 768 # latent query dim 19 | k_v_dim: 1024 # vision_width 20 | depth: 1 21 | dim_head: 64 22 | heads: 8 23 | num_latents: 16 24 | ff_mult: 2 25 | 26 | objectives: ["video_text_contrastive"] 27 | loss_weighting: [1.0] 28 | if_use_attn_guidance: False 29 | if_use_dual_perceiver: False 30 | if_add_temporal_emebdding: True 31 | num_frms: 8 32 | temp_emb_drop_out: 0.1 33 | if_as_knowledge_fuser: True 34 | knowledge_fuser_type: "xattn" 35 | train_knowledge_fuser_jointly: True 36 | 37 | datasets: 38 | downstream_tasks_temporal: 39 | type: "default" 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "minimum_text" 50 | eval: 51 | name: "minimum_text" 52 | 53 | # IMPORTANT configs: 54 | fps: 5 55 | task: v1.0_2.4k 56 | subset: kinetics 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of 3000 instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: downstream_tasks_retrieval 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: knowledge_patcher 79 | eval_task: v1.0_2.4k 80 | 81 | ## NOTE: uncomment the following to use Backbone Ensemble 82 | # eval_method: ensemble_with_backbone 83 | 84 | batch_size_train: 32 85 | batch_size_eval: 4 86 | 
num_workers: 4 87 | 88 | seed: 42 89 | output_dir: "output/downstream_tasks/temporal-kinetics/eval/patch_and_fuse_zero-shot" 90 | 91 | amp: False 92 | resume_ckpt_path: null 93 | 94 | evaluate: True 95 | 96 | # train_splits: ["train"] 97 | valid_splits: ["val"] 98 | # test_splits: ["test"] 99 | 100 | device: "cuda" 101 | world_size: 1 102 | dist_url: "env://" 103 | distributed: True 104 | -------------------------------------------------------------------------------- /src/Singularity/configs/qa_anet.yaml: -------------------------------------------------------------------------------- 1 | dataset_name: anet 2 | data_root: ${oc.env:SL_DATA_DIR}/videos_images 3 | anno_root_downstream: ${oc.env:SL_DATA_DIR}/anno_downstream 4 | train_file: 5 | - ['${anno_root_downstream}/anet_qa_train.json', '${data_root}/activity_net_2fps_360', video] 6 | test_types: [val, ] # one of [minival, test] 7 | test_file: 8 | val: ['${anno_root_downstream}/anet_qa_val.json', '${data_root}/activity_net_2fps_360', video] 9 | test: ['${anno_root_downstream}/anet_qa_test.json', '${data_root}/activity_net_2fps_360', video] 10 | stop_key: val # one of the key in `test_file` 11 | answer_list: ${anno_root_downstream}/anet_qa_answer_list.json # list of answer words 12 | 13 | text_encoder: bert-base-uncased 14 | text_decoder: bert-base-uncased 15 | bert_config: configs/config_bert.json 16 | vit_type: beit # items in ${vit_zoo} 17 | vit_zoo: # from huggingface 18 | beit: microsoft/beit-base-patch16-224-pt22k-ft22k 19 | vit_name_or_pretrained_path: ${vit_zoo[${vit_type}]} 20 | temporal_vision_encoder: 21 | enable: False 22 | num_layers: 2 23 | update_pooler_embed: False 24 | add_temporal_embed: False # whether to add temporal embed to encoded frames 25 | 26 | image_res: 224 27 | embed_dim: 256 # -- not used 28 | video_input: # input -- not used 29 | num_frames: 1 30 | reader: decord # one of [decord, av] 31 | sample_type: rand 32 | num_frames_test: 4 # num_frames during inference/test 33 | sample_type_test: middle 34 | 35 | batch_size: 36 | image: 128 37 | video: 32 38 | batch_size_test: 39 | image: 64 40 | video: 64 41 | k_test: 128 42 | temp: 0.07 # -- not used 43 | eos: '[SEP]' 44 | max_q_len: 25 45 | max_a_len: 5 46 | 47 | optimizer: 48 | opt: adamW 49 | lr: 1e-5 50 | opt_betas: [0.9, 0.999] # default 51 | weight_decay: 0.02 52 | max_grad_norm: -1 # requires a positive float, use -1 to disable 53 | different_lr: # use a different lr for some modules, e.g., larger lr for new modules 54 | enable: False 55 | module_names: [] 56 | lr: 1e-3 57 | 58 | scheduler: 59 | sched: cosine 60 | epochs: 10 61 | min_lr_multi: 0.1 # min_lr will be `optimizer.lr * min_lr_multi` 62 | warmup_epochs: 0.5 # float 63 | 64 | output_dir: None # output dir 65 | pretrained_path: None # path to pretrained model weights 66 | resume: False # if True, load optimizer and scheduler states as well 67 | evaluate: False 68 | # `eval_frame_ensemble': how do we aggregate scores if `video_input.num_frames_test' > `video_input.num_frames' 69 | # `concat': concat frames before input to multi-modal encoder, i.e., early fusion 70 | # `mean', `max', `lse': mean/max/lse-pool scores after multi-modal encoder, i.e., late fusion, as in ClipBERT 71 | eval_frame_ensemble: concat # [concat, max, mean, lse] 72 | device: cuda 73 | seed: 42 74 | log_freq: 100 75 | dist_url: env:// 76 | distributed: True 77 | fp16: True 78 | debug: False 79 | num_workers: 24 80 | 81 | wandb: 82 | enable: False 83 | entity: None # username or teamname to store the runs, see 
https://docs.wandb.ai/ref/python/init 84 | project: vqa # setup in your command line 85 | 86 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/singularity/ssv2/acdybench_ssv2_singularity_backbone__action_antonym.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_singularity 5 | model_type: singularity_temporal_17m 6 | load_pretrained: True 7 | backbone_config_yaml: "Singularity/configs/pretrained_singularity_temporal_17m.yaml" 8 | 9 | text_perceiver_config: 10 | dim: 256 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 256 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ssv2_224x224_5fps: 40 | type: "action_antonyms_and_object_shuffled" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: "action_antonym" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of k instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: actionbench 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: backbone 79 | eval_task: action_antonym 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/actionbench/eval/Singularity/ssv2__Singularity_backbone__action_antonym" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | # train_splits: ["train"] 93 | valid_splits: ["val"] 94 | # test_splits: ["test"] 95 | 96 | device: "cuda" 97 | world_size: 1 98 | dist_url: "env://" 99 | distributed: True 100 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/singularity/ssv2/acdybench_ssv2_singularity_backbone__object_shuffle.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_singularity 5 | model_type: singularity_temporal_17m 6 | load_pretrained: True 7 | backbone_config_yaml: 
"Singularity/configs/pretrained_singularity_temporal_17m.yaml" 8 | 9 | text_perceiver_config: 10 | dim: 256 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 256 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 | num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | datasets: 39 | actionbench_ssv2_224x224_5fps: 40 | type: "action_antonyms_and_object_shuffled" 41 | vis_processor: 42 | train: 43 | name: "video_train" 44 | image_size: 224 45 | eval: 46 | name: "internvideo_eval" 47 | image_size: 224 48 | text_processor: 49 | train: 50 | name: "minimum_text" 51 | eval: 52 | name: "minimum_text" 53 | 54 | # IMPORTANT configs: 55 | fps: 5 # if downsampled, use 5 fps 56 | task: "object_shuffle" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 57 | neg_sampling_same_clip: 0 58 | eval_only: True 59 | 60 | # other arguements 61 | train_k: null # sample a subset of k instances 62 | eval_k: null # sample a subset of k instances, reduce evaluation time 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: actionbench 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: backbone 79 | eval_task: object_shuffle 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/actionbench/eval/Singularity/ssv2__Singularity_backbone__object_shuffle" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | # train_splits: ["train"] 93 | valid_splits: ["val"] 94 | # test_splits: ["test"] 95 | 96 | device: "cuda" 97 | world_size: 1 98 | dist_url: "env://" 99 | distributed: True 100 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/backbone/singularity/ssv2/acdybench_ssv2_singularity_backbone__reversed_video.yaml: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | model: 4 | arch: patch_and_fuse_singularity 5 | model_type: singularity_temporal_17m 6 | load_pretrained: True 7 | backbone_config_yaml: "Singularity/configs/pretrained_singularity_temporal_17m.yaml" 8 | 9 | text_perceiver_config: 10 | dim: 256 # latent query dim 11 | k_v_dim: 768 # text_width 12 | depth: 1 13 | dim_head: 64 14 | heads: 8 15 | num_latents: 16 16 | ff_mult: 2 17 | 18 | vision_perceiver_config: 19 | dim: 256 # latent query dim 20 | k_v_dim: 768 # vision_width 21 | depth: 1 22 | dim_head: 64 23 | heads: 8 24 | num_latents: 16 25 | ff_mult: 2 26 | 27 | objectives: ["video_text_contrastive"] 28 | loss_weighting: [1.0] 29 | if_use_attn_guidance: False 30 | if_use_dual_perceiver: False 31 | if_add_temporal_emebdding: False 32 
| num_frms: 8 33 | temp_emb_drop_out: 0.0 34 | # if_as_knowledge_fuser: True 35 | # knowledge_fuser_type: "xattn" 36 | # train_knowledge_fuser_jointly: True 37 | 38 | 39 | datasets: 40 | actionbench_ssv2_224x224_5fps: 41 | type: "action_antonyms_and_object_shuffled" 42 | vis_processor: 43 | train: 44 | name: "video_train" 45 | image_size: 224 46 | eval: 47 | name: "internvideo_eval" 48 | image_size: 224 49 | text_processor: 50 | train: 51 | name: "minimum_text" 52 | eval: 53 | name: "minimum_text" 54 | 55 | # IMPORTANT configs: 56 | fps: 5 # if downsampled, use 5 fps 57 | task: "reversed_video" # ["video_text_matching", "action_antonym", "object_shuffle", "reversed_video"] 58 | neg_sampling_same_clip: 0 59 | eval_only: True 60 | 61 | # other arguements 62 | train_k: null # sample a subset of k instances 63 | eval_k: null # sample a subset of k instances, reduce evaluation time 64 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 65 | num_frm: 8 66 | train_frame_height: 224 67 | train_frame_width: 224 68 | eval_frame_height: 224 69 | eval_frame_width: 224 70 | 71 | run: 72 | # use custom runner 73 | runner: runner_base_patch_and_fuse 74 | 75 | # task object name 76 | task: actionbench 77 | 78 | # which module is used for inference ["backbone", "knowledge_patcher"] 79 | eval_module: backbone 80 | eval_task: reversed_video 81 | 82 | batch_size_train: 32 83 | batch_size_eval: 4 84 | num_workers: 4 85 | 86 | seed: 42 87 | output_dir: "output/actionbench/eval/Singularity/ssv2__Singularity_backbone__reversed_video" 88 | 89 | amp: False 90 | resume_ckpt_path: null 91 | 92 | evaluate: True 93 | # train_splits: ["train"] 94 | valid_splits: ["val"] 95 | # test_splits: ["test"] 96 | 97 | device: "cuda" 98 | world_size: 1 99 | dist_url: "env://" 100 | distributed: True 101 | -------------------------------------------------------------------------------- /src/configs/projects/train/downstream_tasks/nextqa/Patch_and_Fuse.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo_mcqa 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/models/InternVideo-MM-L-14.ckpt" 6 | pretrained: "/checkpoint_best.pth" #TODO: set trained Knowledge Patcher checkpoint path 7 | 8 | text_perceiver_config: 9 | dim: 768 # latent query dim 10 | k_v_dim: 768 # text_width 11 | depth: 1 12 | dim_head: 64 13 | heads: 8 14 | num_latents: 16 15 | ff_mult: 2 16 | 17 | vision_perceiver_config: 18 | dim: 768 # latent query dim 19 | k_v_dim: 1024 # vision_width 20 | depth: 1 21 | dim_head: 64 22 | heads: 8 23 | num_latents: 16 24 | ff_mult: 2 25 | 26 | objectives: ["mcqa_loss"] 27 | loss_weighting: [1.0] 28 | if_use_attn_guidance: False 29 | if_use_dual_perceiver: False 30 | if_add_temporal_emebdding: False 31 | num_frms: 8 32 | temp_emb_drop_out: 0.0 33 | knowledge_fuser_type: "xattn" 34 | if_as_knowledge_fuser: True 35 | train_knowledge_fuser_jointly: True 36 | 37 | datasets: 38 | downstream_tasks_qa_nextqa_224x224_5fps: 39 | type: "default" 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "minimum_text" 50 | eval: 51 | name: "minimum_text" 52 | 53 | # IMPORTANT configs: 54 | task: 5way-multiple-choice-qa 55 | neg_sampling_same_clip: 0 56 | eval_only: False 57 | 58 | # other arguements 59 | train_k: null # sample a subset 
of k instances 60 | eval_k: null # sample a subset of k instances, reduce evaluation time 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | 69 | run: 70 | # use custom runner 71 | runner: runner_base_patch_and_fuse 72 | 73 | # task object name 74 | task: actionbench 75 | 76 | # which module is used for inference ["backbone", "knowledge_patcher"] 77 | eval_module: knowledge_patcher 78 | eval_task: video_text_matching 79 | 80 | # optimizer 81 | lr_sched: "linear_warmup_cosine_lr" 82 | init_lr: 1e-5 83 | min_lr: 0 84 | weight_decay: 0.05 85 | max_epoch: 4 86 | 87 | batch_size_train: 16 88 | batch_size_eval: 4 89 | 90 | num_workers: 4 91 | 92 | seed: 42 93 | output_dir: "output/downstream_tasks/NextQA/train/patch_and_fuse_internvideo" 94 | 95 | amp: False 96 | resume_ckpt_path: null 97 | 98 | evaluate: False 99 | 100 | train_splits: ["train"] 101 | valid_splits: ["val"] 102 | # test_splits: ["test"] 103 | 104 | device: "cuda" 105 | world_size: 1 106 | dist_url: "env://" 107 | distributed: True -------------------------------------------------------------------------------- /src/configs/projects/train/downstream_tasks/nextqa/Side_Tuning.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: patch_and_fuse_internvideo_mcqa 3 | model_type: InternVideo-MM-L-14 4 | load_pretrained: True 5 | backbone_pretrained: "pretrained_ckpt/InternVideo/models/InternVideo-MM-L-14.ckpt" 6 | pretrained: "/checkpoint_best.pth" #TODO: set trained Knowledge Patcher checkpoint path 7 | 8 | text_perceiver_config: 9 | dim: 768 # latent query dim 10 | k_v_dim: 768 # text_width 11 | depth: 1 12 | dim_head: 64 13 | heads: 8 14 | num_latents: 16 15 | ff_mult: 2 16 | 17 | vision_perceiver_config: 18 | dim: 768 # latent query dim 19 | k_v_dim: 1024 # vision_width 20 | depth: 1 21 | dim_head: 64 22 | heads: 8 23 | num_latents: 16 24 | ff_mult: 2 25 | 26 | objectives: ["mcqa_loss"] 27 | loss_weighting: [1.0] 28 | if_use_attn_guidance: False 29 | if_use_dual_perceiver: False 30 | if_add_temporal_emebdding: False 31 | num_frms: 8 32 | temp_emb_drop_out: 0.0 33 | knowledge_fuser_type: "side_tuning" 34 | if_as_knowledge_fuser: True 35 | train_knowledge_fuser_jointly: True 36 | 37 | datasets: 38 | downstream_tasks_qa_nextqa_224x224_5fps: 39 | type: "default" 40 | vis_processor: 41 | train: 42 | name: "video_train" 43 | image_size: 224 44 | eval: 45 | name: "internvideo_eval" 46 | image_size: 224 47 | text_processor: 48 | train: 49 | name: "minimum_text" 50 | eval: 51 | name: "minimum_text" 52 | 53 | # IMPORTANT configs: 54 | task: 5way-multiple-choice-qa 55 | neg_sampling_same_clip: 0 56 | eval_only: False 57 | 58 | # other arguements 59 | train_k: null # sample a subset of k instances 60 | eval_k: null # sample a subset of 3000 instances, reduce evaluation time 61 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 62 | num_frm: 8 63 | train_frame_height: 224 64 | train_frame_width: 224 65 | eval_frame_height: 224 66 | eval_frame_width: 224 67 | 68 | 69 | run: 70 | # use custom runner 71 | runner: runner_base_patch_and_fuse 72 | 73 | # task object name 74 | task: actionbench 75 | 76 | # which module is used for inference ["backbone", "knowledge_patcher"] 77 | eval_module: knowledge_patcher 78 | eval_task: video_text_matching 79 | 80 | # optimizer 81 | lr_sched: 
"linear_warmup_cosine_lr" 82 | init_lr: 1e-5 83 | min_lr: 0 84 | weight_decay: 0.05 85 | max_epoch: 4 86 | 87 | batch_size_train: 16 88 | batch_size_eval: 4 89 | 90 | num_workers: 4 91 | 92 | seed: 42 93 | output_dir: "output/downstream_tasks/NextQA/train/side_tuning_internvideo" 94 | 95 | amp: False 96 | resume_ckpt_path: null 97 | 98 | evaluate: False 99 | 100 | train_splits: ["train"] 101 | valid_splits: ["val"] 102 | # test_splits: ["test"] 103 | 104 | device: "cuda" 105 | world_size: 1 106 | dist_url: "env://" 107 | distributed: True -------------------------------------------------------------------------------- /src/Singularity/configs/qa_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | dataset_name: msrvtt 2 | data_root: ${oc.env:SL_DATA_DIR}/videos_images 3 | anno_root_downstream: ${oc.env:SL_DATA_DIR}/anno_downstream 4 | train_file: # each file should be formatted similar to data/downstream/vqa_train_sample.json 5 | - ['${anno_root_downstream}/msrvtt_qa_train.json', '${data_root}/msrvtt_2fps_224', video] 6 | test_types: [val, ] # one of [minival, test] 7 | test_file: 8 | val: ['${anno_root_downstream}/msrvtt_qa_val.json', '${data_root}/msrvtt_2fps_224', video] 9 | test: ['${anno_root_downstream}/msrvtt_qa_test.json', '${data_root}/msrvtt_2fps_224', video] 10 | stop_key: val # on of the key in `test_file` 11 | answer_list: ${anno_root_downstream}/msrvtt_qa_answer_list.json # list of answer words 12 | 13 | text_encoder: bert-base-uncased 14 | text_decoder: bert-base-uncased 15 | bert_config: configs/config_bert.json 16 | vit_type: beit # items in ${vit_zoo} 17 | vit_zoo: # from huggingface 18 | beit: microsoft/beit-base-patch16-224-pt22k-ft22k 19 | vit_name_or_pretrained_path: ${vit_zoo[${vit_type}]} 20 | temporal_vision_encoder: 21 | enable: False 22 | num_layers: 2 23 | update_pooler_embed: False 24 | add_temporal_embed: False # whether to add temporal embed to encoded frames 25 | 26 | image_res: 224 27 | embed_dim: 256 # -- not used 28 | video_input: # input -- not used 29 | num_frames: 1 30 | reader: decord # one of [decord, av] 31 | sample_type: rand 32 | num_frames_test: 4 # num_frames during inference/test 33 | sample_type_test: middle 34 | max_q_len: 25 35 | max_a_len: 5 36 | 37 | batch_size: 38 | image: 128 39 | video: 32 40 | batch_size_test: 41 | image: 64 42 | video: 64 43 | k_test: 128 44 | temp: 0.07 # -- not used 45 | eos: '[SEP]' 46 | 47 | optimizer: 48 | opt: adamW 49 | lr: 1e-5 50 | opt_betas: [0.9, 0.999] # default 51 | weight_decay: 0.02 52 | max_grad_norm: -1 # requires a positive float, use -1 to disable 53 | different_lr: # use a different lr for some modules, e.g., larger lr for new modules 54 | enable: False 55 | module_names: [] 56 | lr: 1e-3 57 | 58 | scheduler: 59 | sched: cosine 60 | epochs: 10 61 | min_lr_multi: 0.1 # min_lr will be `optimizer.lr * min_lr_multi` 62 | warmup_epochs: 0.5 # float 63 | 64 | output_dir: None # output dir 65 | pretrained_path: None # path to pretrained model weights 66 | resume: False # if True, load optimizer and scheduler states as well 67 | evaluate: False 68 | # `eval_frame_ensemble': how do we aggregate scores if `video_input.num_frames_test' > `video_input.num_frames' 69 | # `concat': concat frames before input to multi-modal encoder, i.e., early fusion 70 | # `mean', `max', `lse': mean/max/lse-pool scores after multi-modal encoder, i.e., late fusion, as in ClipBERT 71 | eval_frame_ensemble: concat # [concat, max, mean, lse] 72 | device: cuda 73 | seed: 42 74 | log_freq: 
100 75 | dist_url: env:// 76 | distributed: True 77 | fp16: True 78 | debug: False 79 | num_workers: 16 80 | 81 | wandb: 82 | enable: False 83 | entity: None # username or teamname to store the runs, see https://docs.wandb.ai/ref/python/init 84 | project: vqa # setup in your command line 85 | 86 | -------------------------------------------------------------------------------- /ActionBench/src/split_train_val_test_ego4d.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | from tqdm import tqdm 5 | random.seed(42) # Set the random seed to ensure reproducibility 6 | 7 | ### == some helper functions == ### 8 | class PostProcess(): 9 | # handle edge cases and artifacts from the antonym mining 10 | def __init__(self) -> None: 11 | additional_antonyms_mapping = json.load(open("additional_antonyms_mapping_ego4d.json")) 12 | self.post_process_targets = {} 13 | for value in additional_antonyms_mapping.values(): 14 | if value.endswith("s") or value.endswith("es"): 15 | self.post_process_targets[value+"es"] = value 16 | print("post processing target:value:", self.post_process_targets) 17 | 18 | def run(self, ann): 19 | action_antonym_clip_text = ann['action_antonym_clip_text'] 20 | for key,value in self.post_process_targets.items(): 21 | action_antonym_clip_text = action_antonym_clip_text.replace(key,value) 22 | ann['action_antonym_clip_text'] = action_antonym_clip_text 23 | 24 | def filterer(ann): 25 | if_filter = False 26 | filtering_verbs = ["keeps","keep","kept"] 27 | for v in filtering_verbs: 28 | if v in ann["clip_text"]: 29 | if_filter = True 30 | break 31 | return if_filter 32 | 33 | 34 | ### == set the input and output paths == ### 35 | processed_annotation_jsonl_name = "" # e.g. 
"egoclip_subset_action_antonyms" 36 | input_jsonl = f"../ego4d/{processed_annotation_jsonl_name}.jsonl" 37 | 38 | train_output = f"/{processed_annotation_jsonl_name}/train.jsonl" 39 | val_output = f"/{processed_annotation_jsonl_name}/val.jsonl" 40 | test_output = f"/{processed_annotation_jsonl_name}/test.jsonl" 41 | 42 | ### == set split ratio == ### 43 | ratios = [0.8,0.1,0.1] 44 | 45 | 46 | ### == run the script == ### 47 | post_processor = PostProcess() 48 | 49 | annotations = [] 50 | with open(input_jsonl, 'r') as f: 51 | for line in tqdm(f): 52 | loaded_ann = json.loads(line) 53 | post_processor.run(loaded_ann) 54 | if not filterer(loaded_ann): 55 | annotations.append(loaded_ann) 56 | else: 57 | print("filtered:", loaded_ann['clip_text']) 58 | print(len(annotations)) 59 | 60 | random.shuffle(annotations) 61 | 62 | sizes = [int(len(annotations)*r) for r in ratios] 63 | 64 | print(sizes) 65 | 66 | train_anns = annotations[:sizes[0]] 67 | val_anns = annotations[sizes[0]:sizes[0]+sizes[1]] 68 | test_anns = annotations[sizes[0]+sizes[1]:] 69 | 70 | print("train size:", len(train_anns)) 71 | print("val size:", len(val_anns)) 72 | print("test size:", len(test_anns)) 73 | 74 | with open(train_output, 'w') as out: 75 | for line in train_anns: 76 | out.write(json.dumps(line)) 77 | out.write("\n") 78 | 79 | with open(val_output, 'w') as out: 80 | for line in val_anns: 81 | out.write(json.dumps(line)) 82 | out.write("\n") 83 | 84 | with open(test_output, 'w') as out: 85 | for line in test_anns: 86 | out.write(json.dumps(line)) 87 | out.write("\n") -------------------------------------------------------------------------------- /src/configs/projects/train/actionbench/ssv2/KP-Transformer-VTC.yaml: -------------------------------------------------------------------------------- 1 | 2 | ## == internvideo backbone == ## 3 | model: 4 | arch: patch_and_fuse_internvideo_baseline_simple 5 | model_type: InternVideo-MM-L-14 6 | load_pretrained: True 7 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 8 | 9 | ## == ClipVip backbone == ## 10 | ## NOTE: uncomment following "model" section to use the clipvip backbone; modify the output_dir accordingly under "run:" section 11 | # model: 12 | # arch: patch_and_fuse_clipvip_baseline_simple # w/o jointly train vis encoder using action prediction 13 | # model_type: pretrain_vip_base_32 14 | # load_pretrained: True 15 | # backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 16 | 17 | 18 | ## == Singularity backbone == ## 19 | ## NOTE: uncomment following "model" section to use the singularity backbone; modify the output_dir accordingly under "run:" section 20 | # model: 21 | # arch: patch_and_fuse_singularity_baseline_simple # w/o jointly train vis encoder using action prediction 22 | # model_type: singularity_temporal_17m 23 | # load_pretrained: True 24 | # backbone_config_yaml: "Singularity/configs/pretrained_singularity_temporal_17m.yaml" 25 | 26 | 27 | datasets: 28 | actionbench_ssv2_224x224_5fps: 29 | type: "action_antonyms_and_object_shuffled" 30 | vis_processor: 31 | train: 32 | name: "video_train" 33 | image_size: 224 34 | eval: 35 | name: "internvideo_eval" 36 | image_size: 224 37 | text_processor: 38 | train: 39 | name: "minimum_text" 40 | eval: 41 | name: "minimum_text" 42 | 43 | # IMPORTANT configs: 44 | fps: 5 # if downsampled, use 5 fps 45 | task: "action_antonym" 46 | neg_sampling_same_clip: 0 47 | # eval_only: True 48 | 49 | # other arguements 50 | train_k: null # sample a subset of k instances 
51 | eval_k: null # sample a subset of k instances, reduce evaluation time 52 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 53 | num_frm: 8 54 | train_frame_height: 224 55 | train_frame_width: 224 56 | eval_frame_height: 224 57 | eval_frame_width: 224 58 | 59 | run: 60 | # use custom runner 61 | runner: runner_base_patch_and_fuse 62 | 63 | # task object name 64 | task: actionbench 65 | 66 | # which module is used for inference ["backbone", "knowledge_patcher", "knowledge_patcher_baseline"] 67 | eval_module: knowledge_patcher_baseline 68 | eval_task: action_antonym 69 | 70 | # optimizer 71 | lr_sched: "linear_warmup_cosine_lr" 72 | init_lr: 1e-5 73 | min_lr: 0 74 | weight_decay: 0.05 75 | max_epoch: 1 76 | 77 | batch_size_train: 8 78 | batch_size_eval: 4 79 | 80 | num_workers: 4 81 | 82 | seed: 42 83 | output_dir: "output/actionbench/train/SSv2__KP-Transformer-VTC__internvideo" 84 | 85 | amp: False 86 | resume_ckpt_path: null 87 | 88 | evaluate: False 89 | 90 | train_splits: ["train"] 91 | valid_splits: ["val"] 92 | # test_splits: ["test"] 93 | 94 | device: "cuda" 95 | world_size: 1 96 | dist_url: "env://" 97 | distributed: True 98 | -------------------------------------------------------------------------------- /src/configs/projects/train/actionbench/ego4d/KP-Transformer-VTC.yaml: -------------------------------------------------------------------------------- 1 | 2 | ## == internvideo backbone == ## 3 | model: 4 | arch: patch_and_fuse_internvideo_baseline_simple 5 | model_type: InternVideo-MM-L-14 6 | load_pretrained: True 7 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 8 | 9 | ## == ClipVip backbone == ## 10 | ## NOTE: uncomment following "model" section to use the clipvip backbone; modify the output_dir accordingly under "run:" section 11 | # model: 12 | # arch: patch_and_fuse_clipvip_baseline_simple # w/o jointly train vis encoder using action prediction 13 | # model_type: pretrain_vip_base_32 14 | # load_pretrained: True 15 | # backbone_config_json: "ClipViP/src/configs/pretrained/pretrain_vip_base_32.json" 16 | 17 | 18 | ## == Singularity backbone == ## 19 | ## NOTE: uncomment following "model" section to use the singularity backbone; modify the output_dir accordingly under "run:" section 20 | # model: 21 | # arch: patch_and_fuse_singularity_baseline_simple # w/o jointly train vis encoder using action prediction 22 | # model_type: singularity_temporal_17m 23 | # load_pretrained: True 24 | # backbone_config_yaml: "Singularity/configs/pretrained_singularity_temporal_17m.yaml" 25 | 26 | 27 | datasets: 28 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 29 | vis_processor: 30 | train: 31 | name: "video_train" 32 | image_size: 224 33 | eval: 34 | name: "internvideo_eval" 35 | image_size: 224 36 | text_processor: 37 | train: 38 | name: "vl_dynamic_ego4d_text" 39 | eval: 40 | name: "vl_dynamic_ego4d_text" 41 | 42 | # IMPORTANT configs: 43 | fps: 5 # if downsampled, use 5 fps 44 | task: "action_antonym" 45 | neg_sampling_same_clip: 0 46 | # eval_only: True 47 | 48 | # other arguements 49 | train_k: null # sample a subset of k instances 50 | eval_k: null # sample a subset of k instances 51 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 52 | num_frm: 8 53 | train_frame_height: 224 54 | train_frame_width: 224 55 | eval_frame_height: 224 56 | eval_frame_width: 224 57 | 58 | run: 59 | # use custom runner 60 | runner: 
runner_base_patch_and_fuse 61 | 62 | # task object name 63 | task: actionbench 64 | 65 | # which module is used for inference ["backbone", "knowledge_patcher", "knowledge_patcher_baseline"] 66 | eval_module: knowledge_patcher_baseline 67 | eval_task: action_antonym 68 | 69 | # optimizer 70 | lr_sched: "linear_warmup_cosine_lr" 71 | init_lr: 1e-5 72 | min_lr: 0 73 | weight_decay: 0.05 74 | max_epoch: 1 75 | 76 | batch_size_train: 8 77 | batch_size_eval: 4 78 | 79 | num_workers: 4 80 | 81 | seed: 42 82 | output_dir: "output/actionbench/train/Ego4d__KP-Transformer-VTC__internvideo" 83 | 84 | amp: False 85 | resume_ckpt_path: null 86 | 87 | evaluate: False 88 | 89 | train_splits: ["train"] 90 | valid_splits: ["val"] 91 | # test_splits: ["test"] 92 | 93 | device: "cuda" 94 | world_size: 1 95 | dist_url: "env://" 96 | distributed: True 97 | -------------------------------------------------------------------------------- /src/Singularity/utils/config_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | from os.path import join, dirname 5 | from omegaconf import OmegaConf, ListConfig, DictConfig 6 | from .distributed import init_distributed_mode, is_main_process 7 | from .logger import setup_logger 8 | # from utils.distributed import init_distributed_mode, is_main_process 9 | # from utils.logger import setup_logger 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def convert_types(config): 16 | """Convert `'None'` (str) --> `None` (None). Only supports top-level""" 17 | for k, v in config.items(): 18 | if isinstance(v, DictConfig): 19 | setattr(config, k, convert_types(v)) 20 | 21 | # TODO convert types in ListConfig, right now they are ignored 22 | # if isinstance(v, ListConfig): 23 | # new_v = ListConfig() 24 | 25 | if v in ["None", "none"]: 26 | setattr(config, k, None) 27 | return config 28 | 29 | 30 | def setup_config(): 31 | """Combine yaml config and command line config with OmegaConf. 32 | Also converts types, e.g., `'None'` (str) --> `None` (None) 33 | """ 34 | config_path = sys.argv[1] 35 | del sys.argv[1] # not needed 36 | cli_args = sys.argv[1:] 37 | yaml_config = OmegaConf.load(config_path) 38 | cli_config = OmegaConf.from_cli() if len(cli_args) else OmegaConf.create() 39 | # the latter overwrites the former, i.e., cli_config has higher priority. 40 | logger.info(f"Command line configs: {cli_config}") 41 | config = OmegaConf.merge(yaml_config, cli_config) 42 | config = convert_types(config) 43 | if config.debug: 44 | config.wandb.enable = False 45 | return config 46 | 47 | 48 | def setup_evaluate_config(config): 49 | """setup evaluation default settings, e.g., disable wandb""" 50 | assert config.evaluate 51 | config.wandb.enable = False 52 | if config.output_dir is None: 53 | config.output_dir = join(dirname(config.pretrained_path), "eval") 54 | return config 55 | 56 | 57 | def setup_output_dir(output_dir, excludes=["code"]): 58 | """ensure we are not overwriting an existing/non-empty output dir""" 59 | if not os.path.exists(output_dir): 60 | os.makedirs(output_dir, exist_ok=False) 61 | else: 62 | existing_dirs_files = os.listdir(output_dir) # list 63 | remaining = set(existing_dirs_files) - set(excludes) 64 | remaining = [e for e in remaining if "slurm" not in e] 65 | assert len(remaining) == 0, f"remaining dirs or files: {remaining}" 66 | 67 | 68 | def setup_main(): 69 | """ 70 | Setup config, logger, output_dir, etc. 71 | Shared for pretrain and all downstream tasks. 
72 | """ 73 | config = setup_config() 74 | if hasattr(config, "evaluate") and config.evaluate: 75 | config = setup_evaluate_config(config) 76 | init_distributed_mode(config) 77 | 78 | if is_main_process(): 79 | setup_output_dir(config.output_dir, excludes=["code"]) 80 | setup_logger(output=config.output_dir, color=True, name="loopitr") 81 | OmegaConf.save( 82 | config, open(os.path.join(config.output_dir, 'config.yaml'), 'w')) 83 | return config 84 | -------------------------------------------------------------------------------- /src/Singularity/configs/qa_vqa.yaml: -------------------------------------------------------------------------------- 1 | dataset_name: vqa 2 | data_root: ${oc.env:SL_DATA_DIR}/videos_images 3 | anno_root_downstream: ${oc.env:SL_DATA_DIR}/anno_downstream 4 | train_file: # each file should be formatted similar to data/downstream/vqa_train_sample.json 5 | - ['${anno_root_downstream}/vqa_train.json', '${data_root}/coco'] 6 | - ['${anno_root_downstream}/vqa_valminusminival.json', '${data_root}/coco'] 7 | - ['${anno_root_downstream}/vg_qa.json', '${data_root}/vg'] 8 | test_types: [minival, ] # one of [minival, test] 9 | test_file: 10 | minival: ['${anno_root_downstream}/vqa_minival.json', '${data_root}/coco'] 11 | test: ['${anno_root_downstream}/vqa_test.json', '${data_root}/coco'] 12 | stop_key: minival # on of the key in `test_file` 13 | answer_list: ${anno_root_downstream}/vqa_answer_list.json # list of answer words 14 | 15 | text_encoder: bert-base-uncased 16 | text_decoder: bert-base-uncased 17 | bert_config: configs/config_bert.json 18 | vit_type: beit # items in ${vit_zoo} 19 | vit_zoo: # from huggingface 20 | beit: microsoft/beit-base-patch16-224-pt22k-ft22k 21 | vit_name_or_pretrained_path: ${vit_zoo[${vit_type}]} 22 | temporal_vision_encoder: 23 | enable: False 24 | num_layers: 2 25 | update_pooler_embed: False 26 | add_temporal_embed: False # whether to add temporal embed to encoded frames 27 | 28 | image_res: 224 29 | embed_dim: 256 # -- not used 30 | video_input: # input -- not used 31 | num_frames: 1 32 | reader: decord # one of [decord, av] 33 | sample_type: rand 34 | num_frames_test: 1 # num_frames during inference/test 35 | sample_type_test: middle 36 | max_q_len: 25 37 | max_a_len: 6 38 | 39 | batch_size: 40 | image: 128 41 | video: 128 42 | batch_size_test: 43 | image: 64 44 | video: 64 45 | k_test: 128 46 | temp: 0.07 # -- not used 47 | eos: '[SEP]' 48 | 49 | optimizer: 50 | opt: adamW 51 | lr: 1e-5 52 | opt_betas: [0.9, 0.999] # default 53 | weight_decay: 0.02 54 | max_grad_norm: -1 # requires a positive float, use -1 to disable 55 | different_lr: # use a different lr for some modules, e.g., larger lr for new modules 56 | enable: False 57 | module_names: [] 58 | lr: 1e-3 59 | 60 | scheduler: 61 | sched: cosine 62 | epochs: 5 63 | min_lr_multi: 0.1 # min_lr will be `optimizer.lr * min_lr_multi` 64 | warmup_epochs: 0.5 # float 65 | 66 | output_dir: None # output dir 67 | pretrained_path: None # path to pretrained model weights 68 | resume: False # if True, load optimizer and scheduler states as well 69 | evaluate: False 70 | # `eval_frame_ensemble': how do we aggregate scores if `video_input.num_frames_test' > `video_input.num_frames' 71 | # `concat': concat frames before input to multi-modal encoder, i.e., early fusion 72 | # `mean', `max', `lse': mean/max/lse-pool scores after multi-modal encoder, i.e., late fusion, as in ClipBERT 73 | eval_frame_ensemble: concat # [concat, max, mean, lse] 74 | device: cuda 75 | seed: 42 76 | log_freq: 100 77 | 
dist_url: env:// 78 | distributed: True 79 | fp16: True 80 | debug: False 81 | num_workers: 16 82 | 83 | wandb: 84 | enable: False 85 | entity: None # username or teamname to store the runs, see https://docs.wandb.ai/ref/python/init 86 | project: vqa # setup in your command line 87 | 88 | -------------------------------------------------------------------------------- /src/ClipViP/src/optimization/sched.py: -------------------------------------------------------------------------------- 1 | """ 2 | optimizer learning rate scheduling helpers 3 | """ 4 | import math 5 | from math import ceil 6 | from collections import Counter 7 | 8 | 9 | def noam_schedule(step, warmup_step=4000): 10 | if step <= warmup_step: 11 | return step / warmup_step 12 | return (warmup_step ** 0.5) * (step ** -0.5) 13 | 14 | 15 | def warmup_linear(step, warmup_step, tot_step): 16 | if step < warmup_step: 17 | return step / warmup_step 18 | return max(0, (tot_step-step)/(tot_step-warmup_step)) 19 | 20 | def warmup_cosine(step, warmup_step, tot_step): 21 | if step < warmup_step: 22 | return step / warmup_step 23 | progress = (step - warmup_step) / (tot_step - warmup_step) 24 | return 0.5 * (1.0 + math.cos(math.pi * progress)) 25 | 26 | def multi_step_schedule(n_epoch, milestones, step, warmup_step,gamma=0.5): 27 | if step <= warmup_step: 28 | return step / warmup_step 29 | 30 | milestones = list(sorted(milestones)) 31 | for i, m in enumerate(milestones): 32 | if n_epoch < m: 33 | return gamma**i 34 | return gamma**(len(milestones)+1) 35 | 36 | class AutoStep(): 37 | def __init__(self, tolerance, gamma): 38 | self.tolerance = tolerance 39 | self.coeff_mem = 1 40 | self.gamma = gamma 41 | self.best_score = 0. 42 | self.count = 0 43 | 44 | def step(self, score): 45 | if score <= self.best_score: 46 | self.count += 1 47 | else: 48 | self.count = 0 49 | self.best_score = score 50 | if self.count > self.tolerance: 51 | self.count = 0 52 | self.coeff_mem = self.coeff_mem * self.gamma 53 | 54 | def get_lr(self, global_step, learning_rate, num_train_steps, warmup_ratio=0.1): 55 | warmup_steps = int(warmup_ratio * num_train_steps) 56 | if global_step <= warmup_steps: 57 | return learning_rate * global_step / warmup_steps 58 | 59 | return max(self.coeff_mem * learning_rate, 1e-8) 60 | 61 | 62 | def get_lr_sched(global_step, decay, learning_rate, 63 | num_train_steps, warmup_ratio=0.1, 64 | decay_epochs=[], multi_step_epoch=-1): 65 | warmup_steps = int(warmup_ratio*num_train_steps) 66 | if decay == 'linear': 67 | lr_this_step = learning_rate * warmup_linear( 68 | global_step, warmup_steps, num_train_steps) 69 | elif decay == 'cosine': 70 | lr_this_step = learning_rate * warmup_cosine( 71 | global_step, warmup_steps, num_train_steps) 72 | elif decay == 'invsqrt': 73 | lr_this_step = learning_rate * noam_schedule( 74 | global_step, warmup_steps) 75 | elif decay == 'constant': 76 | lr_this_step = learning_rate 77 | elif decay == "multi_step": 78 | assert multi_step_epoch >= 0 79 | lr_this_step = learning_rate * multi_step_schedule( 80 | multi_step_epoch, decay_epochs, global_step, warmup_steps) 81 | if lr_this_step <= 0: 82 | # save guard for possible miscalculation of train steps 83 | lr_this_step = 1e-8 84 | return lr_this_step 85 | -------------------------------------------------------------------------------- /src/configs/projects/eval/actionbench/knowledge_patcher/acdybench_ego4d_internvideo_KP-Perceiver-VTC-DVDM__action_antonym.yaml: -------------------------------------------------------------------------------- 1 | # modified 
from https://github.com/salesforce/LAVIS/tree/main/lavis/configs 2 | 3 | ## == internvideo backbone == ## 4 | model: 5 | arch: patch_and_fuse_internvideo 6 | model_type: InternVideo-MM-L-14 7 | load_pretrained: True 8 | backbone_pretrained: "pretrained_ckpt/InternVideo/InternVideo-MM-L-14.ckpt" 9 | pretrained: "pretrained_ckpt/PatchAndFuse/ActionBench/actionbench_ego4d_patch_and_fuse.pth" 10 | 11 | text_perceiver_config: 12 | dim: 768 # latent query dim 13 | k_v_dim: 768 # text_width 14 | depth: 1 15 | dim_head: 64 16 | heads: 8 17 | num_latents: 16 18 | ff_mult: 2 19 | 20 | vision_perceiver_config: 21 | dim: 768 # latent query dim 22 | k_v_dim: 1024 # vision_width 23 | depth: 1 24 | dim_head: 64 25 | heads: 8 26 | num_latents: 16 27 | ff_mult: 2 28 | 29 | objectives: ["video_text_contrastive","video_action_contrastive","action_temporal_matching"] 30 | loss_weighting: [1.0,1.0,0.4] 31 | if_use_attn_guidance: False 32 | if_use_dual_perceiver: False 33 | if_add_temporal_emebdding: True 34 | num_frms: 8 35 | temp_emb_drop_out: 0.1 36 | 37 | # do ATM only on state change salient videos 38 | state_change_filtering_for_FDM: True 39 | 40 | datasets: 41 | actionbench_ego4d_224x224_5fps: # using subset of egoclip for training and egomcq for validation 42 | vis_processor: 43 | train: 44 | name: "video_train" 45 | image_size: 224 46 | eval: 47 | name: "internvideo_eval" 48 | image_size: 224 49 | text_processor: 50 | train: 51 | name: "vl_dynamic_ego4d_text" 52 | eval: 53 | name: "vl_dynamic_ego4d_text" 54 | 55 | # IMPORTANT configs: 56 | fps: 5 # if downsampled, use 5 fps 57 | task: "action_antonym" # evaluation task: ["video_text_matching", "action_antonym", "reversed_video"] 58 | neg_sampling_same_clip: 0 # evaluation set to 0 59 | eval_only: True 60 | 61 | # other arguements 62 | k: null # sample a subset of k instances 63 | frm_sampling_strategy: "uniform" # take the first and last frame as start and end state 64 | num_frm: 8 65 | train_frame_height: 224 66 | train_frame_width: 224 67 | eval_frame_height: 224 68 | eval_frame_width: 224 69 | 70 | run: 71 | # use custom runner 72 | runner: runner_base_patch_and_fuse 73 | 74 | # task object name 75 | task: actionbench 76 | 77 | # which module is used for inference ["backbone", "knowledge_patcher"] 78 | eval_module: backbone 79 | eval_task: action_antonym 80 | 81 | batch_size_train: 32 82 | batch_size_eval: 4 83 | num_workers: 4 84 | 85 | seed: 42 86 | output_dir: "output/actionbench/eval/knowledge_patcher/ego4d__internvideo_KP-Perceiver-VTC-DVDM__action_antonym" 87 | 88 | amp: False 89 | resume_ckpt_path: null 90 | 91 | evaluate: True 92 | # train_splits: ["train"] 93 | # valid_splits: ["val"] 94 | test_splits: ["test"] 95 | 96 | device: "cuda" 97 | world_size: 1 98 | dist_url: "env://" 99 | distributed: True 100 | --------------------------------------------------------------------------------
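The text_perceiver_config / vision_perceiver_config blocks that recur throughout the configs above (dim, k_v_dim, depth, dim_head, heads, num_latents, ff_mult) describe a Perceiver-style resampler: a small set of learned latent queries cross-attends to the frozen backbone's text or vision token features and pools them into num_latents vectors of size dim. The sketch below is only a minimal illustration of how those fields could map onto such a module in PyTorch; it is not the repository's actual Knowledge Patcher implementation, and every class and variable name in it is hypothetical.

# Minimal, illustrative Perceiver-style resampler (NOT the repository's implementation).
# Shows how the recurring *_perceiver_config fields could map onto a latent cross-attention module.
import torch
import torch.nn as nn


class LatentCrossAttention(nn.Module):
    """Latent queries (width `dim`) attend to backbone features (width `k_v_dim`)."""

    def __init__(self, dim, k_v_dim, dim_head=64, heads=8):
        super().__init__()
        inner = dim_head * heads
        self.heads = heads
        self.scale = dim_head ** -0.5
        self.to_q = nn.Linear(dim, inner, bias=False)           # queries from latents
        self.to_kv = nn.Linear(k_v_dim, inner * 2, bias=False)  # keys/values from backbone tokens
        self.to_out = nn.Linear(inner, dim, bias=False)

    def forward(self, latents, context):
        b, n, _ = latents.shape
        q = self.to_q(latents)
        k, v = self.to_kv(context).chunk(2, dim=-1)

        def split_heads(t):  # (b, seq, heads * dim_head) -> (b, heads, seq, dim_head)
            return t.view(b, -1, self.heads, t.shape[-1] // self.heads).transpose(1, 2)

        q, k, v = map(split_heads, (q, k, v))
        attn = (q @ k.transpose(-2, -1) * self.scale).softmax(dim=-1)
        out = (attn @ v).transpose(1, 2).reshape(b, n, -1)
        return self.to_out(out)


class PerceiverResampler(nn.Module):
    """`num_latents` learned queries of size `dim`; `depth` cross-attention + feed-forward blocks."""

    def __init__(self, dim, k_v_dim, depth=1, dim_head=64, heads=8,
                 num_latents=16, ff_mult=2):
        super().__init__()
        self.latents = nn.Parameter(torch.randn(num_latents, dim))
        self.layers = nn.ModuleList([
            nn.ModuleList([
                LatentCrossAttention(dim, k_v_dim, dim_head, heads),
                nn.Sequential(  # feed-forward with hidden width dim * ff_mult
                    nn.LayerNorm(dim),
                    nn.Linear(dim, dim * ff_mult),
                    nn.GELU(),
                    nn.Linear(dim * ff_mult, dim),
                ),
            ])
            for _ in range(depth)
        ])

    def forward(self, context):
        # context: (batch, num_tokens, k_v_dim) features from the frozen backbone
        x = self.latents.unsqueeze(0).expand(context.shape[0], -1, -1)
        for attn, ff in self.layers:
            x = x + attn(x, context)
            x = x + ff(x)
        return x  # (batch, num_latents, dim)


if __name__ == "__main__":
    # Hypothetical usage with the vision_perceiver_config values from the ClipViP configs above:
    # dim=512, k_v_dim=768, depth=1, dim_head=64, heads=8, num_latents=16, ff_mult=2
    resampler = PerceiverResampler(dim=512, k_v_dim=768)
    frame_tokens = torch.randn(2, 197, 768)  # dummy per-frame patch features
    print(resampler(frame_tokens).shape)     # -> torch.Size([2, 16, 512])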