├── LF-VILA ├── src │ ├── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ ├── video_classification_dataset.py │ │ └── actnet_qa_dataset.py │ ├── utils │ │ ├── __init__.py │ │ ├── metrics.py │ │ ├── misc.py │ │ ├── data.py │ │ ├── logger.py │ │ └── dist.py │ ├── optimization │ │ ├── __init__.py │ │ ├── optimizer.py │ │ └── lr_scheduler.py │ ├── models │ │ ├── __init__.py │ │ └── lfvila_video_classification.py │ ├── tools │ │ └── __init__.py │ ├── configs │ │ ├── bert_config.json │ │ ├── bert_large_config.json │ │ ├── queryd_ret.yaml │ │ ├── violin_qa.yaml │ │ ├── actnet_qa.yaml │ │ ├── didemo_ret.yaml │ │ ├── cmovie_ret.yaml │ │ ├── actnet_ret.yaml │ │ ├── coin_cls.yaml │ │ ├── lvu_scene_cls.yaml │ │ ├── lvu_relationship_cls.yaml │ │ ├── how2_qa.yaml │ │ ├── lvu_wayspeaking_cls.yaml │ │ ├── pretrain_stage1.yaml │ │ └── pretrain_stage2.yaml │ └── tasks │ │ ├── run_video_classification.py │ │ └── run_retrieval.py ├── figs │ ├── framework.png │ └── data_example.png ├── docker │ └── requirements.txt ├── setup.sh ├── scripts │ └── download_data.sh ├── launch_container.sh └── .gitignore ├── hd-vila ├── src │ ├── __init__.py │ ├── datasets │ │ └── __init__.py │ ├── modeling │ │ └── __init__.py │ ├── utils │ │ ├── __init__.py │ │ ├── metrics.py │ │ ├── misc.py │ │ ├── logger.py │ │ └── stop_words.py │ ├── optimization │ │ ├── __init__.py │ │ └── sched.py │ └── configs │ │ ├── base_model.json │ │ ├── base_model_large.json │ │ ├── pretrain_stage2.json │ │ ├── msrvtt_qa.json │ │ ├── pretrain_stage1.json │ │ ├── tgif_frame_qa.json │ │ ├── tgif_action_qa.json │ │ ├── tgif_transition_qa.json │ │ ├── lsmdc_retrieval.json │ │ ├── didemo_retrieval.json │ │ ├── actnet_retrieval.json │ │ └── msrvtt_retrieval.json ├── figs │ └── framework.png ├── setup.sh ├── scripts │ ├── process_raw_video │ │ ├── gif2mp4.py │ │ ├── decode_frames.py │ │ └── compress_video.py │ └── download_data.sh └── launch_container.sh ├── CLIP-ViP ├── src │ ├── __init__.py │ ├── datasets │ │ └── __init__.py │ ├── modeling │ │ ├── __init__.py │ │ └── VidCLIP.py │ ├── utils │ │ ├── __init__.py │ │ ├── misc.py │ │ ├── metrics.py │ │ ├── logger.py │ │ └── stop_words.py │ ├── optimization │ │ ├── __init__.py │ │ ├── sched.py │ │ └── adamw.py │ └── configs │ │ ├── lsmdc_retrieval │ │ ├── lsmdc_retrieval_vip_base_16.json │ │ └── lsmdc_retrieval_vip_base_32.json │ │ ├── didemo_retrieval │ │ ├── didemo_retrieval_vip_base_32.json │ │ └── didemo_retrieval_vip_base_16.json │ │ ├── msrvtt_retrieval │ │ ├── msrvtt_retrieval_vip_base_16.json │ │ └── msrvtt_retrieval_vip_base_32.json │ │ ├── actnet_retrieval │ │ ├── actnet_retrieval_vip_base_16.json │ │ └── actnet_retrieval_vip_base_32.json │ │ └── pretrain │ │ ├── pretrain_vip_base_16.json │ │ └── pretrain_vip_base_32.json ├── setup.sh ├── launch_container.sh └── LICENSE ├── hd-vila-100m ├── figs │ ├── statics.png │ └── examples.png ├── README.md ├── LICENSE └── src │ └── cut_videos.py ├── visualparsing ├── visualparsing.png └── README.md ├── CODE_OF_CONDUCT.md ├── SUPPORT.md ├── RAI.md ├── SECURITY.md └── README.md /LF-VILA/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hd-vila/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CLIP-ViP/src/__init__.py: -------------------------------------------------------------------------------- 1 
| -------------------------------------------------------------------------------- /CLIP-ViP/src/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CLIP-ViP/src/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CLIP-ViP/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LF-VILA/src/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LF-VILA/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hd-vila/src/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hd-vila/src/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hd-vila/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CLIP-ViP/src/optimization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LF-VILA/src/optimization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hd-vila/src/optimization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LF-VILA/figs/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/XPretrain/HEAD/LF-VILA/figs/framework.png -------------------------------------------------------------------------------- /hd-vila/figs/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/XPretrain/HEAD/hd-vila/figs/framework.png -------------------------------------------------------------------------------- /LF-VILA/figs/data_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/XPretrain/HEAD/LF-VILA/figs/data_example.png -------------------------------------------------------------------------------- /hd-vila-100m/figs/statics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/XPretrain/HEAD/hd-vila-100m/figs/statics.png -------------------------------------------------------------------------------- /hd-vila-100m/figs/examples.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/XPretrain/HEAD/hd-vila-100m/figs/examples.png -------------------------------------------------------------------------------- /visualparsing/visualparsing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/XPretrain/HEAD/visualparsing/visualparsing.png -------------------------------------------------------------------------------- /CLIP-ViP/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # source setup.sh 4 | export DIR_PWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 5 | export PYTHONPATH="$PYTHONPATH:$DIR_PWD" 6 | 7 | echo $PYTHONPATH 8 | -------------------------------------------------------------------------------- /hd-vila/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # source setup.sh 4 | export DIR_PWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 5 | export PYTHONPATH="$PYTHONPATH:$DIR_PWD" 6 | 7 | echo $PYTHONPATH 8 | -------------------------------------------------------------------------------- /LF-VILA/docker/requirements.txt: -------------------------------------------------------------------------------- 1 | jsonlines 2 | deepspeed==0.5.8 3 | transformers==4.30.0 4 | timm==0.4.12 5 | einops==0.3.2 6 | jsonlines==3.0.0 7 | tensorboardX==2.4.1 8 | decord==0.6.0 9 | easydict==1.9 10 | ruamel_yaml -------------------------------------------------------------------------------- /LF-VILA/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # source setup.sh 4 | export DIR_PWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 5 | export PYTHONPATH="$PYTHONPATH:$DIR_PWD" 6 | 7 | pip install lmdb 8 | 9 | echo $PYTHONPATH 10 | -------------------------------------------------------------------------------- /LF-VILA/src/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .lfvila_pretrain import LFVILA_Pretrain 2 | from .lfvila_qa_multichoice import LFVILA_QA_Multichoice 3 | from .lfvila_qa_classification import LFVILA_QA_Classification 4 | from .lfvila_retrieval import LFVILA_Retrieval 5 | from .lfvila_video_classification import LFVILA_Video_Classification -------------------------------------------------------------------------------- /LF-VILA/src/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .trainer_pretrain import Trainer_Pretrain 2 | from .trainer_qa_multichoice import Trainer_QA_Multichoice 3 | from .trainer_qa_classification import Trainer_QA_Classification 4 | from .trainer_retrieval import Trainer_Retrieval 5 | from .trainer_video_classification import Trainer_Video_Classification -------------------------------------------------------------------------------- /LF-VILA/scripts/download_data.sh: -------------------------------------------------------------------------------- 1 | # Download Models and Data: 2 | DOWNLOAD=$1 3 | 4 | BLOB='https://hdvila.blob.core.windows.net/dataset/lfvila_release.zip?sp=r&st=2023-03-16T05:01:27Z&se=2027-03-01T13:01:27Z&spr=https&sv=2021-12-02&sr=b&sig=lxR7bZ4i3Jpm4Z93u%2BgqhGvfF6DZ4hyRgPFwhwO9i78%3D' 5 | 6 | wget -nc $BLOB -O $DOWNLOAD/lfvila_release.zip 7 | unzip $DOWNLOAD/lfvila_release.zip -d $DOWNLOAD 
-------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /hd-vila/scripts/process_raw_video/gif2mp4.py: -------------------------------------------------------------------------------- 1 | import moviepy.editor as mp 2 | import os 3 | import glob 4 | 5 | def gif2mp4(gif_dir, mp4_dir): 6 | gifs = glob.glob(os.path.join(gif_dir, "*.gif")) 7 | for gif in gifs: 8 | clip = mp.VideoFileClip(gif) 9 | target_path = os.path.join(mp4_dir, os.path.basename(gif).replace(".gif", ".mp4")) 10 | clip.write_videofile(target_path) 11 | 12 | 13 | if __name__ == "__main__": 14 | gif2mp4("path/to/gifs", "path/to/mp4") -------------------------------------------------------------------------------- /CLIP-ViP/launch_container.sh: -------------------------------------------------------------------------------- 1 | DATA_DIR=$1 2 | 3 | if [ -z $CUDA_VISIBLE_DEVICES ]; then 4 | CUDA_VISIBLE_DEVICES='all' 5 | fi 6 | 7 | docker run --gpus '"'device=$CUDA_VISIBLE_DEVICES'"' --ipc=host --rm -it \ 8 | --mount src=$(pwd),dst=/VidCLIP,type=bind \ 9 | --mount src=$DATA_DIR,dst=/blob_mount,type=bind \ 10 | -e NVIDIA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \ 11 | -w /VidCLIP tiankaihang/azureml_docker:horovod \ 12 | bash -c "source /VidCLIP/setup.sh && export OMPI_MCA_btl_vader_single_copy_mechanism=none && bash" 13 | 14 | -------------------------------------------------------------------------------- /LF-VILA/launch_container.sh: -------------------------------------------------------------------------------- 1 | DATA_DIR=$1 2 | NAME=$2 3 | 4 | 5 | if [ -z $CUDA_VISIBLE_DEVICES ]; then 6 | CUDA_VISIBLE_DEVICES='all' 7 | fi 8 | 9 | docker run --gpus device=$CUDA_VISIBLE_DEVICES --ipc=host --rm -it \ 10 | --name $NAME \ 11 | --shm-size=128g \ 12 | --mount src=$(pwd),dst=/LF-VILA,type=bind \ 13 | --mount src=$DATA_DIR,dst=/blob_mount,type=bind \ 14 | -e NVIDIA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \ 15 | -w /LF-VILA ycsun1972/azureml_docker:horovod_deepspeed_v2 \ 16 | bash -c "source /LF-VILA/setup.sh && bash" 17 | -------------------------------------------------------------------------------- /hd-vila/launch_container.sh: -------------------------------------------------------------------------------- 1 | DATA_DIR=$1 2 | 3 | if [ -z $CUDA_VISIBLE_DEVICES ]; then 4 | CUDA_VISIBLE_DEVICES='all' 5 | fi 6 | 7 | docker run --gpus '"'device=$CUDA_VISIBLE_DEVICES'"' --ipc=host --rm -it \ 8 | --mount src=$(pwd),dst=/HD-VILA,type=bind \ 9 | --mount src=$DATA_DIR,dst=/data_mount,type=bind \ 10 | -e NVIDIA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \ 11 | -w /HD-VILA tiankaihang/azureml_docker:horovod \ 12 | bash -c "source /HD-VILA/setup.sh && export OMPI_MCA_btl_vader_single_copy_mechanism=none && bash" 13 | 14 | -------------------------------------------------------------------------------- /LF-VILA/src/utils/metrics.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def compute_rt_metrics(x): 5 | sx = np.sort(-x, axis=1) 6 | d = np.diag(-x) 7 | d = d[:, np.newaxis] 8 | ind = sx - d 9 | ind = np.where(ind == 0) 10 | ind = ind[1] 11 | r1 = float(np.sum(ind == 0)) / len(ind) 12 | r5 = float(np.sum(ind < 5)) / len(ind) 13 | r10 = float(np.sum(ind < 10)) / len(ind) 14 | r50 = float(np.sum(ind < 50)) / len(ind) 15 | medr = np.median(ind) + 1 16 | meanr = np.mean(ind) + 1 17 | return r1, r5, r10, r50, medr, meanr 18 | 19 | 20 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "gradient_checkpointing": false, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "hidden_size": 768, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 3072, 12 | "layer_norm_eps": 1e-12, 13 | "max_position_embeddings": 512, 14 | "model_type": "bert", 15 | "num_attention_heads": 12, 16 | "num_hidden_layers": 12, 17 | "pad_token_id": 0, 18 | "position_embedding_type": "absolute", 19 | "transformers_version": "4.6.0.dev0", 20 | "type_vocab_size": 2, 21 | "use_cache": true, 22 | "vocab_size": 30522 23 | } 24 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/bert_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "gradient_checkpointing": false, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "hidden_size": 1024, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 4096, 12 | "layer_norm_eps": 1e-12, 13 | "max_position_embeddings": 512, 14 | "model_type": "bert", 15 | "num_attention_heads": 16, 16 | "num_hidden_layers": 24, 17 | "pad_token_id": 0, 18 | "position_embedding_type": "absolute", 19 | "transformers_version": "4.6.0.dev0", 20 | "type_vocab_size": 2, 21 | "use_cache": true, 22 | "vocab_size": 30522 23 | } 24 | -------------------------------------------------------------------------------- /hd-vila/src/configs/base_model.json: -------------------------------------------------------------------------------- 1 | { 2 | "max_temporal_position_embeddings": 100, 3 | "backbone_channel_in_size": 2048, 4 | "max_grid_row_position_embeddings": 100, 5 | "max_grid_col_position_embeddings": 100, 6 | "attention_probs_dropout_prob": 0.1, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "hidden_size": 768, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 3072, 12 | "layer_norm_eps": 1e-12, 13 | "max_position_embeddings": 512, 14 | "model_type": "bert", 15 | "num_attention_heads": 12, 16 | "num_hidden_layers": 12, 17 | "pad_token_id": 0, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522 20 | } 21 | -------------------------------------------------------------------------------- /hd-vila/src/configs/base_model_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "max_temporal_position_embeddings": 100, 3 | "backbone_channel_in_size": 2048, 4 | "max_grid_row_position_embeddings": 100, 5 | "max_grid_col_position_embeddings": 100, 6 | "attention_probs_dropout_prob": 0.1, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 
0.1, 9 | "hidden_size": 1024, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 4096, 12 | "layer_norm_eps": 1e-12, 13 | "max_position_embeddings": 512, 14 | "model_type": "bert", 15 | "num_attention_heads": 16, 16 | "num_hidden_layers": 24, 17 | "pad_token_id": 0, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522 20 | 21 | } 22 | -------------------------------------------------------------------------------- /CLIP-ViP/src/utils/misc.py: -------------------------------------------------------------------------------- 1 | """ 2 | modified from UNITER 3 | """ 4 | import json 5 | import random 6 | import sys 7 | 8 | import torch 9 | import numpy as np 10 | 11 | 12 | class NoOp(object): 13 | """ useful for distributed training No-Ops """ 14 | def __getattr__(self, name): 15 | return self.noop 16 | 17 | def noop(self, *args, **kwargs): 18 | return 19 | 20 | 21 | def set_random_seed(seed): 22 | random.seed(seed) 23 | np.random.seed(seed) 24 | torch.manual_seed(seed) 25 | torch.cuda.manual_seed_all(seed) 26 | 27 | 28 | def zero_none_grad(model): 29 | for p in model.parameters(): 30 | if p.grad is None and p.requires_grad: 31 | p.grad = p.data.new(p.size()).zero_() 32 | -------------------------------------------------------------------------------- /hd-vila/scripts/download_data.sh: -------------------------------------------------------------------------------- 1 | # Download Models: 2 | # 1, pretrained model 3 | DOWNLOAD=$1 4 | 5 | BLOB='https://hdvila.blob.core.windows.net/dataset/pretrained.zip?sp=r&st=2022-09-13T08:25:54Z&se=2024-12-31T16:25:54Z&spr=https&sv=2021-06-08&sr=b&sig=Zt8vmQ%2F5wU35507Dar4i4Qsk3dqf15aEBQOS4QqUUrc%3D' 6 | 7 | # 1, pretrained model 8 | wget -nc $BLOB -O $DOWNLOAD/pretrained.zip 9 | unzip $DOWNLOAD/pretrained.zip -d $DOWNLOAD 10 | 11 | BLOB='https://hdvila.blob.core.windows.net/dataset/data.zip?sp=r&st=2022-09-13T02:35:13Z&se=2024-12-31T10:35:13Z&spr=https&sv=2021-06-08&sr=b&sig=BjQXSegSvllCLpx%2B%2FhDH6VwVDE0e2XHZ%2FqwAo5ZpyeQ%3D' 12 | 13 | # 2, downstream dataset 14 | wget -nc $BLOB -O $DOWNLOAD/data.zip 15 | unzip $DOWNLOAD/data.zip -d $DOWNLOAD 16 | mv $DOWNLOAD/downstream_data $DOWNLOAD/data 17 | -------------------------------------------------------------------------------- /hd-vila/src/utils/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def cal_cossim(feats1, feats2): 4 | sim_matrix = np.dot(feats1, feats2.T) 5 | return sim_matrix 6 | 7 | 8 | def compute_metrics(x): 9 | sx = np.sort(-x, axis=1) 10 | d = np.diag(-x) 11 | d = d[:, np.newaxis] 12 | ind = sx - d 13 | ind = np.where(ind == 0) 14 | ind = ind[1] 15 | r1 = float(np.sum(ind == 0)) / len(ind) 16 | r5 = float(np.sum(ind < 5)) / len(ind) 17 | r10 = float(np.sum(ind < 10)) / len(ind) 18 | r50 = float(np.sum(ind < 50)) / len(ind) 19 | medr = np.median(ind) + 1 20 | meanr = np.mean(ind) + 1 21 | return r1, r5, r10, r50, medr, meanr 22 | 23 | 24 | if __name__ == '__main__': 25 | 26 | sim_matrix = np.random.random((5,5)) 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /hd-vila/src/utils/misc.py: -------------------------------------------------------------------------------- 1 | """ 2 | modified from UNITER 3 | """ 4 | import json 5 | import random 6 | import sys 7 | 8 | import torch 9 | import numpy as np 10 | 11 | 12 | class NoOp(object): 13 | """ useful for distributed training No-Ops """ 14 | def __getattr__(self, name): 15 | return self.noop 16 | 17 | def noop(self, 
*args, **kwargs): 18 | return 19 | 20 | 21 | def set_random_seed(seed): 22 | random.seed(seed) 23 | np.random.seed(seed) 24 | torch.manual_seed(seed) 25 | torch.cuda.manual_seed_all(seed) 26 | 27 | 28 | def zero_none_grad(model): 29 | HAS_NAN = False 30 | for p in model.parameters(): 31 | if p.grad is None and p.requires_grad: 32 | HAS_NAN = True 33 | p.grad = p.data.new(p.size()).zero_() 34 | return HAS_NAN -------------------------------------------------------------------------------- /LF-VILA/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | 3 | # script 4 | tmp_all/script/ 5 | 6 | # Philly-realted # 7 | pt/ 8 | .ptconfig 9 | 10 | 11 | 12 | # Project-related # 13 | */*results*/ 14 | *results*/ 15 | tmp*/ 16 | cache/* 17 | */cache*/ 18 | tmp*.py 19 | *pickle 20 | 21 | # compiled files # 22 | *.pyc 23 | **/__pycache__/ 24 | 25 | # Packages # 26 | ############ 27 | # it's better to unpack these files and commit the raw source 28 | # git has its own built in compression methods 29 | *.7z 30 | *.dmg 31 | *.gz 32 | *.iso 33 | *.jar 34 | *.rar 35 | *.tar 36 | *.zip 37 | 38 | # Logs and databases # 39 | ###################### 40 | *.log 41 | *.sql 42 | *.sqlite 43 | .ipynb_checkpoints/ 44 | *.swp 45 | *.vscode/ 46 | *.idea/ 47 | 48 | # OS generated files # 49 | ###################### 50 | .DS_Store 51 | .DS_Store? 52 | ._* 53 | .Spotlight-V100 54 | .Trashes 55 | ehthumbs.db 56 | Thumbs.db 57 | -------------------------------------------------------------------------------- /LF-VILA/src/optimization/optimizer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | 4 | 5 | 6 | def build_optimizer_parameters(config, model): 7 | 8 | param_optimizer = list(model.named_parameters()) 9 | param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] 10 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'pos_embed','relative_position_bias_table'] 11 | 12 | if "weight_decay" in config.TRAINING.keys(): 13 | weight_decay = config.TRAINING["weight_decay"] 14 | else: 15 | weight_decay = 0.01 16 | 17 | 18 | optimizer_grouped_parameters = [{ 19 | 'params': [ 20 | p for n, p in param_optimizer 21 | if not any(nd in n for nd in no_decay) and p.requires_grad 22 | ], 23 | 'weight_decay': 24 | weight_decay 25 | }, { 26 | 'params': 27 | [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and p.requires_grad], 28 | 'weight_decay': 29 | 0.0 30 | }] 31 | 32 | return optimizer_grouped_parameters -------------------------------------------------------------------------------- /CLIP-ViP/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Jie Lei 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /LF-VILA/src/utils/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | import torch 5 | import einops 6 | 7 | def mkdirp(p): 8 | if not os.path.exists(p): 9 | os.makedirs(p) 10 | 11 | def set_random_seed(seed): 12 | random.seed(seed) 13 | np.random.seed(seed) 14 | torch.manual_seed(seed) 15 | torch.cuda.manual_seed_all(seed) 16 | 17 | def vector_gather(vectors, indices): 18 | """ 19 | Gathers (batched) vectors according to indices. 
20 | Arguments: 21 | vectors: Tensor[N, L, D] 22 | indices: Tensor[N, K] or Tensor[N] 23 | Returns: 24 | Tensor[N, K, D] or Tensor[N, D] 25 | """ 26 | N, L, D = vectors.shape 27 | squeeze = False 28 | if indices.ndim == 1: 29 | squeeze = True 30 | indices = indices.unsqueeze(-1) 31 | N2, K = indices.shape 32 | assert N == N2 33 | indices = einops.repeat(indices, "N K -> N K D", D=D) 34 | out = torch.gather(vectors, dim=1, index=indices) 35 | if squeeze: 36 | out = out.squeeze(1) 37 | return out 38 | 39 | class AverageMeter(object): 40 | """Computes and stores the average and current/max/min value""" 41 | def __init__(self): 42 | self.val = 0 43 | self.avg = 0 44 | self.sum = 0 45 | self.count = 0 46 | self.max = -1e10 47 | self.min = 1e10 48 | self.reset() 49 | 50 | def reset(self): 51 | self.val = 0 52 | self.avg = 0 53 | self.sum = 0 54 | self.count = 0 55 | self.max = -1e10 56 | self.min = 1e10 57 | 58 | def update(self, val, n=1): 59 | self.max = max(val, self.max) 60 | self.min = min(val, self.min) 61 | self.val = val 62 | self.sum += val * n 63 | self.count += n 64 | self.avg = self.sum / self.count -------------------------------------------------------------------------------- /visualparsing/README.md: -------------------------------------------------------------------------------- 1 | # Visual Parsing 2 | 3 | [Probing Inter-modality: Visual Parsing with Self-Attention for Vision-and-Language Pre-training](https://proceedings.neurips.cc/paper/2021/file/23fa71cc32babb7b91130824466d25a5-Paper.pdf) accepted by [NeurIPS 2021](https://nips.cc/Conferences/2021/). 4 | 5 | By [Hongwei Xue](https://hellwayxue.github.io/), [Yupan Huang](https://hypjudy.github.io/), [Bei Liu](https://www.microsoft.com/en-us/research/people/libei/), [Houwen Peng](https://www.microsoft.com/en-us/research/people/hopeng/), [Jianlong Fu](https://www.microsoft.com/en-us/research/people/jianf/), [Houqiang Li](http://staff.ustc.edu.cn/~lihq/en/), and [Jiebo Luo](https://www.cs.rochester.edu/u/jluo/). 6 | 7 | ## Introdution 8 | 9 | We propose a fully Transformer visual embedding for Vision-Language Pre-training (VLP) to 10 | better learn visual relation and further promote inter-modal alignment. Specifically, 11 | we propose a metric named Inter-Modality Flow (IMF) to measure the interaction 12 | between vision and language (i.e., inter-modality). We also design a novel masking 13 | optimization mechanism named Masked Feature Regression (MFR) in Transformer 14 | to further promote the inter-modality learning. 15 | 16 |

17 | ![framework](visualparsing.png) 18 | 19 | 20 | The framework of Visual Parsing. 21 |
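The introduction above names Inter-Modality Flow (IMF) as a measure of vision-language interaction but does not spell out its formula in this README. As a loose, hypothetical illustration only (not the paper's IMF; the function name, shapes, and the attention-share idea below are assumptions of this sketch), cross-modal interaction in a fused Transformer layer can be proxied by the share of self-attention mass that crosses between text and visual tokens:

```python
# Hypothetical sketch only: this is NOT the IMF formulation from the paper.
# Given row-normalized self-attention over a concatenated [text; visual]
# token sequence, report the fraction of attention that crosses modalities.
import numpy as np

def cross_modal_attention_share(attn: np.ndarray, num_text_tokens: int) -> float:
    """attn: (T, T) attention weights whose rows sum to 1; the first
    `num_text_tokens` positions are text tokens, the rest are visual tokens."""
    t = num_text_tokens
    text_to_visual = attn[:t, t:].sum()   # text queries attending to visual keys
    visual_to_text = attn[t:, :t].sum()   # visual queries attending to text keys
    return float((text_to_visual + visual_to_text) / attn.sum())

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    raw = rng.random((10, 10))
    attn = raw / raw.sum(axis=1, keepdims=True)  # rows sum to 1, like softmax
    print(cross_modal_attention_share(attn, num_text_tokens=4))
```

A larger value under this toy proxy simply means more attention is exchanged across modalities; consult the paper linked above for the actual IMF definition.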

22 | 23 | ## Citing Our Paper 24 | 25 | If you find our work useful for your research, please consider citing our paper. :blush: 26 | 27 | ```bibtex 28 | @article{xue2021probing, 29 | title={Probing Inter-modality: Visual Parsing with Self-Attention for Vision-and-Language Pre-training}, 30 | author={Xue, Hongwei and Huang, Yupan and Liu, Bei and Peng, Houwen and Fu, Jianlong and Li, Houqiang and Luo, Jiebo}, 31 | journal={Advances in Neural Information Processing Systems}, 32 | volume={34}, 33 | year={2021} 34 | } 35 | ``` 36 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/lsmdc_retrieval/lsmdc_retrieval_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "lsmdc-101k", 5 | "vis_format": "video", 6 | "txt": "clip_data/vis_db/lsmdc/train_101k_frame.jsonl", 7 | "vis": "datasets/lsmdc" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "lsmdc-1k", 13 | "vis_format": "video", 14 | "txt": "clip_data/vis_db/lsmdc/test_1k_frame.jsonl", 15 | "vis": "datasets/lsmdc" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "lsmdc-1k", 21 | "vis_format": "video", 22 | "txt": "clip_data/vis_db/lsmdc/test_1k_frame.jsonl", 23 | "vis": "datasets/lsmdc" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch16", 39 | "clip_config": "openai/clip-vit-base-patch16", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 10, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/lsmdc_retrieval/lsmdc_retrieval_vip_base_16", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/lsmdc_retrieval/lsmdc_retrieval_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "lsmdc-101k", 5 | "vis_format": "video", 6 | "txt": "clip_data/vis_db/lsmdc/train_101k_frame.jsonl", 7 | "vis": "datasets/lsmdc" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "lsmdc-1k", 13 | "vis_format": "video", 14 | "txt": "clip_data/vis_db/lsmdc/test_1k_frame.jsonl", 15 | "vis": "datasets/lsmdc" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "lsmdc-1k", 21 | "vis_format": "video", 22 | "txt": 
"clip_data/vis_db/lsmdc/test_1k_frame.jsonl", 23 | "vis": "datasets/lsmdc" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/32/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch32", 39 | "clip_config": "openai/clip-vit-base-patch32", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 10, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/lsmdc_retrieval/lsmdc_retrieval_vip_base_32", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/didemo_retrieval/didemo_retrieval_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "didemo-train", 5 | "vis_format": "video", 6 | "txt": "datasets/lfvideo_data/task/didemo/train.jsonl", 7 | "vis": "datasets/didemo/didemo_video_xfps/" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "didemo-val", 13 | "vis_format": "video", 14 | "txt": "datasets/lfvideo_data/task/didemo/val.jsonl", 15 | "vis": "datasets/didemo/didemo_video_xfps/" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "didemo-test", 21 | "vis_format": "video", 22 | "txt": "datasets/lfvideo_data/task/didemo/test.jsonl", 23 | "vis": "datasets/didemo/didemo_video_xfps/" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/32/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch32", 39 | "clip_config": "openai/clip-vit-base-patch32", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": 
"cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 20, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/didemo_retrieval/didemo_retrieval_vip_base_32", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/didemo_retrieval/didemo_retrieval_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "didemo-train", 5 | "vis_format": "video", 6 | "txt": "datasets/lfvideo_data/task/didemo/train.jsonl", 7 | "vis": "datasets/didemo/didemo_video_xfps/" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "didemo-val", 13 | "vis_format": "video", 14 | "txt": "datasets/lfvideo_data/task/didemo/val.jsonl", 15 | "vis": "datasets/didemo/didemo_video_xfps/" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "didemo-test", 21 | "vis_format": "video", 22 | "txt": "datasets/lfvideo_data/task/didemo/test.jsonl", 23 | "vis": "datasets/didemo/didemo_video_xfps/" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 70, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch16", 39 | "clip_config": "openai/clip-vit-base-patch16", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 10, 63 | "lr_mul_prefix": "logit_scale", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 20, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/didemo_retrieval/didemo_retrieval_vip_base_16", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /LF-VILA/src/utils/data.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def mask_batch_text_tokens( 4 | inputs, tokenizer, mlm_probability=0.15, is_train=True): 5 | """ modified from transformers.data.data_collator 6 | Args: 7 | inputs: (B, L), 2D torch.Tensor, does not work for 1D. It has already been padded. 8 | tokenizer: 9 | mlm_probability: float 10 | is_train: if True use random masking, else mask tokens at fixed position to remove randomness in evaluation. 11 | """ 12 | if tokenizer.mask_token is None: 13 | raise ValueError( 14 | "This tokenizer does not have a mask token which is necessary for masked language modeling. 
" 15 | "Remove the --mlm flag if you want to use this tokenizer." 16 | ) 17 | 18 | labels = inputs.clone() 19 | # We sample a few tokens in each sequence for masked-LM training 20 | # (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) 21 | probability_matrix = torch.full(labels.shape, mlm_probability) 22 | special_tokens_mask = [ 23 | tokenizer.get_special_tokens_mask( 24 | val, already_has_special_tokens=True) for val in labels.tolist() 25 | ] 26 | probability_matrix.masked_fill_(torch.tensor( 27 | special_tokens_mask, dtype=torch.bool), value=0.0) 28 | if tokenizer._pad_token is not None: 29 | padding_mask = labels.eq(tokenizer.pad_token_id) 30 | probability_matrix.masked_fill_(padding_mask, value=0.0) 31 | masked_indices = torch.bernoulli(probability_matrix).bool() 32 | labels[~masked_indices] = -100 # We only compute loss on masked tokens 33 | 34 | # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) 35 | indices_replaced = torch.bernoulli( 36 | torch.full(labels.shape, 0.8)).bool() & masked_indices 37 | inputs[indices_replaced] = tokenizer.convert_tokens_to_ids( 38 | tokenizer.mask_token) 39 | 40 | # 10% of the time, we replace masked input tokens with random word 41 | indices_random = torch.bernoulli( 42 | torch.full(labels.shape, 0.5) 43 | ).bool() & masked_indices & ~indices_replaced 44 | random_words = torch.randint( 45 | len(tokenizer), labels.shape, 46 | dtype=torch.long) # len(tokenizer) == #vocab 47 | inputs[indices_random] = random_words[indices_random] 48 | 49 | # The rest of the time (10% of the time) we keep the masked input tokens unchanged 50 | return inputs, labels -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/msrvtt_retrieval/msrvtt_retrieval_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "msrvtt-9k", 5 | "vis_format": "video", 6 | "txt": "clip_data/vis_db/msrvtt_video_clips/train9k.jsonl", 7 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "msrvtt-1ka", 13 | "vis_format": "video", 14 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 15 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "msrvtt-1ka", 21 | "vis_format": "video", 22 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 23 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch16", 39 | "clip_config": "openai/clip-vit-base-patch16", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | 
"lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 100, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/msrvtt_retrieval/msrvtt_retrieval_vip_base_16", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/msrvtt_retrieval/msrvtt_retrieval_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "msrvtt-9k", 5 | "vis_format": "video", 6 | "txt": "clip_data/vis_db/msrvtt_video_clips/train9k.jsonl", 7 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "msrvtt-1ka", 13 | "vis_format": "video", 14 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 15 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "msrvtt-1ka", 21 | "vis_format": "video", 22 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 23 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/32/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch32", 39 | "clip_config": "openai/clip-vit-base-patch32", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 100, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/msrvtt_retrieval/msrvtt_retrieval_vip_base_32", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/actnet_retrieval/actnet_retrieval_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "actnet-train", 5 | "vis_format": "frame", 6 | "txt": "clip_data/vis_db/anet_retrieval/train.jsonl", 7 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "actnet-test", 13 | "vis_format": "frame", 14 | "txt": "clip_data/vis_db/anet_retrieval/val1.jsonl", 15 | "vis": 
"datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "actnet-test", 21 | "vis_format": "frame", 22 | "txt": "clip_data/vis_db/anet_retrieval/val1.jsonl", 23 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 32, 29 | "test_n_clips": 1, 30 | "test_num_frms": 32, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 70, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch16", 39 | "clip_config": "openai/clip-vit-base-patch16", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 8, 49 | "test_batch_size": 8, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 20, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/actnet_retrieval/actnet_retrieval_vip_base_16", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/actnet_retrieval/actnet_retrieval_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "actnet-train", 5 | "vis_format": "frame", 6 | "txt": "clip_data/vis_db/anet_retrieval/train.jsonl", 7 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "actnet-test", 13 | "vis_format": "frame", 14 | "txt": "clip_data/vis_db/anet_retrieval/val1.jsonl", 15 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "actnet-test", 21 | "vis_format": "frame", 22 | "txt": "clip_data/vis_db/anet_retrieval/val1.jsonl", 23 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 32, 29 | "test_n_clips": 1, 30 | "test_num_frms": 32, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 70, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/32/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch32", 39 | "clip_config": "openai/clip-vit-base-patch32", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | 
"amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 20, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/actnet_retrieval/actnet_retrieval_vip_base_32", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /hd-vila/src/configs/pretrain_stage2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "hdvila_pretrain", 5 | "vis_format": "video", 6 | "txt": "data/hdvila/train_group/part1.jsonl", 7 | "vis": "data/hdvila/video_clips" 8 | } 9 | ], 10 | "val_datasets": [ 11 | { 12 | "name": "msrvtt", 13 | "vis_format": "video", 14 | "txt": "data/msrvtt_retrieval/test1ka.jsonl", 15 | "vis": "data/msrvtt_retrieval/videos" 16 | }, 17 | { 18 | "name": "hdvila_test_full", 19 | "vis_format": "video", 20 | "txt": "data/hdvila/test_full_1k.jsonl", 21 | "vis": "data/hdvila/video_clips" 22 | } 23 | ], 24 | 25 | "model_config": "src/configs/base_model_large.json", 26 | "e2e_weights_path": "data/output/pretrain_stage1/ckpt/model_step_1129660.pt", 27 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 28 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 29 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 30 | "output_dir": "data/output/pretrain_stage2", 31 | "resnet_depth": 50, 32 | "resnet_frozen_stage": -1, 33 | "timesformer_depth": 4, 34 | "timesformer_heads": 16, 35 | "max_txt_len": 50, 36 | "score_agg_func": "lse", 37 | "loss_type": "ce", 38 | "train_n_clips": 2, 39 | "num_frm": 7, 40 | "sample_rate": 12, 41 | "crop_size": [640, 1024], 42 | "out_size": [256, 128, 64, 3], 43 | "train_batch_size": 16, 44 | "val_batch_size": 16, 45 | "max_n_example_per_group": 1, 46 | "gradient_accumulation_steps": 2, 47 | "num_train_epochs": 10, 48 | "min_valid_steps": 1, 49 | "num_valid": 100, 50 | "only_valid_steps": 500, 51 | "save_steps_ratio": 0.01, 52 | "learning_rate": 5e-5, 53 | "decay": "linear", 54 | "optim": "adamw", 55 | "betas": [0.9, 0.98], 56 | "dropout": 0.1, 57 | "weight_decay": 1e-3, 58 | "grad_norm": 5.0, 59 | "cnn_learning_rate": 5e-5, 60 | "cnn_weight_decay": 1e-3, 61 | "cnn_lr_decay": "linear", 62 | "align_learning_rate": 5e-5, 63 | "align_weight_decay": 1e-3, 64 | "pixel_random_sampling_size": 160, 65 | "seed":24, 66 | "fp16": 1, 67 | "amp_level": "O2", 68 | "freeze_s1": 1, 69 | "use_itm": 0, 70 | "use_itc": 0, 71 | "use_mlm": 1, 72 | 73 | "bert_mean": 1, 74 | "n_workers": 8, 75 | 76 | "backbone_channels": [256, 512, 1024, 2048], 77 | "backbone_channel_in_size": 2048, 78 | "hidden_size": 1024, 79 | 80 | "temp": 0.05 81 | } 82 | -------------------------------------------------------------------------------- /CLIP-ViP/src/utils/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def cal_cossim(feats1, feats2): 4 | sim_matrix = np.dot(feats1, feats2.T) 5 | return sim_matrix 6 | 7 | def np_softmax(X, theta = 1.0, axis = None): 8 | """ 9 | Compute 
the softmax of each element along an axis of X. 10 | 11 | Parameters 12 | ---------- 13 | X: ND-Array. Probably should be floats. 14 | theta (optional): float parameter, used as a multiplier 15 | prior to exponentiation. Default = 1.0 16 | axis (optional): axis to compute values along. Default is the 17 | first non-singleton axis. 18 | 19 | Returns an array the same size as X. The result will sum to 1 20 | along the specified axis. 21 | """ 22 | # make X at least 2d 23 | y = np.atleast_2d(X) 24 | # find axis 25 | if axis is None: 26 | axis = next(j[0] for j in enumerate(y.shape) if j[1] > 1) 27 | # multiply y against the theta parameter, 28 | y = y * float(theta) 29 | # subtract the max for numerical stability 30 | y = y - np.expand_dims(np.max(y, axis = axis), axis) 31 | # exponentiate y 32 | y = np.exp(y) 33 | # take the sum along the specified axis 34 | ax_sum = np.expand_dims(np.sum(y, axis = axis), axis) 35 | # finally: divide elementwise 36 | p = y / ax_sum 37 | # flatten if X was 1D 38 | if len(X.shape) == 1: p = p.flatten() 39 | return p 40 | 41 | def compute_metrics(x): 42 | sx = np.sort(-x, axis=1) 43 | d = np.diag(-x) 44 | d = d[:, np.newaxis] 45 | ind = sx - d 46 | ind = np.where(ind == 0) 47 | ind = ind[1] 48 | r1 = float(np.sum(ind == 0)) / len(ind) 49 | r5 = float(np.sum(ind < 5)) / len(ind) 50 | r10 = float(np.sum(ind < 10)) / len(ind) 51 | medr = np.median(ind) + 1 52 | meanr = np.mean(ind) + 1 53 | return r1, r5, r10, medr, meanr 54 | 55 | def compute_metrics_multi(x, t2v_labels_list): 56 | sx = np.sort(-x, axis=1) 57 | t2v_labels_list = np.array(t2v_labels_list) 58 | arg = np.arange(x.shape[0]) 59 | d = -x[arg, t2v_labels_list] 60 | d = d[:, np.newaxis] 61 | ind = sx - d 62 | ind = np.where(ind == 0) 63 | ind = ind[1] 64 | r1 = float(np.sum(ind == 0)) / len(ind) 65 | r5 = float(np.sum(ind < 5)) / len(ind) 66 | r10 = float(np.sum(ind < 10)) / len(ind) 67 | medr = np.median(ind) + 1 68 | meanr = np.mean(ind) + 1 69 | return r1, r5, r10, medr, meanr 70 | 71 | 72 | if __name__ == '__main__': 73 | 74 | sim_matrix = np.random.random((5,5)) 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /RAI.md: -------------------------------------------------------------------------------- 1 | Responsible AI Considerations 2 | 3 | The proposed video-language dataset and pre-training model shows the capacity and generalization of learned VL representation which could benefit many applications of CV and NLP with a large range of uses across many domains. Each one of the uses has potential benefits and societal impacts. While we foresee that our technology could be used to find key information and improve efficiency and effectiveness for helpdesks, recommendation, retail and sales, we realize that it could also be used, in combination with new data, to fine-tune models to mislead, or otherwise harm people. We are also aware that this work uses considerable computation resources which itself, has environmental impacts. Therefore reducing the model size and computing effort is essential for future research. 4 | 5 | Machine learning systems can display unfair behavior for different individuals or groups. This is a multi-dimensional, socio-technical challenge and is not explicitly addressed or captured in the current accuracy metrics for this research technology. In general, standardized fairness measures have not yet been agreed upon in academia or industry. 
We see opportunities for more work in this area to develop methods and benchmarks for measuring fairness aspects. 6 | 7 | Given that user generated data is used, it is possible that certain demographic groups may not have enough representation. While we have balanced various video categories to mitigate for disparities, it is still likely that bias and fairness issues exist; this is an area of potential future work. There may be a Western heteronormative bias, stereotypical depictions of historically marginalized populations and/or lack of representation among some groups. Although we have filtered the input data for explicit and violent content, it is possible that it hasn’t been totally eliminated in the training data and could have impacts on the results. 8 | 9 | With visual generation techniques it is particularly important to do further work to prevent malicious use to misinform or harm people. 10 | 11 | While some mitigations for potential harms can be done in the base model, it’s important to recognize that considering risks for fine tuning data for particular scenarios is critical as well. Ultimately, choosing the application scenario of any final model used in a production system will require careful consideration of potential harms specific to the scenario. 12 | 13 | For help or issues using the pre-trained models, please submit an issue or contact Bei Liu (bei.liu@microsoft.com) and Jianlong Fu (jianf@microsoft.com). 14 | -------------------------------------------------------------------------------- /hd-vila/src/configs/msrvtt_qa.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "msrvtt_qa", 5 | "txt": { 6 | "msrvtt_qa": "data/msrvtt_qa/train.jsonl" 7 | }, 8 | "vis": "data/msrvtt_retrieval/videos_6fps" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "name": "msrvtt_qa", 14 | "txt": { 15 | "msrvtt_qa": "data/msrvtt_qa/val.jsonl" 16 | }, 17 | "vis": "data/msrvtt_retrieval/videos_6fps" 18 | } 19 | ], 20 | "ans2label_path": "data/msrvtt_qa/train_ans2label.json", 21 | "max_txt_len": 100, 22 | "max_img_size": 448, 23 | "fps": 2, 24 | "reshape_size": [180, 288], 25 | "crop_size": [160, 256], 26 | "sample_rate": 4, 27 | "num_frm": 7, 28 | "train_n_clips": 1, 29 | "score_agg_func": "lse", 30 | "max_n_example_per_group": 1, 31 | "model_config": "src/configs/base_model_large.json", 32 | "e2e_weights_path": "data/pretrained/hdvila_stage2.pt", 33 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 34 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 35 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 36 | "output_dir": "data/output/videoqa/msrvtt_qa", 37 | "train_batch_size": 16, 38 | "val_batch_size": 16, 39 | "gradient_accumulation_steps": 4, 40 | "num_train_epochs": 20, 41 | "min_valid_steps": 1, 42 | "num_valid": 20, 43 | "save_steps_ratio": 0.2, 44 | "learning_rate": 1e-5, 45 | "weight_decay": 0.3, 46 | "decay": "linear", 47 | "optim": "adamw", 48 | "betas": [0.9, 0.98], 49 | "dropout": 0.3, 50 | "grad_norm": 5.0, 51 | "cnn_learning_rate": 1e-5, 52 | "cnn_weight_decay": 0.3, 53 | "cnn_lr_decay": "linear", 54 | "align_learning_rate": 1e-5, 55 | "align_weight_decay": 0.3, 56 | "seed":66, 57 | "fp16": 1, 58 | "classifier": "mlp", 59 | "cls_hidden_scale": 2, 60 | "task": "msrvtt_qa", 61 | 62 | "resnet_depth": 50, 63 | "resnet_frozen_stage": -1, 64 | "timesformer_depth": 4, 65 | "timesformer_heads": 16, 66 | "backbone_channels": [256, 512, 1024, 
2048], 67 | "backbone_downsample": [4, 8, 16, 32], 68 | "backbone_channel_in_size": 2048, 69 | "hidden_size": 1024, 70 | 71 | "inference_model_step": 0, 72 | "inference_txt_db": "data/txt_db/msrvtt_qa/test.jsonl", 73 | "inference_img_db": "data/vis_db/msrvtt_video_clips/videos_6fps", 74 | "inference_batch_size": 4, 75 | "inference_n_clips": 8 76 | } -------------------------------------------------------------------------------- /hd-vila/src/configs/pretrain_stage1.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "hdvila_pretrain", 5 | "vis_format": "video", 6 | "txt": "data/hdvila/train_group/part1.jsonl", 7 | "vis": "data/hdvila/video_clips" 8 | } 9 | ], 10 | "val_datasets": [ 11 | { 12 | "name": "msrvtt", 13 | "vis_format": "video", 14 | "txt": "data/msrvtt_retrieval/test1ka.jsonl", 15 | "vis": "data/msrvtt_retrieval/videos" 16 | }, 17 | { 18 | "name": "hdvila_test_how2", 19 | "vis_format": "video", 20 | "txt": "data/hdvila/test_howto_1k.jsonl", 21 | "vis": "data/hdvila/video_clips" 22 | }, 23 | { 24 | "name": "hdvila_test_full", 25 | "vis_format": "video", 26 | "txt": "data/hdvila/test_full_1k.jsonl", 27 | "vis": "data/hdvila/video_clips" 28 | } 29 | ], 30 | 31 | "model_config": "src/configs/base_model_large.json", 32 | "e2e_weights_path": null, 33 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 34 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 35 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 36 | "output_dir": "data/output/pretrain_stage1/", 37 | "resnet_depth": 50, 38 | "resnet_frozen_stage": -1, 39 | "timesformer_depth": 4, 40 | "timesformer_heads": 16, 41 | "max_txt_len": 50, 42 | "score_agg_func": "lse", 43 | "loss_type": "ce", 44 | "train_n_clips": 2, 45 | "num_frm": 7, 46 | "sample_rate": 12, 47 | "crop_size": [640, 1024], 48 | "out_size": [256, 128, 64, 3], 49 | "train_batch_size": 8, 50 | "val_batch_size": 8, 51 | "max_n_example_per_group": 1, 52 | "gradient_accumulation_steps": 1, 53 | "num_train_epochs": 10, 54 | "min_valid_steps": 1, 55 | "num_valid": 200, 56 | "only_valid_steps": 1000, 57 | "save_steps_ratio": 0.005, 58 | "learning_rate": 5e-5, 59 | "decay": "linear", 60 | "optim": "adamw", 61 | "betas": [0.9, 0.98], 62 | "dropout": 0.1, 63 | "weight_decay": 1e-3, 64 | "grad_norm": 5.0, 65 | "cnn_learning_rate": 5e-5, 66 | "cnn_weight_decay": 1e-3, 67 | "cnn_lr_decay": "linear", 68 | "align_learning_rate": 5e-5, 69 | "align_weight_decay": 1e-3, 70 | "pixel_random_sampling_size": 160, 71 | "seed":24, 72 | "fp16": 1, 73 | "amp_level": "O2", 74 | "use_itm": 0, 75 | "use_itc": 1, 76 | "use_mlm": 0, 77 | 78 | "bert_mean": 1, 79 | "n_workers": 8, 80 | 81 | "backbone_channels": [256, 512, 1024, 2048], 82 | "backbone_channel_in_size": 2048, 83 | "hidden_size": 1024, 84 | 85 | "temp": 0.05 86 | } 87 | -------------------------------------------------------------------------------- /hd-vila/src/optimization/sched.py: -------------------------------------------------------------------------------- 1 | """ 2 | optimizer learning rate scheduling helpers 3 | """ 4 | from math import ceil 5 | from collections import Counter 6 | 7 | 8 | def noam_schedule(step, warmup_step=4000): 9 | if step <= warmup_step: 10 | return step / warmup_step 11 | return (warmup_step ** 0.5) * (step ** -0.5) 12 | 13 | 14 | def warmup_linear(step, warmup_step, tot_step): 15 | if step < warmup_step: 16 | return step / warmup_step 17 | return max(0, 
(tot_step-step)/(tot_step-warmup_step)) 18 | 19 | 20 | def multi_step_schedule(n_epoch, milestones, step, warmup_step,gamma=0.5): 21 | if step <= warmup_step: 22 | return step / warmup_step 23 | 24 | milestones = list(sorted(milestones)) 25 | for i, m in enumerate(milestones): 26 | if n_epoch < m: 27 | return gamma**i 28 | return gamma**(len(milestones)+1) 29 | 30 | class AutoStep(): 31 | def __init__(self, tolerance, gamma): 32 | self.tolerance = tolerance 33 | self.coeff_mem = 1 34 | self.gamma = gamma 35 | self.best_score = 0. 36 | self.count = 0 37 | 38 | def step(self, score): 39 | if score <= self.best_score: 40 | self.count += 1 41 | else: 42 | self.count = 0 43 | self.best_score = score 44 | if self.count > self.tolerance: 45 | self.count = 0 46 | self.coeff_mem = self.coeff_mem * self.gamma 47 | 48 | def get_lr(self, global_step, learning_rate, num_train_steps, warmup_ratio=0.1): 49 | warmup_steps = int(warmup_ratio * num_train_steps) 50 | if global_step <= warmup_steps: 51 | return learning_rate * global_step / warmup_steps 52 | 53 | return max(self.coeff_mem * learning_rate, 1e-8) 54 | 55 | 56 | def get_lr_sched(global_step, decay, learning_rate, 57 | num_train_steps, warmup_ratio=0.1, 58 | decay_epochs=[], multi_step_epoch=-1): 59 | warmup_steps = int(warmup_ratio*num_train_steps) 60 | if decay == 'linear': 61 | lr_this_step = learning_rate * warmup_linear( 62 | global_step, warmup_steps, num_train_steps) 63 | elif decay == 'invsqrt': 64 | lr_this_step = learning_rate * noam_schedule( 65 | global_step, warmup_steps) 66 | elif decay == 'constant': 67 | lr_this_step = learning_rate 68 | elif decay == "multi_step": 69 | assert multi_step_epoch >= 0 70 | lr_this_step = learning_rate * multi_step_schedule( 71 | multi_step_epoch, decay_epochs, global_step, warmup_steps) 72 | if lr_this_step <= 0: 73 | # save guard for possible miscalculation of train steps 74 | lr_this_step = 1e-8 75 | return lr_this_step 76 | -------------------------------------------------------------------------------- /LF-VILA/src/utils/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | references: UNITER 3 | """ 4 | 5 | import logging 6 | from tensorboardX import SummaryWriter 7 | import os 8 | 9 | _LOG_FMT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' 10 | _DATE_FMT = '%m/%d/%Y %H:%M:%S' 11 | logging.basicConfig(format=_LOG_FMT, datefmt=_DATE_FMT, level=logging.INFO) 12 | LOGGER = logging.getLogger('__main__') # this is the global logger 13 | 14 | 15 | def add_log_to_file(log_path): 16 | fh = logging.FileHandler(log_path) 17 | formatter = logging.Formatter(_LOG_FMT, datefmt=_DATE_FMT) 18 | fh.setFormatter(formatter) 19 | LOGGER.addHandler(fh) 20 | 21 | 22 | class TensorboardLogger(object): 23 | def __init__(self): 24 | self._logger = None 25 | self._global_step = 0 26 | 27 | def create(self, path): 28 | if "AZUREML_TB_PATH" in os.environ: 29 | self._logger = SummaryWriter(os.environ["AZUREML_TB_PATH"]) 30 | else: 31 | self._logger = SummaryWriter(path) 32 | 33 | def noop(self, *args, **kwargs): 34 | return 35 | 36 | def step(self): 37 | self._global_step += 1 38 | 39 | @property 40 | def global_step(self): 41 | return self._global_step 42 | 43 | @global_step.setter 44 | def global_step(self, step): 45 | self._global_step = step 46 | 47 | def log_scalar_dict(self, log_dict, prefix=''): 48 | """ log a dictionary of scalar values""" 49 | if self._logger is None: 50 | return 51 | if prefix: 52 | prefix = f'{prefix}_' 53 | for name, value in 
log_dict.items(): 54 | if isinstance(value, dict): 55 | self.log_scalar_dict(value, self._global_step, 56 | prefix=f'{prefix}{name}') 57 | else: 58 | self._logger.add_scalar(f'{prefix}{name}', value, 59 | self._global_step) 60 | 61 | def __getattr__(self, name): 62 | if self._logger is None: 63 | return self.noop 64 | return self._logger.__getattribute__(name) 65 | 66 | 67 | TB_LOGGER = TensorboardLogger() 68 | 69 | 70 | class RunningMeter(object): 71 | """ running meteor of a scalar value 72 | (useful for monitoring training loss) 73 | """ 74 | def __init__(self, name, val=None, smooth=0.99): 75 | self._name = name 76 | self._sm = smooth 77 | self._val = val 78 | 79 | def __call__(self, value): 80 | self._val = (value if self._val is None 81 | else value*(1-self._sm) + self._val*self._sm) 82 | 83 | def __str__(self): 84 | return f'{self._name}: {self._val:.4f}' 85 | 86 | @property 87 | def val(self): 88 | return self._val 89 | 90 | @property 91 | def name(self): 92 | return self._name 93 | -------------------------------------------------------------------------------- /hd-vila-100m/README.md: -------------------------------------------------------------------------------- 1 | # HD-VILA-100M Dataset 2 | 3 | ## What is HD-VILA-100M? 4 | HD-VILA-100M is a large-scale, high-resolution, and 5 | diversified video-language dataset to facilitate the multimodal representation learning. 6 | 7 |
8 | ![examples for hd-vila](figs/examples.png) 9 | 10 | 11 | Examples of video clips and ASR generated transcriptions in the proposed HD-VILA-100M dataset. 12 | 13 | 14 | ## Data statistics 15 | The dataset contains 3.3 million videos in total, which are of high quality and distributed in 15 categories in balance. 16 | 17 | ![statistics](figs/statics.png) 18 | 19 | 20 | The distribution of categories in HD-VILA-100M dataset. 21 | 
22 | 23 | The details of our dataset are presented in the table below. 24 | | Dataset | Domain | #Video clips | #Sentence | Avg len(sec) | Sent len | Duration(h) | Resolution 25 | | :-----| :---- | :---- | :---- | :---- | :---- | :---- | :---- | 26 | | HD-VILA-100M | open | 100M | 100M | 13.4 | 32.5 | 371.5K | 720p | 27 | 28 | 29 | ## Download 30 | 31 | You can download all the urls through this [link](https://hdvila.blob.core.windows.net/dataset/hdvila100m.zip?sp=r&st=2022-06-28T03:33:11Z&se=2026-01-01T11:33:11Z&spr=https&sv=2021-06-08&sr=b&sig=VaqQkLFDqKinfkaPNs1jJ1EQIYCB%2FUPYiqFqmjWye6Y%3D) (updated 6/28/2022). Together we also offer all the timestamps to divide the videos into clips. The format of the data is: 32 | ``` 33 | { 34 | 'video_id':'QMi8x8o55Ns', 35 | 'url': 'https://www.youtube.com/watch?v=QMi8x8o55Ns', 36 | 'clip': [ 37 | {'clip_id': 'QMi8x8o55Ns.1.mp4', 'span': ['00:00:17.759', '00:00:23.279']} 38 | ... 39 | {'clip_id': 'QMi8x8o55Ns.16.mp4', 'span': ['00:04:52.140', '00:05:03.350']} 40 | ], 41 | } 42 | ``` 43 | 44 | You can download the raw videos from YouTube and use [src/cut_videos.py](./src/cut_videos.py) to cut the videos to clips. 45 | 46 | 47 | ## License 48 | 49 | The license of the collected dataset is [here](./LICENSE). 50 | 51 | ## Citing HD-VILA 52 | 53 | If you find this dataset useful for your research, please consider citing our paper. :blush: 54 | 55 | ```bibtex 56 | @inproceedings{xue2022hdvila, 57 | title={Advancing High-Resolution Video-Language Representation with Large-Scale Video Transcriptions}, 58 | author={Xue, Hongwei and Hang, Tiankai and Zeng, Yanhong and Sun, Yuchong and Liu, Bei and Yang, Huan and Fu, Jianlong and Guo, Baining}, 59 | booktitle={International Conference on Computer Vision and Pattern Recognition (CVPR)}, 60 | year={2022} 61 | } 62 | ``` 63 | 64 | ## Contact Information 65 | 66 | For further request about dataset or problems using the dataset, you can contact [Bei Liu]() (`bei.liu@microsoft.com`). 
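As a usage note for the metadata format shown in the Download section above: once a raw video has been downloaded from its URL, the released `span` timestamps can be used to cut it into the listed clips. The snippet below is only a minimal sketch of that step (the maintained implementation is [src/cut_videos.py](./src/cut_videos.py), which may differ); it assumes `ffmpeg` is available on PATH, that a single metadata record like the example above has been saved to a JSON file, and that `raw_videos/` and `video_clips/` are placeholder directory names.

```python
import json
import os
import subprocess

def cut_clips(meta_path, raw_video_dir, out_dir):
    """Cut one downloaded video into clips using the released timestamps."""
    os.makedirs(out_dir, exist_ok=True)
    with open(meta_path) as f:
        meta = json.load(f)  # one record: {'video_id', 'url', 'clip': [...]}
    src = os.path.join(raw_video_dir, meta['video_id'] + '.mp4')
    for clip in meta['clip']:
        start, end = clip['span']  # e.g. '00:00:17.759', '00:00:23.279'
        dst = os.path.join(out_dir, clip['clip_id'])
        # -ss/-to select the span; re-encoding keeps the cut points frame-accurate
        subprocess.run(
            ['ffmpeg', '-y', '-i', src, '-ss', start, '-to', end,
             '-c:v', 'libx264', '-c:a', 'aac', dst],
            check=True)

if __name__ == '__main__':
    cut_clips('QMi8x8o55Ns.json', 'raw_videos', 'video_clips')
```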
67 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/pretrain/pretrain_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "hdvila", 5 | "vis_format": "videoframe", 6 | "txt": "datasets/hdvila/hdvila_subtitles_92m_db", 7 | "vis": "youtube_data/ytt180m/video_clips_3fps", 8 | "vid_cap_path": "datasets/hdvila/hdvila_captions_db", 9 | "vid_txt": "subtitle", 10 | "img_dir": "", 11 | "cap_path": "", 12 | "img_source": "", 13 | "img_ratio": 0 14 | } 15 | ], 16 | "val_datasets": [ 17 | { 18 | "name": "msrvtt", 19 | "vis_format": "video", 20 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 21 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 22 | }, 23 | { 24 | "name": "how2", 25 | "vis_format": "video", 26 | "txt": "clip_data/vis_db/pretrain_data/test_howto_1k.jsonl", 27 | "vis": "youtube_data/ytt180m/video_clips_3fps" 28 | }, 29 | { 30 | "name": "ours", 31 | "vis_format": "video", 32 | "txt": "clip_data/vis_db/pretrain_data/test_full_1k.jsonl", 33 | "vis": "youtube_data/ytt180m/video_clips_3fps" 34 | } 35 | ], 36 | 37 | "train_n_clips": 1, 38 | "train_num_frms": 12, 39 | "test_n_clips": 1, 40 | "test_num_frms": 12, 41 | "sample_rate": 0, 42 | "sample_jitter": 1, 43 | "video_res": [240, 320], 44 | "input_res": [224, 224], 45 | "max_txt_len": 70, 46 | 47 | "e2e_weights_path": null, 48 | "clip_weights": "openai/clip-vit-base-patch16", 49 | "clip_config": "openai/clip-vit-base-patch16", 50 | "clip_vision_additional_config": { 51 | "type": "ViP", 52 | "temporal_size": 12, 53 | "if_use_temporal_embed": 1, 54 | "logit_scale_init_value": 4.60, 55 | "add_cls_num": 3 56 | }, 57 | 58 | "train_batch_size": 16, 59 | "test_batch_size": 16, 60 | "max_n_example_per_group": 1, 61 | "gradient_accumulation_steps": 1, 62 | "n_workers": 8, 63 | "pin_mem": 1, 64 | "fp16": 1, 65 | "amp_level": "O2", 66 | "seed": 42, 67 | 68 | "optim": "adamw", 69 | "betas": [0.9, 0.98], 70 | "learning_rate": 5e-6, 71 | "weight_decay": 0.05, 72 | "lr_mul": 1, 73 | "lr_mul_prefix": "", 74 | "loss_config": { 75 | "loss_name": "NCELearnableTempLoss_vsc_fc", 76 | "if_gather": 1 77 | }, 78 | "warmup_ratio": 0.01, 79 | "decay": "cosine", 80 | "grad_norm": 5.0, 81 | 82 | "num_train_epochs": 5, 83 | "min_valid_steps": 1, 84 | "num_valid": 100, 85 | "only_valid_steps": 1000, 86 | "save_steps_ratio": 0.01, 87 | "output_dir": "vidclip_data/output/pretrain/pretrain_vip_base_16/", 88 | "if_tb_log": 1, 89 | "if_model_saver": 1, 90 | "if_log2file": 1, 91 | "dummy_data": 0 92 | } 93 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/pretrain/pretrain_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "hdvila", 5 | "vis_format": "videoframe", 6 | "txt": "datasets/hdvila/hdvila_subtitles_92m_db", 7 | "vis": "youtube_data/ytt180m/video_clips_3fps", 8 | "vid_cap_path": "datasets/hdvila/hdvila_captions_db", 9 | "vid_txt": "subtitle", 10 | "img_dir": "", 11 | "cap_path": "", 12 | "img_source": "", 13 | "img_ratio": 0 14 | } 15 | ], 16 | "val_datasets": [ 17 | { 18 | "name": "msrvtt", 19 | "vis_format": "video", 20 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 21 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 22 | }, 23 | { 24 | "name": "how2", 25 | "vis_format": "video", 26 | "txt": 
"clip_data/vis_db/pretrain_data/test_howto_1k.jsonl", 27 | "vis": "youtube_data/ytt180m/video_clips_3fps" 28 | }, 29 | { 30 | "name": "ours", 31 | "vis_format": "video", 32 | "txt": "clip_data/vis_db/pretrain_data/test_full_1k.jsonl", 33 | "vis": "youtube_data/ytt180m/video_clips_3fps" 34 | } 35 | ], 36 | 37 | "train_n_clips": 1, 38 | "train_num_frms": 12, 39 | "test_n_clips": 1, 40 | "test_num_frms": 12, 41 | "sample_rate": 0, 42 | "sample_jitter": 1, 43 | "video_res": [240, 320], 44 | "input_res": [224, 224], 45 | "max_txt_len": 70, 46 | 47 | "e2e_weights_path": null, 48 | "clip_weights": "openai/clip-vit-base-patch32", 49 | "clip_config": "openai/clip-vit-base-patch32", 50 | "clip_vision_additional_config": { 51 | "type": "ViP", 52 | "temporal_size": 12, 53 | "if_use_temporal_embed": 1, 54 | "logit_scale_init_value": 4.60, 55 | "add_cls_num": 3 56 | }, 57 | 58 | "train_batch_size": 32, 59 | "test_batch_size": 32, 60 | "max_n_example_per_group": 1, 61 | "gradient_accumulation_steps": 1, 62 | "n_workers": 8, 63 | "pin_mem": 1, 64 | "fp16": 1, 65 | "amp_level": "O2", 66 | "seed": 42, 67 | 68 | "optim": "adamw", 69 | "betas": [0.9, 0.98], 70 | "learning_rate": 5e-6, 71 | "weight_decay": 0.05, 72 | "lr_mul": 1, 73 | "lr_mul_prefix": "", 74 | "loss_config": { 75 | "loss_name": "NCELearnableTempLoss_vsc_fc", 76 | "if_gather": 1 77 | }, 78 | "warmup_ratio": 0.01, 79 | "decay": "cosine", 80 | "grad_norm": 5.0, 81 | 82 | "num_train_epochs": 5, 83 | "min_valid_steps": 1, 84 | "num_valid": 100, 85 | "only_valid_steps": 1000, 86 | "save_steps_ratio": 0.01, 87 | "output_dir": "vidclip_data/output/pretrain/pretrain_vip_base_32/", 88 | "if_tb_log": 1, 89 | "if_model_saver": 1, 90 | "if_log2file": 1, 91 | "dummy_data": 0 92 | } 93 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /hd-vila/src/utils/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | references: UNITER 3 | """ 4 | 5 | import logging 6 | from tensorboardX import SummaryWriter 7 | import os 8 | 9 | _LOG_FMT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' 10 | _DATE_FMT = '%m/%d/%Y %H:%M:%S' 11 | logging.basicConfig(format=_LOG_FMT, datefmt=_DATE_FMT, level=logging.INFO) 12 | LOGGER = logging.getLogger('__main__') # this is the global logger 13 | 14 | 15 | def add_log_to_file(log_path): 16 | fh = logging.FileHandler(log_path) 17 | formatter = logging.Formatter(_LOG_FMT, datefmt=_DATE_FMT) 18 | fh.setFormatter(formatter) 19 | LOGGER.addHandler(fh) 20 | 21 | 22 | class TensorboardLogger(object): 23 | def __init__(self): 24 | self._logger = None 25 | self._global_step = 0 26 | 27 | def create(self, path): 28 | if "AZUREML_TB_PATH" in os.environ: 29 | self._logger = SummaryWriter(os.environ["AZUREML_TB_PATH"]) 30 | else: 31 | self._logger = SummaryWriter(path) 32 | 33 | def noop(self, *args, **kwargs): 34 | return 35 | 36 | def step(self): 37 | self._global_step += 1 38 | 39 | @property 40 | def global_step(self): 41 | return self._global_step 42 | 43 | @global_step.setter 44 | def global_step(self, step): 45 | self._global_step = step 46 | 47 | def log_scalar_dict(self, log_dict, prefix=''): 48 | """ log a dictionary of scalar values""" 49 | if self._logger is None: 50 | return 51 | if prefix: 52 | prefix = f'{prefix}_' 53 | for name, value in log_dict.items(): 54 | if isinstance(value, dict): 55 | self.log_scalar_dict(value, self._global_step, 56 | prefix=f'{prefix}{name}') 57 | else: 58 | self._logger.add_scalar(f'{prefix}{name}', value, 59 | self._global_step) 60 | 61 | def __getattr__(self, name): 62 | if self._logger is None: 63 | return self.noop 64 | return self._logger.__getattribute__(name) 65 | 66 | 67 | TB_LOGGER = TensorboardLogger() 68 | 69 | 70 | class RunningMeter(object): 71 | """ running meteor of a scalar value 72 | (useful for monitoring training loss) 73 | """ 74 | def __init__(self, name, val=None, smooth=0.99): 75 | self._name = name 76 | self._sm = smooth 77 | self._val = val 78 | 79 | def __call__(self, value): 80 | self._val = 
(value if self._val is None 81 | else value*(1-self._sm) + self._val*self._sm) 82 | 83 | def __str__(self): 84 | return f'{self._name}: {self._val:.4f}' 85 | 86 | @property 87 | def val(self): 88 | return self._val 89 | 90 | @property 91 | def name(self): 92 | return self._name 93 | -------------------------------------------------------------------------------- /CLIP-ViP/src/utils/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | references: UNITER 3 | """ 4 | 5 | import logging 6 | from tensorboardX import SummaryWriter 7 | import os 8 | 9 | _LOG_FMT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' 10 | _DATE_FMT = '%m/%d/%Y %H:%M:%S' 11 | logging.basicConfig(format=_LOG_FMT, datefmt=_DATE_FMT, level=logging.INFO) 12 | LOGGER = logging.getLogger('__main__') # this is the global logger 13 | 14 | 15 | def add_log_to_file(log_path): 16 | fh = logging.FileHandler(log_path) 17 | formatter = logging.Formatter(_LOG_FMT, datefmt=_DATE_FMT) 18 | fh.setFormatter(formatter) 19 | LOGGER.addHandler(fh) 20 | 21 | 22 | class TensorboardLogger(object): 23 | def __init__(self): 24 | self._logger = None 25 | self._global_step = 0 26 | 27 | def create(self, path): 28 | if "AZUREML_TB_PATH" in os.environ: 29 | self._logger = SummaryWriter(os.environ["AZUREML_TB_PATH"]) 30 | else: 31 | self._logger = SummaryWriter(path) 32 | 33 | def noop(self, *args, **kwargs): 34 | return 35 | 36 | def step(self): 37 | self._global_step += 1 38 | 39 | @property 40 | def global_step(self): 41 | return self._global_step 42 | 43 | @global_step.setter 44 | def global_step(self, step): 45 | self._global_step = step 46 | 47 | def log_scalar_dict(self, log_dict, prefix=''): 48 | """ log a dictionary of scalar values""" 49 | if self._logger is None: 50 | return 51 | if prefix: 52 | prefix = f'{prefix}_' 53 | for name, value in log_dict.items(): 54 | if isinstance(value, dict): 55 | self.log_scalar_dict(value, self._global_step, 56 | prefix=f'{prefix}{name}') 57 | else: 58 | self._logger.add_scalar(f'{prefix}{name}', value, 59 | self._global_step) 60 | 61 | def __getattr__(self, name): 62 | if self._logger is None: 63 | return self.noop 64 | return self._logger.__getattribute__(name) 65 | 66 | 67 | TB_LOGGER = TensorboardLogger() 68 | 69 | 70 | class RunningMeter(object): 71 | """ running meteor of a scalar value 72 | (useful for monitoring training loss) 73 | """ 74 | def __init__(self, name, val=None, smooth=0.99): 75 | self._name = name 76 | self._sm = smooth 77 | self._val = val 78 | 79 | def __call__(self, value): 80 | self._val = (value if self._val is None 81 | else value*(1-self._sm) + self._val*self._sm) 82 | 83 | def __str__(self): 84 | return f'{self._name}: {self._val:.4f}' 85 | 86 | @property 87 | def val(self): 88 | return self._val 89 | 90 | @property 91 | def name(self): 92 | return self._name 93 | -------------------------------------------------------------------------------- /LF-VILA/src/models/lfvila_video_classification.py: -------------------------------------------------------------------------------- 1 | from locale import LC_NUMERIC 2 | from src.models.bert import ( 3 | BertConfig, BertModel, BertOnlyMLMHead, BertOnlyNSPHead, BertForMaskedLM) 4 | from src.models.video_encoder import SwinTransformer3D 5 | from src.models.text_encoder import TextEncoderForPretraining 6 | import torch 7 | import torch.nn.functional as F 8 | from torch import nn 9 | import numpy as np 10 | import random 11 | import einops 12 | from src.utils.logger import LOGGER 13 | 
from timm.models.vision_transformer import Block 14 | 15 | 16 | class LFVILA_Video_Classification(nn.Module): 17 | def __init__(self, args, config): 18 | super().__init__() 19 | 20 | self.cfg = config 21 | self.video_encoder = SwinTransformer3D(**config.VideoEncoder) 22 | bert_config = BertConfig.from_json_file(config.bert_config) 23 | 24 | self.video_downsample = nn.MaxPool2d((2,3), stride=(1,1)) 25 | 26 | self.video_global_proj = nn.Linear(bert_config.hidden_size, bert_config.hidden_size) 27 | self.video_frame_proj = nn.Linear(bert_config.hidden_size, bert_config.hidden_size) 28 | 29 | self.classifier = nn.Linear(bert_config.hidden_size, self.cfg.DATA.classification_labels) 30 | 31 | 32 | def downsample_video_embd(self, video_embd): 33 | B, N, H, W, C = video_embd.size() # B, N, H, W, C 34 | video_embd = video_embd.permute(0,1,4,2,3) 35 | video_embd = self.video_downsample(video_embd.view(B*N, C, H, W)) 36 | video_embd = video_embd.permute(0,2,3,1) # B*N, H, W, C 37 | video_embd = video_embd.view(B, N, video_embd.size(-3), video_embd.size(-2),video_embd.size(-1)) 38 | video_embd = video_embd.flatten(2,3) # B, N, X, C 39 | 40 | video_feat = video_embd.mean(dim=[1, 2]) 41 | video_frame_feat = video_embd.mean(dim=2) 42 | 43 | return video_feat, video_frame_feat 44 | 45 | 46 | def forward(self, video_frames, labels = None): 47 | B, C, N, H, W = video_frames.size() 48 | video_global_embd, _ = self.video_encoder(video_frames) # B, N, H, W, C 49 | video_global_feat, video_frame_feat = self.downsample_video_embd(video_global_embd) 50 | 51 | video_global_feat = F.normalize(self.video_global_proj(video_global_feat),dim=-1) 52 | 53 | video_frame_feat = F.normalize(self.video_frame_proj(video_frame_feat),dim=-1) 54 | 55 | 56 | logits = self.classifier(video_global_feat) 57 | 58 | loss_fct = nn.CrossEntropyLoss() 59 | loss = loss_fct(logits, labels) 60 | 61 | acc = logits.max(dim=-1)[1] == labels 62 | acc = acc.float().mean(dim=0, keepdim=True) 63 | 64 | return dict(video_global_feat = video_global_feat, 65 | video_frame_feat = video_frame_feat, 66 | prediction = logits, 67 | loss = loss, 68 | acc = acc) 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /hd-vila/src/configs/tgif_frame_qa.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "tgif_qa", 5 | "txt": { 6 | "action": "data/tgif_qa/action_train.jsonl", 7 | "transition": "data/tgif_qa/transition_train.jsonl", 8 | "frameqa": "data/tgif_qa/frameqa_train.jsonl" 9 | }, 10 | "vis": "data/tgif_qa/videos_mp4" 11 | } 12 | ], 13 | "val_datasets": [ 14 | { 15 | "name": "tgif_qa", 16 | "txt": { 17 | "action": "data/tgif_qa/action_val.jsonl", 18 | "transition": "data/tgif_qa/transition_val.jsonl", 19 | "frameqa": "data/tgif_qa/frameqa_val.jsonl" 20 | }, 21 | "vis": "data/tgif_qa/videos_mp4" 22 | } 23 | ], 24 | "ans2label_path": "data/tgif_qa/frameqa_trainval_ans2label.json", 25 | "max_txt_len": 30, 26 | "max_img_size": 192, 27 | "sample_rate": 4, 28 | "reshape_size": [180, 288], 29 | "crop_size": [160, 256], 30 | "pad_value": 1, 31 | "img_pixel_mean": [123.675, 116.28, 103.53], 32 | "img_pixel_std": [1.0, 1.0, 1.0], 33 | "img_input_format": "BGR", 34 | "fps": 2, 35 | "num_frm": 7, 36 | "train_n_clips": 1, 37 | "max_n_example_per_group": 1, 38 | "model_config": "src/configs/base_model_large.json", 39 | "e2e_weights_path": "data/pretrained/hdvila_stage2.pt", 40 | "mmdetection_weights_path": 
"data/pretrained/res50_mmdetection.pth", 41 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 42 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 43 | "output_dir": "data/output/videoqa/tgif_qa_frame", 44 | "train_batch_size": 14, 45 | "val_batch_size": 14, 46 | "gradient_accumulation_steps": 4, 47 | "num_train_epochs": 40, 48 | "min_valid_steps": 1, 49 | "num_valid": 40, 50 | "save_steps_ratio": 0.2, 51 | "learning_rate": 4e-5, 52 | "weight_decay": 0.3, 53 | "decay": "multi_step", 54 | "step_decay_epochs":[10,15,20,25,30,35], 55 | "optim": "adamw", 56 | "betas": [0.9, 0.98], 57 | "dropout": 0.1, 58 | "grad_norm": 5.0, 59 | "cnn_learning_rate": 4e-5, 60 | "cnn_weight_decay": 0.3, 61 | "cnn_lr_decay": "multi_step", 62 | "cnn_step_decay_epochs":[10,15,20,25,30,35], 63 | "align_learning_rate": 4e-5, 64 | "align_weight_decay": 0.3, 65 | "seed": 66, 66 | "fp16": 1, 67 | "classifier": "mlp", 68 | "cls_hidden_scale": 2, 69 | "task": "frameqa", 70 | "n_workers": 4, 71 | 72 | "resnet_depth": 50, 73 | "resnet_frozen_stage": -1, 74 | "timesformer_depth": 4, 75 | "timesformer_heads": 16, 76 | "backbone_channels": [256, 512, 1024, 2048], 77 | "backbone_downsample": [4, 8, 16, 32], 78 | "backbone_channel_in_size": 2048, 79 | "hidden_size": 1024, 80 | 81 | 82 | "inference_model_step": 0, 83 | "inference_txt_db": "data/tgif_qa/frameqa_test.jsonl", 84 | "inference_img_db": "data/tgif_qa/videos_mp4", 85 | "inference_batch_size": 4, 86 | "inference_n_clips": 8 87 | } -------------------------------------------------------------------------------- /hd-vila/src/configs/tgif_action_qa.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "tgif_qa", 5 | "txt": { 6 | "action": "data/tgif_qa/action_train.jsonl", 7 | "transition": "data/tgif_qa/transition_train.jsonl", 8 | "frameqa": "data/tgif_qa/frameqa_train.jsonl" 9 | }, 10 | "vis": "data/tgif_qa/videos_mp4" 11 | } 12 | ], 13 | "val_datasets": [ 14 | { 15 | "name": "tgif_qa", 16 | "txt": { 17 | "action": "data/tgif_qa/action_val.jsonl", 18 | "transition": "data/tgif_qa/transition_val.jsonl", 19 | "frameqa": "data/tgif_qa/frameqa_val.jsonl" 20 | }, 21 | "vis": "data/tgif_qa/videos_mp4" 22 | } 23 | ], 24 | "ans2label_path": "data/tgif_qa/frameqa_trainval_ans2label.json", 25 | "max_txt_len": 30, 26 | "max_img_size": 192, 27 | "sample_rate": 4, 28 | "reshape_size": [180, 288], 29 | "crop_size": [160, 256], 30 | "pad_value": 1, 31 | "img_pixel_mean": [123.675, 116.28, 103.53], 32 | "img_pixel_std": [1.0, 1.0, 1.0], 33 | "img_input_format": "BGR", 34 | "fps": 2, 35 | "num_frm": 7, 36 | "train_n_clips": 1, 37 | "max_n_example_per_group": 1, 38 | "model_config": "src/configs/base_model_large.json", 39 | "e2e_weights_path": "data/pretrained/hdvila_stage2.pt", 40 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 41 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 42 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 43 | "output_dir": "data/output/videoqa/tgif_qa_action", 44 | "train_batch_size": 12, 45 | "val_batch_size": 12, 46 | "gradient_accumulation_steps": 4, 47 | "num_train_epochs": 80, 48 | "min_valid_steps": 1, 49 | "num_valid": 80, 50 | "save_steps_ratio": 0.2, 51 | "learning_rate": 5e-5, 52 | "weight_decay": 1e-1, 53 | "decay": "multi_step", 54 | "step_decay_epochs":[10,20,30,40,50,60,70], 55 | "optim": "adamw", 56 | "betas": [0.9, 0.98], 57 | "dropout": 0.1, 58 | "grad_norm": 5.0, 59 | 
"cnn_learning_rate": 5e-5, 60 | "cnn_weight_decay": 1e-1, 61 | "cnn_lr_decay": "multi_step", 62 | "cnn_step_decay_epochs":[10,20,30,40,50,60,70], 63 | "align_learning_rate": 5e-5, 64 | "align_weight_decay": 1e-1, 65 | "seed": 66, 66 | "fp16": 1, 67 | "classifier": "mlp", 68 | "cls_hidden_scale": 2, 69 | "task": "action", 70 | "n_workers": 4, 71 | 72 | "resnet_depth": 50, 73 | "resnet_frozen_stage": -1, 74 | "timesformer_depth": 4, 75 | "timesformer_heads": 16, 76 | "backbone_channels": [256, 512, 1024, 2048], 77 | "backbone_downsample": [4, 8, 16, 32], 78 | "backbone_channel_in_size": 2048, 79 | "hidden_size": 1024, 80 | 81 | 82 | "inference_model_step": 0, 83 | "inference_txt_db": "data/tgif_qa/action_test.jsonl", 84 | "inference_img_db": "data/tgif_qa/videos_mp4", 85 | "inference_batch_size": 4, 86 | "inference_n_clips": 8 87 | } -------------------------------------------------------------------------------- /hd-vila/src/configs/tgif_transition_qa.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "tgif_qa", 5 | "txt": { 6 | "action": "data/tgif_qa/action_train.jsonl", 7 | "transition": "data/tgif_qa/transition_train.jsonl", 8 | "frameqa": "data/tgif_qa/frameqa_train.jsonl" 9 | }, 10 | "vis": "data/tgif_qa/videos_mp4" 11 | } 12 | ], 13 | "val_datasets": [ 14 | { 15 | "name": "tgif_qa", 16 | "txt": { 17 | "action": "data/tgif_qa/action_val.jsonl", 18 | "transition": "data/tgif_qa/transition_val.jsonl", 19 | "frameqa": "data/tgif_qa/frameqa_val.jsonl" 20 | }, 21 | "vis": "data/tgif_qa/videos_mp4" 22 | } 23 | ], 24 | "ans2label_path": "data/tgif_qa/frameqa_trainval_ans2label.json", 25 | "max_txt_len": 30, 26 | "max_img_size": 192, 27 | "sample_rate": 4, 28 | "reshape_size": [180, 288], 29 | "crop_size": [160, 256], 30 | "pad_value": 1, 31 | "img_pixel_mean": [123.675, 116.28, 103.53], 32 | "img_pixel_std": [1.0, 1.0, 1.0], 33 | "img_input_format": "BGR", 34 | "fps": 2, 35 | "num_frm": 7, 36 | "train_n_clips": 1, 37 | "max_n_example_per_group": 1, 38 | "model_config": "src/configs/base_model_large.json", 39 | "e2e_weights_path": "data/pretrained/hdvila_stage2.pt", 40 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 41 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 42 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 43 | "output_dir": "data/output/videoqa/tgif_qa_transition/", 44 | "train_batch_size": 12, 45 | "val_batch_size": 12, 46 | "gradient_accumulation_steps": 4, 47 | "num_train_epochs": 80, 48 | "min_valid_steps": 1, 49 | "num_valid": 80, 50 | "save_steps_ratio": 0.2, 51 | "learning_rate": 5e-5, 52 | "weight_decay": 1e-1, 53 | "decay": "multi_step", 54 | "step_decay_epochs":[10,20,30,40,50,60,70], 55 | "optim": "adamw", 56 | "betas": [0.9, 0.98], 57 | "dropout": 0.1, 58 | "grad_norm": 5.0, 59 | "cnn_learning_rate": 5e-5, 60 | "cnn_weight_decay": 1e-1, 61 | "cnn_lr_decay": "multi_step", 62 | "cnn_step_decay_epochs":[10,20,30,40,50,60,70], 63 | "align_learning_rate": 5e-5, 64 | "align_weight_decay": 1e-1, 65 | "seed": 66, 66 | "fp16": 1, 67 | "classifier": "mlp", 68 | "cls_hidden_scale": 2, 69 | "task": "transition", 70 | "n_workers": 4, 71 | 72 | "resnet_depth": 50, 73 | "resnet_frozen_stage": -1, 74 | "timesformer_depth": 4, 75 | "timesformer_heads": 16, 76 | "backbone_channels": [256, 512, 1024, 2048], 77 | "backbone_downsample": [4, 8, 16, 32], 78 | "backbone_channel_in_size": 2048, 79 | "hidden_size": 1024, 80 | 81 | 82 | 
"inference_model_step": 0, 83 | "inference_txt_db": "data/tgif_qa/transition_test.jsonl", 84 | "inference_img_db": "data/tgif_qa/videos_mp4", 85 | "inference_batch_size": 4, 86 | "inference_n_clips": 8 87 | } -------------------------------------------------------------------------------- /CLIP-ViP/src/optimization/sched.py: -------------------------------------------------------------------------------- 1 | """ 2 | optimizer learning rate scheduling helpers 3 | """ 4 | import math 5 | from math import ceil 6 | from collections import Counter 7 | 8 | 9 | def noam_schedule(step, warmup_step=4000): 10 | if step <= warmup_step: 11 | return step / warmup_step 12 | return (warmup_step ** 0.5) * (step ** -0.5) 13 | 14 | 15 | def warmup_linear(step, warmup_step, tot_step): 16 | if step < warmup_step: 17 | return step / warmup_step 18 | return max(0, (tot_step-step)/(tot_step-warmup_step)) 19 | 20 | def warmup_cosine(step, warmup_step, tot_step): 21 | if step < warmup_step: 22 | return step / warmup_step 23 | progress = (step - warmup_step) / (tot_step - warmup_step) 24 | return 0.5 * (1.0 + math.cos(math.pi * progress)) 25 | 26 | def multi_step_schedule(n_epoch, milestones, step, warmup_step,gamma=0.5): 27 | if step <= warmup_step: 28 | return step / warmup_step 29 | 30 | milestones = list(sorted(milestones)) 31 | for i, m in enumerate(milestones): 32 | if n_epoch < m: 33 | return gamma**i 34 | return gamma**(len(milestones)+1) 35 | 36 | class AutoStep(): 37 | def __init__(self, tolerance, gamma): 38 | self.tolerance = tolerance 39 | self.coeff_mem = 1 40 | self.gamma = gamma 41 | self.best_score = 0. 42 | self.count = 0 43 | 44 | def step(self, score): 45 | if score <= self.best_score: 46 | self.count += 1 47 | else: 48 | self.count = 0 49 | self.best_score = score 50 | if self.count > self.tolerance: 51 | self.count = 0 52 | self.coeff_mem = self.coeff_mem * self.gamma 53 | 54 | def get_lr(self, global_step, learning_rate, num_train_steps, warmup_ratio=0.1): 55 | warmup_steps = int(warmup_ratio * num_train_steps) 56 | if global_step <= warmup_steps: 57 | return learning_rate * global_step / warmup_steps 58 | 59 | return max(self.coeff_mem * learning_rate, 1e-8) 60 | 61 | 62 | def get_lr_sched(global_step, decay, learning_rate, 63 | num_train_steps, warmup_ratio=0.1, 64 | decay_epochs=[], multi_step_epoch=-1): 65 | warmup_steps = int(warmup_ratio*num_train_steps) 66 | if decay == 'linear': 67 | lr_this_step = learning_rate * warmup_linear( 68 | global_step, warmup_steps, num_train_steps) 69 | elif decay == 'cosine': 70 | lr_this_step = learning_rate * warmup_cosine( 71 | global_step, warmup_steps, num_train_steps) 72 | elif decay == 'invsqrt': 73 | lr_this_step = learning_rate * noam_schedule( 74 | global_step, warmup_steps) 75 | elif decay == 'constant': 76 | lr_this_step = learning_rate 77 | elif decay == "multi_step": 78 | assert multi_step_epoch >= 0 79 | lr_this_step = learning_rate * multi_step_schedule( 80 | multi_step_epoch, decay_epochs, global_step, warmup_steps) 81 | if lr_this_step <= 0: 82 | # save guard for possible miscalculation of train steps 83 | lr_this_step = 1e-8 84 | return lr_this_step 85 | -------------------------------------------------------------------------------- /hd-vila/src/configs/lsmdc_retrieval.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "lsmdc-101k", 5 | "vis_format": "frame", 6 | "txt": "data/lsmdc_retrieval/train_101k_frame.jsonl", 7 | "vis": 
"data/lsmdc_retrieval/video_frames" 8 | }, 9 | "val_datasets": [ 10 | { 11 | "name": "lsmdc-1k", 12 | "vis_format": "frame", 13 | "txt": "data/lsmdc_retrieval/test_1k_frame.jsonl", 14 | "vis": "data/lsmdc_retrieval/video_frames" 15 | } 16 | ], 17 | "inference_datasets": [ 18 | { 19 | "name": "lsmdc-1k", 20 | "vis_format": "frame", 21 | "txt": "data/lsmdc_retrieval/test_1k_frame.jsonl", 22 | "vis": "data/lsmdc_retrieval/video_frames" 23 | } 24 | ], 25 | "img_pixel_mean": [123.675, 116.28, 103.53], 26 | "img_pixel_std": [58.395, 57.12, 57.375], 27 | "model_config": "src/configs/base_model_large.json", 28 | "e2e_weights_path": "data/pretrained/hdvila_stage2.pt", 29 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 30 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 31 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 32 | "output_dir": "data/output/retrieval/lsmdc_retrieval", 33 | "vis_steps":0, 34 | "warmup_ratio":0.01, 35 | "resnet_depth": 50, 36 | "resnet_frozen_stage": -1, 37 | "bert_frozen_stage": -1, 38 | "bert_mean":1, 39 | "timesformer_type": "new", 40 | "timesformer_depth": 4, 41 | "timesformer_heads": 16, 42 | "max_txt_len": 50, 43 | "score_agg_func": "lse", 44 | "loss_type": "ce", 45 | "train_n_clips": 2, 46 | "inference_n_clips": 4, 47 | "num_frm": 11, 48 | "sample_rate": 3, 49 | "crop_size": [160,256], 50 | "out_size": [256, 128, 64, 3], 51 | "train_batch_size": 8, 52 | "val_batch_size": 8, 53 | "max_n_example_per_group": 1, 54 | "gradient_accumulation_steps": 1, 55 | "num_train_epochs": 20, 56 | "min_valid_steps": 1, 57 | "num_valid": 20, 58 | "only_valid_steps": 500, 59 | "save_steps_ratio": 0.05, 60 | "learning_rate": 5e-6, 61 | "decay": "multi_step", 62 | "step_decay_epochs":[4,8,16], 63 | "cnn_step_decay_epochs":[4,8,16], 64 | "optim": "adamw", 65 | "betas": [0.9, 0.98], 66 | "dropout": 0.1, 67 | "weight_decay": 1e-3, 68 | "grad_norm": 5.0, 69 | "cnn_learning_rate": 5e-6, 70 | "cnn_weight_decay": 1e-3, 71 | "cnn_lr_decay": "multi_step", 72 | "align_learning_rate": 5e-6, 73 | "align_weight_decay": 1e-3, 74 | "generator_learning_rate": 5e-3, 75 | "generator_weight_decay": 0.0, 76 | "low_level_tasks": ["none"], 77 | "pixel_random_sampling_size": 160, 78 | "seed":24, 79 | "fp16": 1, 80 | "amp_level": "O2", 81 | "use_itm": 0, 82 | "use_itc": 1, 83 | "use_mlm": 0, 84 | 85 | "n_workers": 4, 86 | 87 | "hframe":1, 88 | "lframe":11, 89 | 90 | 91 | "backbone_channels": [256, 512, 1024, 2048], 92 | "backbone_downsample": [4, 8, 16, 32], 93 | "backbone_channel_in_size": 2048, 94 | "hidden_size": 1024, 95 | 96 | "temp": 0.08, 97 | "loss_config":{ 98 | "loss_name":"NCEContrastiveLoss", 99 | "temp":0.08 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /hd-vila/src/configs/didemo_retrieval.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "didemo-train", 5 | "vis_format": "frame", 6 | "txt": "data/didemo_retrieval/train_frame.jsonl", 7 | "vis": "data/didemo_retrieval/video_frames" 8 | }, 9 | "val_datasets": [ 10 | { 11 | "name": "didemo-test", 12 | "vis_format": "frame", 13 | "txt": "data/didemo_retrieval/test_frame.jsonl", 14 | "vis": "data/didemo_retrieval/video_frames" 15 | } 16 | ], 17 | "inference_datasets": [ 18 | { 19 | "name": "didemo-test", 20 | "vis_format": "frame", 21 | "txt": "data/didemo_retrieval/test_frame.jsonl", 22 | "vis": "data/didemo_retrieval/video_frames" 23 | } 24 | ], 25 | 
"img_pixel_mean": [123.675, 116.28, 103.53], 26 | "img_pixel_std": [58.395, 57.12, 57.375], 27 | "model_config": "src/configs/base_model_large.json", 28 | "e2e_weights_path": "data/pretrained/hdvila_stage2.pt", 29 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 30 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 31 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 32 | "output_dir": "data/output/retrieval/didemo_retrieval", 33 | "vis_steps":0, 34 | "warmup_ratio":0.1, 35 | "resnet_depth": 50, 36 | "resnet_frozen_stage": -1, 37 | "bert_frozen_stage": -1, 38 | "bert_mean":1, 39 | "timesformer_type": "new", 40 | "timesformer_depth": 4, 41 | "timesformer_heads": 16, 42 | "max_txt_len": 50, 43 | "score_agg_func": "lse", 44 | "loss_type": "ce", 45 | "train_n_clips": 4, 46 | "inference_n_clips": 8, 47 | "num_frm": 11, 48 | "sample_rate": 2, 49 | "crop_size": [160,256], 50 | "out_size": [256, 128, 64, 3], 51 | "train_batch_size": 4, 52 | "val_batch_size": 8, 53 | "max_n_example_per_group": 1, 54 | "gradient_accumulation_steps": 1, 55 | "num_train_epochs": 100, 56 | "min_valid_steps": 1, 57 | "num_valid": 20, 58 | "only_valid_steps": 500, 59 | "save_steps_ratio": 0.05, 60 | "learning_rate": 5e-6, 61 | "decay": "multi_step", 62 | "step_decay_epochs":[8,16,32,64], 63 | "cnn_step_decay_epochs":[8,16,32,64], 64 | "optim": "adamw", 65 | "betas": [0.9, 0.98], 66 | "dropout": 0.1, 67 | "weight_decay": 1e-3, 68 | "grad_norm": 5.0, 69 | "cnn_learning_rate": 5e-6, 70 | "cnn_weight_decay": 1e-1, 71 | "cnn_lr_decay": "multi_step", 72 | "align_learning_rate": 5e-6, 73 | "align_weight_decay": 1e-1, 74 | "generator_learning_rate": 5e-3, 75 | "generator_weight_decay": 0.0, 76 | "low_level_tasks": ["none"], 77 | "pixel_random_sampling_size": 160, 78 | "seed":24, 79 | "fp16": 1, 80 | "amp_level": "O2", 81 | "use_itm": 0, 82 | "use_itc": 1, 83 | "use_mlm": 0, 84 | 85 | "n_workers": 4, 86 | 87 | "hframe":1, 88 | "lframe":11, 89 | 90 | 91 | "backbone_channels": [256, 512, 1024, 2048], 92 | "backbone_downsample": [4, 8, 16, 32], 93 | "backbone_channel_in_size": 2048, 94 | "hidden_size": 1024, 95 | 96 | "temp": 0.08, 97 | "loss_config":{ 98 | "loss_name":"NCEContrastiveLoss", 99 | "temp":0.08 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /hd-vila/src/configs/actnet_retrieval.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "actnet-train", 5 | "vis_format": "frame", 6 | "txt": "data/activitynet_retrieval/train.jsonl", 7 | "vis": "data/activitynet_retrieval/video_frames" 8 | }, 9 | "val_datasets": [ 10 | { 11 | "name": "actnet-test", 12 | "vis_format": "frame", 13 | "txt": "data/activitynet_retrieval/val1.jsonl", 14 | "vis": "data/activitynet_retrieval/video_frames" 15 | } 16 | ], 17 | "inference_datasets": [ 18 | { 19 | "name": "actnet-test", 20 | "vis_format": "frame", 21 | "txt": "data/activitynet_retrieval/val1.jsonl", 22 | "vis": "data/activitynet_retrieval/video_frames" 23 | } 24 | ], 25 | "img_pixel_mean": [123.675, 116.28, 103.53], 26 | "img_pixel_std": [58.395, 57.12, 57.375], 27 | "model_config": "src/configs/base_model_large.json", 28 | "e2e_weights_path": "data/pretrained/hdvila_stage2.pt", 29 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 30 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 31 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 32 | 
"output_dir": "data/output/retrieval/actnet_retrieval", 33 | "vis_steps":0, 34 | "warmup_ratio":0.1, 35 | "resnet_depth": 50, 36 | "resnet_frozen_stage": -1, 37 | "bert_frozen_stage": -1, 38 | "bert_mean":1, 39 | "timesformer_type": "new", 40 | "timesformer_depth": 4, 41 | "timesformer_heads": 16, 42 | "max_txt_len": 50, 43 | "score_agg_func": "lse", 44 | "loss_type": "ce", 45 | "train_n_clips": 4, 46 | "inference_n_clips": 8, 47 | "num_frm": 13, 48 | "sample_rate":4, 49 | "crop_size": [160,256], 50 | "out_size": [256, 128, 64, 3], 51 | "train_batch_size": 4, 52 | "val_batch_size": 8, 53 | "max_n_example_per_group": 1, 54 | "gradient_accumulation_steps": 1, 55 | "num_train_epochs": 100, 56 | "min_valid_steps": 1, 57 | "num_valid": 20, 58 | "only_valid_steps": 500, 59 | "save_steps_ratio": 0.05, 60 | "learning_rate": 5e-6, 61 | "decay": "multi_step", 62 | "step_decay_epochs":[8,16,32,64], 63 | "cnn_step_decay_epochs":[8,16,32,64], 64 | "optim": "adamw", 65 | "betas": [0.9, 0.98], 66 | "dropout": 0.1, 67 | "weight_decay": 1e-3, 68 | "grad_norm": 5.0, 69 | "cnn_learning_rate": 5e-6, 70 | "cnn_weight_decay": 1e-3, 71 | "cnn_lr_decay": "multi_step", 72 | "align_learning_rate": 5e-6, 73 | "align_weight_decay": 1e-3, 74 | "generator_learning_rate": 5e-3, 75 | "generator_weight_decay": 0.0, 76 | "low_level_tasks": ["none"], 77 | "pixel_random_sampling_size": 160, 78 | "seed":24, 79 | "fp16": 1, 80 | "amp_level": "O2", 81 | "use_itm": 0, 82 | "use_itc": 1, 83 | "use_mlm": 0, 84 | 85 | "n_workers": 4, 86 | 87 | "hframe":1, 88 | "lframe":11, 89 | 90 | 91 | "backbone_channels": [256, 512, 1024, 2048], 92 | "backbone_downsample": [4, 8, 16, 32], 93 | "backbone_channel_in_size": 2048, 94 | "hidden_size": 1024, 95 | 96 | "temp": 0.08, 97 | "loss_config":{ 98 | "loss_name":"NCEContrastiveLoss", 99 | "temp":0.08 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /hd-vila/src/configs/msrvtt_retrieval.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "msrvtt-9k", 5 | "vis_format": "video", 6 | "txt": "data/msrvtt_retrieval/train9k.jsonl", 7 | "vis": "data/msrvtt_retrieval/videos_6fps" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "msrvtt-1ka", 13 | "vis_format": "video", 14 | "txt": "data/msrvtt_retrieval/test1ka.jsonl", 15 | "vis": "data/msrvtt_retrieval/videos_6fps" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "msrvtt-1ka", 21 | "vis_format": "video", 22 | "txt": "data/msrvtt_retrieval/test1ka.jsonl", 23 | "vis": "data/msrvtt_retrieval/videos_6fps" 24 | } 25 | ], 26 | "img_pixel_mean": [123.675, 116.28, 103.53], 27 | "img_pixel_std": [58.395, 57.12, 57.375], 28 | "model_config": "src/configs/base_model_large.json", 29 | "e2e_weights_path": "data/pretrained/hdvila_stage2.pt", 30 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 31 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 32 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 33 | "output_dir": "data/output/retrieval/msrvtt_retrieval", 34 | "vis_steps":0, 35 | 36 | "resnet_depth": 50, 37 | "resnet_frozen_stage": -1, 38 | "bert_frozen_stage": -1, 39 | "bert_mean":1, 40 | "timesformer_depth": 4, 41 | "timesformer_heads": 16, 42 | "timesformer_type": "new", 43 | "max_txt_len": 50, 44 | "score_agg_func": "lse", 45 | "loss_type": "ce", 46 | "train_n_clips": 2, 47 | "inference_n_clips": 4, 48 | 49 | "crop_size": [160,256], 50 | "out_size": 
[256, 128, 64, 3], 51 | "train_batch_size": 8, 52 | "val_batch_size": 8, 53 | "max_n_example_per_group": 1, 54 | "gradient_accumulation_steps": 1, 55 | "num_train_epochs": 200, 56 | "min_valid_steps": 1, 57 | "num_valid": 10, 58 | "only_valid_steps": 100, 59 | "save_steps_ratio": 0.1, 60 | "learning_rate": 1e-5, 61 | "decay": "multi_step", 62 | "step_decay_epochs":[32, 64, 128, 256], 63 | "cnn_step_decay_epochs":[32, 64, 128, 256], 64 | "optim": "adamw", 65 | "betas": [0.9, 0.98], 66 | "dropout": 0.1, 67 | "weight_decay": 1e-4, 68 | "grad_norm": 5.0, 69 | "cnn_learning_rate": 1e-5, 70 | "cnn_weight_decay": 1e-4, 71 | "cnn_lr_decay": "multi_step", 72 | "align_learning_rate": 5e-6, 73 | "align_weight_decay": 1e-3, 74 | "generator_learning_rate": 5e-3, 75 | "generator_weight_decay": 0.0, 76 | "low_level_tasks": ["none"], 77 | "pixel_random_sampling_size": 160, 78 | "seed":24, 79 | "fp16": 1, 80 | "amp_level": "O2", 81 | "use_itm": 0, 82 | "use_itc": 1, 83 | "use_mlm": 0, 84 | 85 | "n_workers": 4, 86 | 87 | "pos_num":1, 88 | 89 | "backbone_channels": [256, 512, 1024, 2048], 90 | "backbone_downsample": [4, 8, 16, 32], 91 | "backbone_channel_in_size": 2048, 92 | "hidden_size": 1024, 93 | 94 | "hframe":1, 95 | "lframe":11, 96 | 97 | 98 | "num_frm": 7, 99 | "sample_rate": 4, 100 | "warmup_ratio":0.01, 101 | 102 | "temp": 0.1, 103 | "loss_config":{ 104 | "loss_name":"NCEContrastiveLoss", 105 | "temp":0.08 106 | } 107 | 108 | } 109 | -------------------------------------------------------------------------------- /hd-vila/scripts/process_raw_video/decode_frames.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | os.system('pip install Pillow') 4 | os.system('pip install decord') 5 | import jsonlines 6 | from tqdm import tqdm 7 | import time 8 | from PIL import Image 9 | import decord 10 | import multiprocessing 11 | from joblib import Parallel, delayed 12 | from glob import glob 13 | import numpy as np 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='decode frames') 17 | parser.add_argument('--workdir', default='/data',type=str, help='workdir') 18 | parser.add_argument('--inputfile', default='train.jsonl', type=str, help='inputfile') 19 | parser.add_argument("--outputfile",type=str, default="train_result.jsonl", help="outputfile") 20 | 21 | args = parser.parse_args() 22 | return args 23 | 24 | def check_dirs(dirs): 25 | if not os.path.exists(dirs): 26 | os.makedirs(dirs) 27 | 28 | 29 | def load_clip_text(args): 30 | p = os.path.join(args.workdir,'lsmdc/', args.inputfile) 31 | data = [] 32 | with open(p,'r') as f: 33 | for l in jsonlines.Reader(f): 34 | data.append(l) 35 | return data 36 | 37 | def extract_single_clip(clip_text): 38 | try: 39 | clip_id = clip_text['clip_id'] 40 | clip_path = os.path.join(args.workdir, 'lsmdc/videos/{}.avi'.format(clip_id)) 41 | if os.path.exists(clip_path): 42 | 43 | out_folder = os.path.join(os.path.join(args.workdir, 'lsmdc/video_frames',clip_id)) 44 | out_folder_lr = os.path.join(os.path.join(args.workdir, 'lsmdc/video_frames_lr',clip_id)) 45 | os.system('rm -rf {}'.format(out_folder)) 46 | 47 | check_dirs(out_folder) 48 | 49 | vr = decord.VideoReader(clip_path, ctx=decord.cpu(0)) 50 | fps = vr.get_avg_fps() 51 | sample_id = np.round(np.linspace(0, len(vr)-1, round(len(vr)/fps*6))).astype(int) 52 | if len(sample_id)<=20: 53 | sample_id = np.round(np.linspace(0, len(vr)-1, 20)).astype(int) 54 | 55 | 56 | for i in range(len(sample_id)): 57 | frame = 
vr[sample_id[i]].asnumpy() 58 | img = Image.fromarray(frame).convert("RGB") 59 | img.save(os.path.join(out_folder, clip_id.split('/')[-1]+'_{0:03d}.jpg'.format(i))) 60 | 61 | img = img.resize((288,180),Image.BICUBIC) 62 | img.save(os.path.join(out_folder_lr, clip_id.split('/')[-1]+'_{0:03d}.jpg'.format(i))) 63 | 64 | return {'clip_id':clip_id, 'num_frame':len(sample_id)} 65 | else: 66 | return None 67 | except: 68 | return None 69 | 70 | def main(args): 71 | 72 | clip_texts = load_clip_text(args) 73 | 74 | 75 | num_cores = multiprocessing.cpu_count() 76 | print(num_cores) 77 | results = Parallel(n_jobs=2)(delayed(extract_single_clip)(c) for c in tqdm(clip_texts)) 78 | results = [x for x in results if x is not None] 79 | 80 | print(len(results)) 81 | check_dirs(os.path.join(args.workdir,'lsmdc/decode_results')) 82 | save_path = os.path.join(args.workdir,'lsmdc/decode_results',args.outputfile) 83 | print(save_path) 84 | with jsonlines.open(save_path, 'w') as f: 85 | for i in tqdm(range(len(results))): 86 | f.write(results[i]) 87 | print('write done') 88 | 89 | 90 | if __name__ =='__main__': 91 | args = parse_args() 92 | 93 | print(args.workdir) 94 | main(args) 95 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/queryd_ret.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 1 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | 22 | WEIGHTS: 23 | model_weight: 'project/lfvila/pretrained/lfvila_stage1.bin' 24 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 25 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 26 | pretrained_2d: True 27 | 28 | DATA: 29 | BATCH_SIZE_per_gpu: 16 30 | NUM_WORKERS: 12 31 | PIN_MEMORY: True 32 | 33 | sample_frame: 32 34 | sample_clip: 4 35 | input_res: [192, 320] 36 | center_crop: 200 37 | 38 | 39 | DATASET_train: { 40 | 'name': 'RetrievalDataset-train', 41 | 'type': 'RetrievalDataset', 42 | 'metadata_dir': 'datasets/lfvila_data/task/querydret/train.jsonl', 43 | 'video_path': 'datasets/queryd/queryd_video' 44 | } 45 | 46 | DATASET_val: [{ 47 | 'name': 'RetrievalDataset-val', 48 | 'type': 'RetrievalDataset', 49 | 'metadata_dir': 'datasets/lfvila_data/task/querydret/test.jsonl', 50 | 'video_path': 'datasets/queryd/queryd_video' 51 | } 52 | ] 53 | 54 | 55 | TRAINING: 56 | save_feats: 0 57 | do_eval2: false 58 | EPOCHS: 20 59 | WARMUP_EPOCHS: 1 60 | WARMUP_LR: 0. 
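# LR_SCHEDULER.NAME selects the schedule built by build_scheduler in src/optimization/lr_scheduler.py:
# 'step' multiplies the LR by DECAY_RATE every DECAY_EPOCHS epochs, after a linear warmup of
# WARMUP_EPOCHS epochs starting from WARMUP_LR. An illustrative (not used here) cosine alternative
# would look like:
#   LR_SCHEDULER: { 'NAME': 'cosine' }
#   MIN_LR: 1.0e-8   # cosine annealing decays down to MIN_LR (CosineLRScheduler in build_scheduler)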
61 | LR_SCHEDULER: { 62 | 'NAME': 'step', 63 | 'DECAY_EPOCHS': 5, 64 | 'DECAY_RATE': 0.25 65 | } 66 | 67 | use_mlm: false 68 | 69 | ct_global_loss_weight: 1 70 | 71 | 72 | temp: 0.05 73 | weight_decay: 0.05 74 | save_dir: "project/lfvila/lfvila_save/querydret" 75 | checkpoint_step: 20000 76 | save_step: 10000 77 | print_step: 25 78 | eval_step: 25 79 | 80 | deepspeed_config: { 81 | "train_micro_batch_size_per_gpu": 16, 82 | "gradient_accumulation_steps": 1, 83 | "steps_per_print": 500, 84 | 85 | 86 | "zero_optimization": { 87 | "stage": 2, 88 | "allgather_partitions": true, 89 | "allgather_bucket_size": 5.0e+8, 90 | "overlap_comm": false, 91 | "reduce_scatter": true, 92 | "reduce_bucket_size": 5.0e+8, 93 | "contiguous_gradients" : false, 94 | "stage3_gather_fp16_weights_on_model_save": true 95 | }, 96 | 97 | "fp16": { 98 | "enabled": true, 99 | "loss_scale": 0, 100 | "loss_scale_window": 1000, 101 | "initial_scale_power": 32, 102 | "hysteresis": 2, 103 | "min_loss_scale": 1 104 | }, 105 | 106 | "optimizer": { 107 | "type": "AdamW", 108 | "params": { 109 | "lr": 5.0e-5, 110 | "betas": [0.9, 0.98], 111 | "eps": 1.0e-8, 112 | "weight_decay": 5.0e-2 113 | } 114 | }, 115 | 116 | 117 | "sparse_attention": { 118 | "mode": "fixed", 119 | "block": 32, 120 | "different_layout_per_head": true, 121 | "num_local_blocks": 16, 122 | "num_global_blocks": 1, 123 | "attention": "bidirectional", 124 | "horizontal_global_attention": true, 125 | "num_different_global_patterns": 4 126 | } 127 | } 128 | 129 | 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/violin_qa.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 2 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | final_num_patches: 6 22 | 23 | qa_type: 'classification' 24 | 25 | WEIGHTS: 26 | model_weight: 'project/lfvila/pretrained/lfvila_stage2.bin' 27 | stage1_model_weight: '' 28 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 29 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 30 | pretrained_2d: True 31 | 32 | DATA: 33 | BATCH_SIZE_per_gpu: 12 34 | NUM_WORKERS: 8 35 | PIN_MEMORY: True 36 | 37 | sample_frame: 32 38 | sample_clip: 4 39 | input_res: [192, 320] 40 | center_crop: 200 41 | 42 | max_num_subtitle: 4 43 | 44 | classification_labels: 2 45 | 46 | DATASET_train: { 47 | 'name': 'QADataset-train', 48 | 'type': 'ViolinDataset', 49 | 'metadata_dir': 'datasets/lfvila_data/task/violin/violin_train.jsonl', 50 | 'video_path': 'datasets/violin/violin_video' 51 | } 52 | 53 | DATASET_val: [{ 54 | 'name': 'QADataset-val', 55 | 'type': 'ViolinDataset', 56 | 'metadata_dir': 'datasets/lfvila_data/task/violin/violin_test.jsonl', 57 | 'video_path': 'datasets/violin/violin_video' 58 | }] 59 | 60 | 61 | TRAINING: 62 | EPOCHS: 100 63 | WARMUP_EPOCHS: 10 64 | WARMUP_LR: 0. 
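# With 'NAME': 'linear', build_scheduler in src/optimization/lr_scheduler.py uses LinearLRScheduler,
# which decays the base LR linearly to lr_min_rate (1%) of its initial value over the full run;
# DECAY_EPOCHS is only read by the 'step' schedule and has no effect here.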
65 | LR_SCHEDULER: { 66 | 'NAME': 'linear', 67 | 'DECAY_EPOCHS': 10, 68 | } 69 | 70 | 71 | use_mlm: false 72 | 73 | weight_decay: 0.1 74 | 75 | save_dir: "project/lfvila/lfvila_save/violin" 76 | checkpoint_step: 10000 77 | save_step: 5000 78 | print_step: 100 79 | eval_step: 500 80 | 81 | deepspeed_config: { 82 | "train_micro_batch_size_per_gpu": 12, 83 | "gradient_accumulation_steps": 1, 84 | "steps_per_print": 500, 85 | 86 | 87 | "zero_optimization": { 88 | "stage": 2, 89 | "allgather_partitions": true, 90 | "allgather_bucket_size": 5.0e+8, 91 | "overlap_comm": false, 92 | "reduce_scatter": true, 93 | "reduce_bucket_size": 5.0e+8, 94 | "contiguous_gradients" : false, 95 | "stage3_gather_fp16_weights_on_model_save": true 96 | }, 97 | 98 | "fp16": { 99 | "enabled": true, 100 | "loss_scale": 0, 101 | "loss_scale_window": 1000, 102 | "initial_scale_power": 32, 103 | "hysteresis": 2, 104 | "min_loss_scale": 1 105 | }, 106 | 107 | "optimizer": { 108 | "type": "AdamW", 109 | "params": { 110 | "lr": 5.0e-5, 111 | "betas": [0.9, 0.98], 112 | "eps": 1.0e-8, 113 | "weight_decay": 5.0e-2 114 | } 115 | }, 116 | 117 | 118 | "sparse_attention": { 119 | "mode": "fixed", 120 | "block": 32, 121 | "different_layout_per_head": true, 122 | "num_local_blocks": 16, 123 | "num_global_blocks": 1, 124 | "attention": "bidirectional", 125 | "horizontal_global_attention": true, 126 | "num_different_global_patterns": 4 127 | } 128 | } 129 | 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/actnet_qa.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 2 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | final_num_patches: 6 22 | 23 | 24 | qa_type: 'classification' 25 | 26 | WEIGHTS: 27 | model_weight: 'project/lfvila/pretrained/lfvila_stage2.bin' 28 | stage1_model_weight: '' 29 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 30 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 31 | pretrained_2d: True 32 | 33 | DATA: 34 | BATCH_SIZE_per_gpu: 16 35 | NUM_WORKERS: 4 36 | PIN_MEMORY: True 37 | 38 | sample_frame: 32 39 | sample_clip: 4 40 | input_res: [192, 320] 41 | center_crop: 200 42 | 43 | 44 | classification_labels: 1654 45 | 46 | DATASET_train: { 47 | 'name': 'QADataset-train', 48 | 'type': 'ActnetQADataset', 49 | 'metadata_dir': 'datasets/lfvila_data/task/actnet_qa/train.jsonl', 50 | 'video_path': 'datasets/activitynet/actnet_video' 51 | } 52 | 53 | DATASET_val: [{ 54 | 'name': 'QADataset-val', 55 | 'type': 'ActnetQADataset', 56 | 'metadata_dir': 'datasets/lfvila_data/task/actnet_qa/test.jsonl', 57 | 'video_path': 'datasets/activitynet/actnet_video' 58 | }] 59 | 60 | 61 | TRAINING: 62 | EPOCHS: 100 63 | WARMUP_EPOCHS: 10 64 | WARMUP_LR: 0. 
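# Cosine schedule: after the 10-epoch warmup from WARMUP_LR, the LR is annealed from its base value
# down to MIN_LR below over the remaining epochs of the 100-epoch run (CosineLRScheduler in
# src/optimization/lr_scheduler.py, with t_in_epochs=False, i.e. per-step updates).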
65 | MIN_LR: 1.0e-8 66 | LR_SCHEDULER: { 67 | 'NAME': 'cosine', 68 | 'DECAY_EPOCHS': 10 69 | } 70 | 71 | use_mlm: false 72 | 73 | weight_decay: 0.1 74 | 75 | save_dir: "project/lfvila/lfvila_save/actnetqa" 76 | checkpoint_step: 10000 77 | save_step: 5000 78 | print_step: 100 79 | eval_step: 500 80 | 81 | deepspeed_config: { 82 | "train_micro_batch_size_per_gpu": 16, 83 | "gradient_accumulation_steps": 1, 84 | "steps_per_print": 500, 85 | 86 | 87 | "zero_optimization": { 88 | "stage": 2, 89 | "allgather_partitions": true, 90 | "allgather_bucket_size": 5.0e+8, 91 | "overlap_comm": false, 92 | "reduce_scatter": true, 93 | "reduce_bucket_size": 5.0e+8, 94 | "contiguous_gradients" : false, 95 | "stage3_gather_fp16_weights_on_model_save": true 96 | }, 97 | 98 | "fp16": { 99 | "enabled": true, 100 | "loss_scale": 0, 101 | "loss_scale_window": 1000, 102 | "initial_scale_power": 32, 103 | "hysteresis": 2, 104 | "min_loss_scale": 1 105 | }, 106 | 107 | "optimizer": { 108 | "type": "AdamW", 109 | "params": { 110 | "lr": 5.0e-5, 111 | "betas": [0.9, 0.98], 112 | "eps": 1.0e-8, 113 | "weight_decay": 5.0e-2 114 | } 115 | }, 116 | 117 | 118 | "sparse_attention": { 119 | "mode": "fixed", 120 | "block": 32, 121 | "different_layout_per_head": true, 122 | "num_local_blocks": 16, 123 | "num_global_blocks": 1, 124 | "attention": "bidirectional", 125 | "horizontal_global_attention": true, 126 | "num_different_global_patterns": 4 127 | } 128 | } 129 | 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /LF-VILA/src/optimization/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from timm.scheduler.cosine_lr import CosineLRScheduler 3 | from timm.scheduler.step_lr import StepLRScheduler 4 | from timm.scheduler.scheduler import Scheduler 5 | 6 | 7 | def build_scheduler(config, optimizer, n_iter_per_epoch): 8 | num_steps = int(config.TRAINING.EPOCHS * n_iter_per_epoch) 9 | warmup_steps = int(config.TRAINING.WARMUP_EPOCHS * n_iter_per_epoch) 10 | decay_steps = int(config.TRAINING.LR_SCHEDULER.DECAY_EPOCHS * n_iter_per_epoch) 11 | 12 | lr_scheduler = None 13 | if config.TRAINING.LR_SCHEDULER.NAME == 'cosine': 14 | lr_scheduler = CosineLRScheduler( 15 | optimizer, 16 | t_initial=num_steps, 17 | t_mul=1., 18 | lr_min=config.TRAINING.MIN_LR, 19 | warmup_lr_init=config.TRAINING.WARMUP_LR, 20 | warmup_t=warmup_steps, 21 | cycle_limit=1, 22 | t_in_epochs=False, 23 | ) 24 | elif config.TRAINING.LR_SCHEDULER.NAME == 'linear': 25 | lr_scheduler = LinearLRScheduler( 26 | optimizer, 27 | t_initial=num_steps, 28 | lr_min_rate=0.01, 29 | warmup_lr_init=config.TRAINING.WARMUP_LR, 30 | warmup_t=warmup_steps, 31 | t_in_epochs=False, 32 | ) 33 | elif config.TRAINING.LR_SCHEDULER.NAME == 'step': 34 | lr_scheduler = StepLRScheduler( 35 | optimizer, 36 | decay_t=decay_steps, 37 | decay_rate=config.TRAINING.LR_SCHEDULER.DECAY_RATE, 38 | warmup_lr_init=config.TRAINING.WARMUP_LR, 39 | warmup_t=warmup_steps, 40 | t_in_epochs=False, 41 | ) 42 | 43 | return lr_scheduler 44 | 45 | 46 | class LinearLRScheduler(Scheduler): 47 | def __init__(self, 48 | optimizer: torch.optim.Optimizer, 49 | t_initial: int, 50 | lr_min_rate: float, 51 | warmup_t=0, 52 | warmup_lr_init=0., 53 | t_in_epochs=True, 54 | noise_range_t=None, 55 | noise_pct=0.67, 56 | noise_std=1.0, 57 | noise_seed=42, 58 | initialize=True, 59 | ) -> None: 60 | super().__init__( 61 | optimizer, param_group_field="lr", 62 | noise_range_t=noise_range_t, 
noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed, 63 | initialize=initialize) 64 | 65 | self.t_initial = t_initial 66 | self.lr_min_rate = lr_min_rate 67 | self.warmup_t = warmup_t 68 | self.warmup_lr_init = warmup_lr_init 69 | self.t_in_epochs = t_in_epochs 70 | if self.warmup_t: 71 | self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values] 72 | super().update_groups(self.warmup_lr_init) 73 | else: 74 | self.warmup_steps = [1 for _ in self.base_values] 75 | 76 | def _get_lr(self, t): 77 | if t < self.warmup_t: 78 | lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps] 79 | else: 80 | t = t - self.warmup_t 81 | total_t = self.t_initial - self.warmup_t 82 | lrs = [v - ((v - v * self.lr_min_rate) * (t / total_t)) for v in self.base_values] 83 | return lrs 84 | 85 | def get_epoch_values(self, epoch: int): 86 | if self.t_in_epochs: 87 | return self._get_lr(epoch) 88 | else: 89 | return None 90 | 91 | def get_update_values(self, num_updates: int): 92 | if not self.t_in_epochs: 93 | return self._get_lr(num_updates) 94 | else: 95 | return None -------------------------------------------------------------------------------- /LF-VILA/src/configs/didemo_ret.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 1 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | 22 | WEIGHTS: 23 | model_weight: 'project/lfvila/pretrained/lfvila_stage1.bin' 24 | stage1_model_weight: '' 25 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 26 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 27 | pretrained_2d: True 28 | 29 | DATA: 30 | BATCH_SIZE_per_gpu: 16 31 | NUM_WORKERS: 12 32 | PIN_MEMORY: True 33 | 34 | sample_frame: 32 35 | sample_clip: 4 36 | input_res: [192, 320] 37 | center_crop: 200 38 | 39 | 40 | DATASET_train: { 41 | 'name': 'RetrievalDataset-train', 42 | 'type': 'RetrievalDataset', 43 | 'metadata_dir': 'datasets/lfvila_data/task/didemo/train.jsonl', 44 | 'video_path': 'datasets/didemo/didemo_video' 45 | } 46 | 47 | DATASET_val: [{ 48 | 'name': 'RetrievalDataset-val', 49 | 'type': 'RetrievalDataset', 50 | 'metadata_dir': 'datasets/lfvila_data/task/didemo/test.jsonl', 51 | 'video_path': 'datasets/didemo/didemo_video' 52 | } 53 | ] 54 | 55 | 56 | TRAINING: 57 | save_feats: 0 58 | do_eval2: false 59 | EPOCHS: 20 60 | WARMUP_EPOCHS: 1 61 | WARMUP_LR: 0. 
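# Step schedule: the LR is scaled by DECAY_RATE (0.25) every DECAY_EPOCHS (5) of the 20-epoch
# fine-tune, after a 1-epoch warmup. Note that DATA.BATCH_SIZE_per_gpu (16) mirrors
# train_micro_batch_size_per_gpu in the deepspeed_config block further down.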
62 | LR_SCHEDULER: { 63 | 'NAME': 'step', 64 | 'DECAY_EPOCHS': 5, 65 | 'DECAY_RATE': 0.25 66 | } 67 | 68 | use_mlm: false 69 | 70 | ct_global_loss_weight: 1 71 | 72 | temp: 0.05 73 | weight_decay: 0.05 74 | save_dir: "project/lfvila/lfvila_save/didemoret" 75 | checkpoint_step: 20000 76 | save_step: 10000 77 | print_step: 25 78 | eval_step: 25 79 | 80 | deepspeed_config: { 81 | "train_micro_batch_size_per_gpu": 16, 82 | "gradient_accumulation_steps": 1, 83 | "steps_per_print": 500, 84 | 85 | 86 | "zero_optimization": { 87 | "stage": 2, 88 | "allgather_partitions": true, 89 | "allgather_bucket_size": 5.0e+8, 90 | "overlap_comm": false, 91 | "reduce_scatter": true, 92 | "reduce_bucket_size": 5.0e+8, 93 | "contiguous_gradients" : false, 94 | "stage3_gather_fp16_weights_on_model_save": true 95 | }, 96 | 97 | "fp16": { 98 | "enabled": true, 99 | "loss_scale": 0, 100 | "loss_scale_window": 1000, 101 | "initial_scale_power": 32, 102 | "hysteresis": 2, 103 | "min_loss_scale": 1 104 | }, 105 | 106 | "optimizer": { 107 | "type": "AdamW", 108 | "params": { 109 | "lr": 5.0e-5, 110 | "betas": [0.9, 0.98], 111 | "eps": 1.0e-8, 112 | "weight_decay": 5.0e-2 113 | } 114 | }, 115 | 116 | 117 | "sparse_attention": { 118 | "mode": "fixed", 119 | "block": 32, 120 | "different_layout_per_head": true, 121 | "num_local_blocks": 16, 122 | "num_global_blocks": 1, 123 | "attention": "bidirectional", 124 | "horizontal_global_attention": true, 125 | "num_different_global_patterns": 4 126 | } 127 | } 128 | 129 | 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /LF-VILA/src/utils/dist.py: -------------------------------------------------------------------------------- 1 | import torch.distributed as dist 2 | import torch 3 | import math 4 | 5 | def master_process(args): 6 | return (dist.get_rank() == 0) or (args.distributed == False) 7 | 8 | @torch.no_grad() 9 | def concat_all_gather(tensor): 10 | """ 11 | Performs all_gather operation on the provided tensors. 12 | *** Warning ***: torch.distributed.all_gather has no gradient. 13 | """ 14 | tensors_gather = [torch.ones_like(tensor) 15 | for _ in range(torch.distributed.get_world_size())] 16 | dist.all_gather(tensors_gather, tensor, async_op=False) 17 | 18 | output = torch.cat(tensors_gather, dim=0) 19 | return output 20 | 21 | class SyncFunction(torch.autograd.Function): 22 | 23 | @staticmethod 24 | def forward(ctx, tensor): 25 | ctx.batch_size = tensor.shape[0] 26 | 27 | gathered_tensor = [torch.zeros_like(tensor) for _ in range(torch.distributed.get_world_size())] 28 | 29 | torch.distributed.all_gather(gathered_tensor, tensor) 30 | gathered_tensor = torch.cat(gathered_tensor, 0) 31 | 32 | return gathered_tensor 33 | 34 | @staticmethod 35 | def backward(ctx, grad_output): 36 | grad_input = grad_output.clone() 37 | torch.distributed.all_reduce(grad_input, op=torch.distributed.ReduceOp.SUM, async_op=False) 38 | 39 | idx_from = torch.distributed.get_rank() * ctx.batch_size 40 | idx_to = (torch.distributed.get_rank() + 1) * ctx.batch_size 41 | return grad_input[idx_from:idx_to] 42 | 43 | 44 | class SequentialDistributedSampler(torch.utils.data.sampler.Sampler): 45 | """ 46 | Distributed Sampler that subsamples indices sequentially, making it easier to collate all results at the end. 47 | 48 | Even though we only use this sampler for eval and predict (no training), which means that the model params won't 49 | have to be synced (i.e. 
will not hang for synchronization even if varied number of forward passes), we still add 50 | extra samples to the sampler to make it evenly divisible (like in `DistributedSampler`) to make it easy to `gather` 51 | or `reduce` resulting tensors at the end of the loop. 52 | """ 53 | 54 | def __init__(self, dataset, num_replicas=None, rank=None): 55 | if num_replicas is None: 56 | if not torch.distributed.is_available(): 57 | raise RuntimeError("Requires distributed package to be available") 58 | num_replicas = torch.distributed.get_world_size() 59 | if rank is None: 60 | if not torch.distributed.is_available(): 61 | raise RuntimeError("Requires distributed package to be available") 62 | rank = torch.distributed.get_rank() 63 | self.dataset = dataset 64 | self.num_replicas = num_replicas 65 | self.rank = rank 66 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 67 | self.total_size = self.num_samples * self.num_replicas 68 | 69 | def __iter__(self): 70 | indices = list(range(len(self.dataset))) 71 | 72 | # add extra samples to make it evenly divisible 73 | indices += indices[: (self.total_size - len(indices))] 74 | assert ( 75 | len(indices) == self.total_size 76 | ), f"Indices length {len(indices)} and total size {self.total_size} mismatched" 77 | 78 | # subsample 79 | indices = indices[self.rank * self.num_samples : (self.rank + 1) * self.num_samples] 80 | assert ( 81 | len(indices) == self.num_samples 82 | ), f"Indices length {len(indices)} and sample number {self.num_samples} mismatched" 83 | 84 | return iter(indices) 85 | 86 | def __len__(self): 87 | return self.num_samples -------------------------------------------------------------------------------- /LF-VILA/src/configs/cmovie_ret.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 1 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | 22 | 23 | WEIGHTS: 24 | model_weight: 'project/lfvila/pretrained/lfvila_stage1.bin' 25 | stage1_model_weight: '' 26 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 27 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 28 | pretrained_2d: True 29 | 30 | DATA: 31 | BATCH_SIZE_per_gpu: 16 32 | NUM_WORKERS: 12 33 | PIN_MEMORY: True 34 | 35 | sample_frame: 32 36 | sample_clip: 1 37 | input_res: [192, 320] 38 | center_crop: 200 39 | 40 | 41 | DATASET_train: { 42 | 'name': 'RetrievalDataset-train', 43 | 'type': 'RetrievalDataset', 44 | 'metadata_dir': 'datasets/lfvila_data/task/cmovie/train.jsonl', 45 | 'video_path': 'datasets/CondensedMovies/cmovie_video' 46 | } 47 | 48 | DATASET_val: [{ 49 | 'name': 'RetrievalDataset-val', 50 | 'type': 'RetrievalDataset', 51 | 'metadata_dir': 'datasets/lfvila_data/task/cmovie/val.jsonl', 52 | 'video_path': 'datasets/CondensedMovies/cmovie_video' 53 | } 54 | ] 55 | 56 | 57 | TRAINING: 58 | save_feats: 0 59 | do_eval2: false 60 | EPOCHS: 20 61 | WARMUP_EPOCHS: 1 62 | WARMUP_LR: 0. 
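# Aggressive step decay for this dataset: with DECAY_EPOCHS: 1 and DECAY_RATE: 0.5 the LR is halved
# every epoch after the 1-epoch warmup, i.e. roughly base_lr * 0.5^k after k epochs.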
63 | LR_SCHEDULER: { 64 | 'NAME': 'step', 65 | 'DECAY_EPOCHS': 1, 66 | 'DECAY_RATE': 0.5 67 | } 68 | 69 | use_mlm: false 70 | 71 | ct_global_loss_weight: 1 72 | 73 | temp: 0.05 74 | weight_decay: 0.05 75 | save_dir: "project/lfvila/lfvila_save/cmovieret" 76 | checkpoint_step: 20000 77 | save_step: 10000 78 | print_step: 100 79 | eval_step: 100 80 | 81 | deepspeed_config: { 82 | "train_micro_batch_size_per_gpu": 16, 83 | "gradient_accumulation_steps": 1, 84 | "steps_per_print": 500, 85 | 86 | 87 | "zero_optimization": { 88 | "stage": 2, 89 | "allgather_partitions": true, 90 | "allgather_bucket_size": 5.0e+8, 91 | "overlap_comm": false, 92 | "reduce_scatter": true, 93 | "reduce_bucket_size": 5.0e+8, 94 | "contiguous_gradients" : false, 95 | "stage3_gather_fp16_weights_on_model_save": true 96 | }, 97 | 98 | "fp16": { 99 | "enabled": true, 100 | "loss_scale": 0, 101 | "loss_scale_window": 1000, 102 | "initial_scale_power": 32, 103 | "hysteresis": 2, 104 | "min_loss_scale": 1 105 | }, 106 | 107 | "optimizer": { 108 | "type": "AdamW", 109 | "params": { 110 | "lr": 5.0e-5, 111 | "betas": [0.9, 0.98], 112 | "eps": 1.0e-8, 113 | "weight_decay": 5.0e-2 114 | } 115 | }, 116 | 117 | 118 | "sparse_attention": { 119 | "mode": "fixed", 120 | "block": 32, 121 | "different_layout_per_head": true, 122 | "num_local_blocks": 16, 123 | "num_global_blocks": 1, 124 | "attention": "bidirectional", 125 | "horizontal_global_attention": true, 126 | "num_different_global_patterns": 4 127 | } 128 | } 129 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/actnet_ret.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 1 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | 22 | WEIGHTS: 23 | model_weight: 'project/lfvila/pretrained/lfvila_stage1.bin' 24 | stage1_model_weight: '' 25 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 26 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 27 | pretrained_2d: True 28 | 29 | DATA: 30 | BATCH_SIZE_per_gpu: 16 31 | NUM_WORKERS: 6 32 | NUM_WORKERS: 6 33 | PIN_MEMORY: True 34 | 35 | sample_frame: 32 36 | sample_clip: 4 37 | input_res: [192, 320] 38 | center_crop: 200 39 | 40 | 41 | DATASET_train: { 42 | 'name': 'RetrievalDataset-train', 43 | 'type': 'RetrievalDataset', 44 | 'metadata_dir': 'datasets/lfvila_data/task/actnet/train.jsonl', 45 | 'video_path': 'datasets/activitynet/actnet_video' 46 | } 47 | 48 | DATASET_val: [{ 49 | 'name': 'RetrievalDataset-val', 50 | 'type': 'RetrievalDataset', 51 | 'metadata_dir': 'datasets/lfvila_data/task/actnet/val1.jsonl', 52 | 'video_path': 'datasets/activitynet/actnet_video' 53 | } 54 | ] 55 | 56 | 57 | TRAINING: 58 | save_feats: 0 59 | do_eval2: false 60 | EPOCHS: 20 61 | WARMUP_EPOCHS: 1 62 | WARMUP_LR: 0. 
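# Essentially the same stage-1 retrieval recipe as queryd_ret.yaml and didemo_ret.yaml
# (step decay x0.25 every 5 of 20 epochs after a 1-epoch warmup); mainly the dataset paths,
# save_dir and the print/eval cadence differ.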
63 | LR_SCHEDULER: { 64 | 'NAME': 'step', 65 | 'DECAY_EPOCHS': 5, 66 | 'DECAY_RATE': 0.25 67 | } 68 | 69 | use_mlm: false 70 | 71 | ct_global_loss_weight: 1 72 | 73 | temp: 0.05 74 | weight_decay: 0.05 75 | save_dir: "project/lfvila/lfvila_save/actnetret" 76 | checkpoint_step: 20000 77 | save_step: 10000 78 | print_step: 100 79 | eval_step: 100 80 | 81 | deepspeed_config: { 82 | "train_micro_batch_size_per_gpu": 16, 83 | "gradient_accumulation_steps": 1, 84 | "steps_per_print": 500, 85 | 86 | 87 | "zero_optimization": { 88 | "stage": 2, 89 | "allgather_partitions": true, 90 | "allgather_bucket_size": 5.0e+8, 91 | "overlap_comm": false, 92 | "reduce_scatter": true, 93 | "reduce_bucket_size": 5.0e+8, 94 | "contiguous_gradients" : false, 95 | "stage3_gather_fp16_weights_on_model_save": true 96 | }, 97 | 98 | "fp16": { 99 | "enabled": true, 100 | "loss_scale": 0, 101 | "loss_scale_window": 1000, 102 | "initial_scale_power": 32, 103 | "hysteresis": 2, 104 | "min_loss_scale": 1 105 | }, 106 | 107 | "optimizer": { 108 | "type": "AdamW", 109 | "params": { 110 | "lr": 5.0e-5, 111 | "betas": [0.9, 0.98], 112 | "eps": 1.0e-8, 113 | "weight_decay": 5.0e-2 114 | } 115 | }, 116 | 117 | 118 | "sparse_attention": { 119 | "mode": "fixed", 120 | "block": 32, 121 | "different_layout_per_head": true, 122 | "num_local_blocks": 16, 123 | "num_global_blocks": 1, 124 | "attention": "bidirectional", 125 | "horizontal_global_attention": true, 126 | "num_different_global_patterns": 4 127 | } 128 | } 129 | 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/coin_cls.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 1 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | 22 | 23 | WEIGHTS: 24 | model_weight: 'project/lfvila/pretrained/lfvila_stage1.bin' 25 | stage1_model_weight: '' 26 | bert_weight: '' 27 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 28 | pretrained_2d: True 29 | 30 | DATA: 31 | BATCH_SIZE_per_gpu: 16 32 | NUM_WORKERS: 12 33 | PIN_MEMORY: True 34 | 35 | sample_frame: 32 36 | sample_clip: 4 37 | input_res: [192, 320] 38 | center_crop: 200 39 | 40 | classification_labels: 180 41 | 42 | tokenizer_dir: 'project/lfvideo/pretrained/bert-large-uncased/' 43 | 44 | DATASET_train: { 45 | 'name': 'VideoClassificationDataset-train', 46 | 'type': 'VideoClassificationDataset', 47 | 'metadata_dir': 'datasets/lfvila_data/task/COIN/coin_train.jsonl', 48 | 'video_path': 'datasets/COIN/coin_video' 49 | } 50 | 51 | DATASET_val: [{ 52 | 'name': 'VideoClassificationDataset-val', 53 | 'type': 'VideoClassificationDataset', 54 | 'metadata_dir': 'datasets/lfvila_data/task/COIN/coin_test.jsonl', 55 | 'video_path': 'datasets/COIN/coin_video' 56 | } 57 | ] 58 | 59 | 60 | TRAINING: 61 | save_feats: 0 62 | only_val: 0 63 | EPOCHS: 500 64 | WARMUP_EPOCHS: 1 65 | WARMUP_LR: 0. 
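# VideoClassificationDataset (src/datasets/video_classification_dataset.py) reads each sample's
# integer class id from the 'recipe_type' field of the metadata jsonl, so every line is expected
# to look roughly like (hypothetical values):
#   {"video_id": "xxxx", "recipe_type": 17}
# with recipe_type in [0, classification_labels) for the 180 classes configured above.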
66 | LR_SCHEDULER: { 67 | 'NAME': 'linear', 68 | 'DECAY_EPOCHS': 10, 69 | } 70 | 71 | use_mlm: false 72 | 73 | 74 | temp: 0.05 75 | weight_decay: 0.05 76 | save_dir: "project/lfvideo/lfvideo_save/coin" 77 | checkpoint_step: 20000 78 | save_step: 10000 79 | print_step: 50 80 | eval_step: 200 81 | 82 | deepspeed_config: { 83 | "train_micro_batch_size_per_gpu": 16, 84 | "gradient_accumulation_steps": 1, 85 | "steps_per_print": 500, 86 | 87 | 88 | "zero_optimization": { 89 | "stage": 2, 90 | "allgather_partitions": true, 91 | "allgather_bucket_size": 5.0e+8, 92 | "overlap_comm": false, 93 | "reduce_scatter": true, 94 | "reduce_bucket_size": 5.0e+8, 95 | "contiguous_gradients" : false, 96 | "stage3_gather_fp16_weights_on_model_save": true 97 | }, 98 | 99 | "fp16": { 100 | "enabled": true, 101 | "loss_scale": 0, 102 | "loss_scale_window": 1000, 103 | "initial_scale_power": 32, 104 | "hysteresis": 2, 105 | "min_loss_scale": 1 106 | }, 107 | 108 | "optimizer": { 109 | "type": "AdamW", 110 | "params": { 111 | "lr": 5.0e-5, 112 | "betas": [0.9, 0.98], 113 | "eps": 1.0e-8, 114 | "weight_decay": 5.0e-2 115 | } 116 | }, 117 | 118 | 119 | "sparse_attention": { 120 | "mode": "fixed", 121 | "block": 32, 122 | "different_layout_per_head": true, 123 | "num_local_blocks": 16, 124 | "num_global_blocks": 1, 125 | "attention": "bidirectional", 126 | "horizontal_global_attention": true, 127 | "num_different_global_patterns": 4 128 | } 129 | } 130 | 131 | 132 | 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/lvu_scene_cls.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 1 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | 22 | 23 | WEIGHTS: 24 | model_weight: 'project/lfvila/saved_model/lfvila_stage1.bin' 25 | stage1_model_weight: '' 26 | bert_weight: '' 27 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 28 | pretrained_2d: True 29 | 30 | DATA: 31 | BATCH_SIZE_per_gpu: 16 32 | NUM_WORKERS: 12 33 | PIN_MEMORY: True 34 | 35 | sample_frame: 32 36 | sample_clip: 4 37 | input_res: [192, 320] 38 | center_crop: 200 39 | 40 | classification_labels: 6 41 | 42 | tokenizer_dir: 'project/lfvila/pretrained/bert-large-uncased/' 43 | 44 | DATASET_train: { 45 | 'name': 'VideoClassificationDataset-train', 46 | 'type': 'VideoClassificationDataset', 47 | 'metadata_dir': 'datasets/lfvila_data/task/LVU_movieclips/scene_train.jsonl', 48 | 'video_path': 'datasets/LVU_movieclips/lvu_movieclips_video' 49 | } 50 | 51 | DATASET_val: [{ 52 | 'name': 'VideoClassificationDataset-val', 53 | 'type': 'VideoClassificationDataset', 54 | 'metadata_dir': 'datasets/lfvila_data/task/LVU_movieclips/scene_test.jsonl', 55 | 'video_path': 'datasets/LVU_movieclips/lvu_movieclips_video' 56 | } 57 | ] 58 | 59 | 60 | TRAINING: 61 | save_feats: 0 62 | only_val: 0 63 | EPOCHS: 500 64 | WARMUP_EPOCHS: 1 65 | WARMUP_LR: 0. 
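# Judging from the flag name, only_val: 0 above runs normal training, while 1 would run evaluation
# only. With the linear schedule, the LR ends at 1% of its base value at the end of the 500-epoch
# budget (LinearLRScheduler, lr_min_rate=0.01).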
66 | LR_SCHEDULER: { 67 | 'NAME': 'linear', 68 | 'DECAY_EPOCHS': 10, 69 | } 70 | 71 | use_mlm: false 72 | 73 | temp: 0.05 74 | weight_decay: 0.05 75 | save_dir: "project/lfvila/lfvila_save/lvu_scene" 76 | checkpoint_step: 20000 77 | save_step: 10000 78 | print_step: 10 79 | eval_step: 50 80 | 81 | deepspeed_config: { 82 | "train_micro_batch_size_per_gpu": 16, 83 | "gradient_accumulation_steps": 1, 84 | "steps_per_print": 500, 85 | 86 | 87 | "zero_optimization": { 88 | "stage": 2, 89 | "allgather_partitions": true, 90 | "allgather_bucket_size": 5.0e+8, 91 | "overlap_comm": false, 92 | "reduce_scatter": true, 93 | "reduce_bucket_size": 5.0e+8, 94 | "contiguous_gradients" : false, 95 | "stage3_gather_fp16_weights_on_model_save": true 96 | }, 97 | 98 | "fp16": { 99 | "enabled": true, 100 | "loss_scale": 0, 101 | "loss_scale_window": 1000, 102 | "initial_scale_power": 32, 103 | "hysteresis": 2, 104 | "min_loss_scale": 1 105 | }, 106 | 107 | "optimizer": { 108 | "type": "AdamW", 109 | "params": { 110 | "lr": 5.0e-5, 111 | "betas": [0.9, 0.98], 112 | "eps": 1.0e-8, 113 | "weight_decay": 5.0e-2 114 | } 115 | }, 116 | 117 | 118 | "sparse_attention": { 119 | "mode": "fixed", 120 | "block": 32, 121 | "different_layout_per_head": true, 122 | "num_local_blocks": 16, 123 | "num_global_blocks": 1, 124 | "attention": "bidirectional", 125 | "horizontal_global_attention": true, 126 | "num_different_global_patterns": 4 127 | } 128 | } 129 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/lvu_relationship_cls.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 1 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | 22 | 23 | WEIGHTS: 24 | model_weight: 'project/lfvila/saved_model/lfvila_stage1.bin' 25 | stage1_model_weight: '' 26 | bert_weight: '' 27 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 28 | pretrained_2d: True 29 | 30 | DATA: 31 | BATCH_SIZE_per_gpu: 16 32 | NUM_WORKERS: 12 33 | PIN_MEMORY: True 34 | 35 | sample_frame: 32 36 | sample_clip: 4 37 | input_res: [192, 320] 38 | center_crop: 200 39 | 40 | classification_labels: 4 41 | 42 | tokenizer_dir: 'project/lfvila/pretrained/bert-large-uncased/' 43 | 44 | DATASET_train: { 45 | 'name': 'VideoClassificationDataset-train', 46 | 'type': 'VideoClassificationDataset', 47 | 'metadata_dir': 'datasets/lfvila_data/task/LVU_movieclips/relationship_train.jsonl', 48 | 'video_path': 'datasets/LVU_movieclips/lvu_movieclips_video' 49 | } 50 | 51 | DATASET_val: [{ 52 | 'name': 'VideoClassificationDataset-val', 53 | 'type': 'VideoClassificationDataset', 54 | 'metadata_dir': 'datasets/lfvila_data/task/LVU_movieclips/relationship_test.jsonl', 55 | 'video_path': 'datasets/LVU_movieclips/lvu_movieclips_video' 56 | } 57 | ] 58 | 59 | 60 | TRAINING: 61 | save_feats: 0 62 | only_val: 0 63 | EPOCHS: 500 64 | WARMUP_EPOCHS: 1 65 | WARMUP_LR: 0. 
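# Same LVU classification setup as lvu_scene_cls.yaml / lvu_wayspeaking_cls.yaml; the differences
# are classification_labels (4 relationship classes), the metadata/save paths, and the very
# frequent logging/eval cadence (print_step and eval_step of 5 below).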
66 | LR_SCHEDULER: { 67 | 'NAME': 'linear', 68 | 'DECAY_EPOCHS': 10, 69 | } 70 | 71 | use_mlm: false 72 | 73 | temp: 0.05 74 | weight_decay: 0.05 75 | save_dir: "project/lfvila/lfvila_save/lvu_relation" 76 | checkpoint_step: 20000 77 | save_step: 10000 78 | print_step: 5 79 | eval_step: 5 80 | 81 | deepspeed_config: { 82 | "train_micro_batch_size_per_gpu": 16, 83 | "gradient_accumulation_steps": 1, 84 | "steps_per_print": 500, 85 | 86 | 87 | "zero_optimization": { 88 | "stage": 2, 89 | "allgather_partitions": true, 90 | "allgather_bucket_size": 5.0e+8, 91 | "overlap_comm": false, 92 | "reduce_scatter": true, 93 | "reduce_bucket_size": 5.0e+8, 94 | "contiguous_gradients" : false, 95 | "stage3_gather_fp16_weights_on_model_save": true 96 | }, 97 | 98 | "fp16": { 99 | "enabled": true, 100 | "loss_scale": 0, 101 | "loss_scale_window": 1000, 102 | "initial_scale_power": 32, 103 | "hysteresis": 2, 104 | "min_loss_scale": 1 105 | }, 106 | 107 | "optimizer": { 108 | "type": "AdamW", 109 | "params": { 110 | "lr": 5.0e-5, 111 | "betas": [0.9, 0.98], 112 | "eps": 1.0e-8, 113 | "weight_decay": 5.0e-2 114 | } 115 | }, 116 | 117 | 118 | "sparse_attention": { 119 | "mode": "fixed", 120 | "block": 32, 121 | "different_layout_per_head": true, 122 | "num_local_blocks": 16, 123 | "num_global_blocks": 1, 124 | "attention": "bidirectional", 125 | "horizontal_global_attention": true, 126 | "num_different_global_patterns": 4 127 | } 128 | } 129 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/how2_qa.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | bert_config: "src/configs/bert_large_config.json" 15 | stage: 2 16 | type_vocab_size: 8 17 | num_local_layers: 8 18 | stage1_layers: 12 19 | bert_frozen_stage: -1 20 | final_num_patches: 6 21 | use_simple_merge_qas: false 22 | 23 | qa_type: 'multichoice' 24 | 25 | WEIGHTS: 26 | model_weight: 'project/lfvila/pretrained/lfvila_stage2.bin' 27 | stage1_model_weight: '' 28 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 29 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 30 | pretrained_2d: True 31 | 32 | subtitle_fuse: false 33 | 34 | DATA: 35 | BATCH_SIZE_per_gpu: 4 36 | NUM_WORKERS: 8 37 | PIN_MEMORY: True 38 | 39 | sample_frame: 32 40 | sample_clip: 4 41 | input_res: [192, 320] 42 | center_crop: 200 43 | 44 | use_subtitle: true 45 | max_num_subtitle: 6 46 | max_text_lenght: 50 47 | 48 | DATASET_train: { 49 | 'name': 'QADataset-train', 50 | 'type': 'How2QADataset', 51 | 'metadata_dir': 'datasets/lfvila_data/task/how2qa/how2qa_train.jsonl', 52 | 'video_path': 'datasets/how2qa/how2qa_video' 53 | } 54 | 55 | DATASET_val: [{ 56 | 'name': 'QADataset-val', 57 | 'type': 'How2QADataset', 58 | 'metadata_dir': 'datasets/lfvila_data/task/how2qa/how2qa_val.jsonl', 59 | 'video_path': 'datasets/how2qa/how2qa_video' 60 | }] 61 | 62 | 63 | TRAINING: 64 | EPOCHS: 30 65 | WARMUP_EPOCHS: 10 66 | WARMUP_LR: 0. 
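# Multiple-choice QA schedule: 30 epochs with a comparatively long 10-epoch warmup. The span keys
# below (use_span_loss / span_loss_weight) enable an auxiliary span loss with weight 1.0 alongside
# the QA objective.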
67 | LR_SCHEDULER: { 68 | 'NAME': 'linear', 69 | 'DECAY_EPOCHS': 10, 70 | } 71 | 72 | use_span_loss: true 73 | span_loss_weight: 1.0 74 | 75 | use_mlm: false 76 | 77 | weight_decay: 0.05 78 | 79 | save_dir: "project/lfvila/lfvila_save/how2qa" 80 | checkpoint_step: 10000 81 | save_step: 5000 82 | print_step: 100 83 | eval_step: 500 84 | 85 | deepspeed_config: { 86 | "train_micro_batch_size_per_gpu": 4, 87 | "gradient_accumulation_steps": 1, 88 | "steps_per_print": 500, 89 | 90 | 91 | "zero_optimization": { 92 | "stage": 2, 93 | "allgather_partitions": true, 94 | "allgather_bucket_size": 5.0e+8, 95 | "overlap_comm": false, 96 | "reduce_scatter": true, 97 | "reduce_bucket_size": 5.0e+8, 98 | "contiguous_gradients" : false, 99 | "stage3_gather_fp16_weights_on_model_save": true 100 | }, 101 | 102 | "fp16": { 103 | "enabled": true, 104 | "loss_scale": 0, 105 | "loss_scale_window": 1000, 106 | "initial_scale_power": 32, 107 | "hysteresis": 2, 108 | "min_loss_scale": 1 109 | }, 110 | 111 | "optimizer": { 112 | "type": "AdamW", 113 | "params": { 114 | "lr": 5.0e-5, 115 | "betas": [0.9, 0.98], 116 | "eps": 1.0e-8, 117 | "weight_decay": 5.0e-2 118 | } 119 | }, 120 | 121 | 122 | "sparse_attention": { 123 | "mode": "fixed", 124 | "block": 32, 125 | "different_layout_per_head": true, 126 | "num_local_blocks": 16, 127 | "num_global_blocks": 1, 128 | "attention": "bidirectional", 129 | "horizontal_global_attention": true, 130 | "num_different_global_patterns": 4 131 | } 132 | } 133 | 134 | 135 | 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/lvu_wayspeaking_cls.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 1 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | 22 | 23 | WEIGHTS: 24 | model_weight: 'project/lfvila/saved_model/lfvila_stage1.bin' 25 | stage1_model_weight: '' 26 | bert_weight: '' 27 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 28 | pretrained_2d: True 29 | 30 | DATA: 31 | BATCH_SIZE_per_gpu: 16 32 | NUM_WORKERS: 12 33 | PIN_MEMORY: True 34 | 35 | sample_frame: 32 36 | sample_clip: 4 37 | input_res: [192, 320] 38 | center_crop: 200 39 | 40 | classification_labels: 5 41 | 42 | tokenizer_dir: 'project/lfvila/pretrained/bert-large-uncased/' 43 | 44 | DATASET_train: { 45 | 'name': 'VideoClassificationDataset-train', 46 | 'type': 'VideoClassificationDataset', 47 | 'metadata_dir': 'datasets/lfvila_data/task/LVU_movieclips/wayspeaking_train.jsonl', 48 | 'video_path': 'datasets/LVU_movieclips/lvu_movieclips_video' 49 | } 50 | 51 | DATASET_val: [{ 52 | 'name': 'VideoClassificationDataset-val', 53 | 'type': 'VideoClassificationDataset', 54 | 'metadata_dir': 'datasets/lfvila_data/task/LVU_movieclips/wayspeaking_test.jsonl', 55 | 'video_path': 'datasets/LVU_movieclips/lvu_movieclips_video' 56 | } 57 | ] 58 | 59 | 60 | TRAINING: 61 | save_feats: 0 62 | only_val: 0 63 | EPOCHS: 500 64 | WARMUP_EPOCHS: 1 65 | WARMUP_LR: 0. 
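# Mixed precision: the deepspeed_config block below enables fp16 with dynamic loss scaling
# ("loss_scale": 0 means dynamic in DeepSpeed) and ZeRO stage 2, which partitions optimizer states
# and gradients across data-parallel ranks.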
66 | LR_SCHEDULER: { 67 | 'NAME': 'linear', 68 | 'DECAY_EPOCHS': 10, 69 | } 70 | 71 | use_mlm: false 72 | 73 | temp: 0.05 74 | weight_decay: 0.05 75 | save_dir: "project/lfvila/lfvila_save/lvu_wayspeaking" 76 | checkpoint_step: 20000 77 | save_step: 10000 78 | print_step: 10 79 | eval_step: 10 80 | 81 | deepspeed_config: { 82 | "train_micro_batch_size_per_gpu": 16, 83 | "gradient_accumulation_steps": 1, 84 | "steps_per_print": 500, 85 | 86 | 87 | "zero_optimization": { 88 | "stage": 2, 89 | "allgather_partitions": true, 90 | "allgather_bucket_size": 5.0e+8, 91 | "overlap_comm": false, 92 | "reduce_scatter": true, 93 | "reduce_bucket_size": 5.0e+8, 94 | "contiguous_gradients" : false, 95 | "stage3_gather_fp16_weights_on_model_save": true 96 | }, 97 | 98 | "fp16": { 99 | "enabled": true, 100 | "loss_scale": 0, 101 | "loss_scale_window": 1000, 102 | "initial_scale_power": 32, 103 | "hysteresis": 2, 104 | "min_loss_scale": 1 105 | }, 106 | 107 | "optimizer": { 108 | "type": "AdamW", 109 | "params": { 110 | "lr": 5.0e-5, 111 | "betas": [0.9, 0.98], 112 | "eps": 1.0e-8, 113 | "weight_decay": 5.0e-2 114 | } 115 | }, 116 | 117 | 118 | "sparse_attention": { 119 | "mode": "fixed", 120 | "block": 32, 121 | "different_layout_per_head": true, 122 | "num_local_blocks": 16, 123 | "num_global_blocks": 1, 124 | "attention": "bidirectional", 125 | "horizontal_global_attention": true, 126 | "num_different_global_patterns": 4 127 | } 128 | } 129 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /hd-vila/scripts/process_raw_video/compress_video.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import subprocess 4 | import time 5 | from multiprocessing import cpu_count 6 | import subprocess 7 | import multiprocessing 8 | from joblib import Parallel, delayed 9 | import jsonlines 10 | from tqdm import tqdm 11 | from multiprocessing import Pool 12 | from glob import glob 13 | from decord import VideoReader, cpu 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser(description='video processing') 18 | parser.add_argument('--workdir', default='/data',type=str, help='work dir') 19 | parser.add_argument('--inputdir', default='datasets/msrvtt/videos', type=str, help='inputdir') 20 | parser.add_argument('--outputdir', default='datasets/msrvtt/videos_6fps', type=str, help='outputdir') 21 | parser.add_argument('--vidfile', default='datasets/msrvtt/train.jsonl', type=str, help='video id') 22 | args = parser.parse_args() 23 | return args 24 | 25 | def check_dirs(dirs): 26 | if not os.path.exists(dirs): 27 | print(dirs) 28 | os.makedirs(dirs, exist_ok=True) 29 | 30 | 31 | class CompressVideo(): 32 | def __init__(self, vidfile, workdir, inputdir, outputdir): 33 | self.workdir = workdir 34 | self.vidfile = vidfile 35 | self.inputdir = inputdir 36 | self.outputdir = outputdir 37 | self.vids = self.loadvids() 38 | 39 | def loadvids(self): 40 | vids = [] 41 | with open(os.path.join(self.workdir,self.vidfile), 'r') as f: 42 | for l in jsonlines.Reader(f): 43 | vids.append(l) 44 | return vids 45 | 46 | def run(self, cmd): 47 | proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 48 | out, _ = proc.communicate() 49 | return out.decode('utf-8') 50 | 51 | def compress_single_clip(self,data): 52 | vid = data['clip_id'] 53 | 54 | input_video_path = os.path.join(self.workdir, self.inputdir, '{}.mp4'.format(vid)) 55 | 56 | vr = VideoReader(input_video_path, 
ctx=cpu(0)) 57 | time = len(vr) * vr.get_avg_fps() 58 | 59 | output_video_path = os.path.join(self.workdir,self.outputdir, vid+'.mp4') 60 | check_dirs(os.path.join(self.workdir,self.outputdir)) 61 | 62 | cmd = ['ffmpeg', 63 | '-y', # (optional) overwrite output file if it exists 64 | '-i', input_video_path, 65 | '-max_muxing_queue_size', '9999', 66 | '-r', '6', 67 | output_video_path] 68 | 69 | 70 | self.run(cmd) 71 | 72 | if os.path.isfile(output_video_path): 73 | return vid + '*' + str(len(vr)) 74 | else: 75 | return None 76 | 77 | 78 | def compress_clips(self): 79 | 80 | results = [] 81 | print('start process') 82 | for vid in tqdm(self.vids): 83 | result = self.compress_single_clip(vid) 84 | results.append(result) 85 | print(len(results)) 86 | 87 | 88 | def compress_clips_parallel(self): 89 | num_cores = multiprocessing.cpu_count() 90 | print(num_cores) 91 | print('start process') 92 | results = Parallel(n_jobs=20, backend = 'threading')(delayed(self.compress_single_clip)(v) for v in tqdm(self.vids)) 93 | 94 | results = [x for x in results if x is not None] 95 | 96 | print(len(results)) 97 | 98 | 99 | if __name__ == "__main__": 100 | 101 | args = parse_args() 102 | print(args) 103 | 104 | cpv = CompressVideo(args.vidfile, args.blob_mount_dir, args.inputdir, args.outputdir) 105 | cpv.compress_clips_parallel() 106 | -------------------------------------------------------------------------------- /LF-VILA/src/datasets/video_classification_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import jsonlines 4 | import decord 5 | import lmdb 6 | from decord import VideoReader, cpu 7 | import numpy as np 8 | import torch 9 | from torch.utils.data import Dataset 10 | from torch.utils.data.dataloader import default_collate 11 | from src.utils.logger import LOGGER 12 | 13 | decord.bridge.set_bridge("torch") 14 | 15 | class VideoClassificationDataset(Dataset): 16 | def __init__(self, 17 | cfg, 18 | metadata_dir, 19 | video_path, 20 | sample_frame, 21 | sample_clip, 22 | tokenizer, 23 | transform=None, 24 | return_index=False, 25 | is_train=True, 26 | **kwargs 27 | ): 28 | self.cfg = cfg 29 | self.metadata_dir = metadata_dir 30 | self.transform = transform 31 | self.video_path = video_path 32 | self.return_index = return_index 33 | self.reliable_idx_list = [] 34 | self.sample_frame = sample_frame 35 | 36 | self._load_metadata() 37 | self.is_train = is_train 38 | 39 | def _load_metadata(self): 40 | data = [] 41 | with open(self.metadata_dir) as f: 42 | for l in jsonlines.Reader(f): 43 | data.append(l) 44 | self.metadata = data 45 | 46 | def _read_video(self, video_id, sample_frame_num): 47 | ''' 48 | read frames from long video 49 | args: 50 | video_id: str, 51 | sample_frame_num: frames used 52 | return: 53 | img_arrays: [num_frm, 3, H, W] 54 | chunk_mask: [num_frm, n_clip], , mask for indicating frames belong to each clip 55 | 56 | ''' 57 | 58 | video_path = os.path.join(self.video_path, video_id + '.mp4') 59 | vr = VideoReader(video_path, ctx=cpu(0)) 60 | num_frame = len(vr) 61 | 62 | if self.is_train: 63 | interval = int(num_frame / (sample_frame_num - 1)) 64 | start = np.random.randint(0, interval+1) 65 | end = np.random.randint(num_frame-1-interval, num_frame) 66 | frame_idx = np.linspace(start, end, num=sample_frame_num).astype(int) 67 | else: 68 | frame_idx = np.linspace(0, num_frame-1, num=sample_frame_num).astype(int) 69 | 70 | img_arrays = vr.get_batch(frame_idx) 71 | 72 | img_arrays = img_arrays.float() / 255 
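        # get_batch returns a uint8 tensor of shape (num_frm, H, W, C) (decord's torch bridge is set
        # at module import); dividing by 255 scales pixels to [0, 1] before the permute below
        # rearranges the frames to (num_frm, C, H, W).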
73 | 74 | img_arrays = img_arrays.permute(0, 3, 1, 2) # N,C,H,W 75 | 76 | return img_arrays 77 | 78 | def __len__(self): 79 | return len(self.metadata) 80 | 81 | def __getitem__(self, index): 82 | num_retries = 10 83 | for j in range(num_retries): 84 | try: 85 | item = self.metadata[index] 86 | 87 | video_id = item['video_id'] 88 | 89 | video = self._read_video(video_id, self.sample_frame) 90 | 91 | 92 | label = int(item['recipe_type']) 93 | 94 | if self.transform is not None: 95 | video = self.transform(video) # N, C, H, W 96 | video = video.permute(1, 0, 2, 3) # C, N, H, W 97 | 98 | data = { 99 | 'video_frames': video, # C, N, H, W 100 | 'label': torch.tensor(label) 101 | } 102 | except: 103 | index = random.randint(0, len(self) - 1) 104 | continue 105 | else: 106 | break 107 | 108 | if self.return_index: 109 | data['index'] = torch.tensor(index) 110 | 111 | return data 112 | 113 | 114 | -------------------------------------------------------------------------------- /CLIP-ViP/src/utils/stop_words.py: -------------------------------------------------------------------------------- 1 | """List of stop words.""" 2 | # This list of English stop words is taken from the "Glasgow Information 3 | # Retrieval Group". The original list can be found at 4 | # http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words 5 | ENGLISH_STOP_WORDS = frozenset([ 6 | "a", "about", "above", "across", "actually", "after", "afterwards", "again", 7 | "against", "all", "almost", "alone", "along", "already", "also", "although", 8 | "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", 9 | "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", 10 | "are", "around", "as", "at", "back", "be", "became", "because", "become", 11 | "becomes", "becoming", "been", "before", "beforehand", "behind", "being", 12 | "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", 13 | "but", "by", "call", "can", "cannot", "cant", "can't", "co", "con", "could", 14 | "couldnt", "cry", "de", "describe", "detail", "do", "done", "don't", "down", 15 | "due", "during", "each", "easy", "eg", "eight", "either", "eleven", "else", 16 | "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", 17 | "everything", "everywhere", "except", "few", "fifteen", "fifty", "find", 18 | "fire", "first", "five", "for", "former", "formerly", "forty", "found", 19 | "four", "from", "further", "give", "had", "has", "hasnt", "have", "he", 20 | "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", 21 | "herself", "him", "himself", "his", "how", "however", "hundred", "i", "ie", 22 | "if", "i'm", "i'll", "i've", "in", "inc", "indeed", "interest", "is", "it", 23 | "it'll", "its", "it's", "itself", "just", "keep", "last", "latter", 24 | "latterly", "least", "less", "like", "ltd", "made", "many", "may", "me", 25 | "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", 26 | "much", "must", "my", "myself", "name", "namely", "neither", "never", 27 | "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", 28 | "not", "nothing", "now", "nowhere", "of", "off", "often", "ok", "okay", 29 | "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", 30 | "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", 31 | "please", "put", "rather", "re", "really", "same", "see", "seem", "seemed", 32 | "seeming", "seems", "serious", "several", "she", "should", "show", "side", 33 | "since", "sincere", "six", "sixty", "so", 
"some", "somehow", "someone", 34 | "something", "sometime", "sometimes", "somewhere", "still", "such", "take", 35 | "ten", "than", "thank", "thanks", "that", "that's", "the", "their", "them", 36 | "themselves", "then", "thence", "there", "thereafter", "thereby", 37 | "therefore", "therein", "thereupon", "these", "they", "third", "this", 38 | "those", "though", "three", "through", "throughout", "thru", "thus", "to", 39 | "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", 40 | "un", "until", "up", "upon", "us", "very", "via", "view", "viewing", 41 | "viewer", "was", "we", "we'll", "well", "welcome", "were", "what", 42 | "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", 43 | "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", 44 | "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", 45 | "with", "within", "without", "would", "wont", "won't", "yet", "you", "your", 46 | "yours", "you've", "you'll", "yourself", "yourselves", "youtube", "going", 47 | "want", "right", "you're", "we're", "know", "gonna", "need", "bit", "look", 48 | "yeah", "guys", "sure", "let's", "video", "oh", "let", "today", "they're", 49 | "did", "looks", "different", "great", "different", "say", "um", "probably", 50 | "kind", "doesn't", "does", "maybe", "hey", "we've", "better", "hope", 51 | "there's", "try" 52 | ]) -------------------------------------------------------------------------------- /hd-vila/src/utils/stop_words.py: -------------------------------------------------------------------------------- 1 | """List of stop words.""" 2 | # This list of English stop words is taken from the "Glasgow Information 3 | # Retrieval Group". The original list can be found at 4 | # http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words 5 | ENGLISH_STOP_WORDS = frozenset([ 6 | "a", "about", "above", "across", "actually", "after", "afterwards", "again", 7 | "against", "all", "almost", "alone", "along", "already", "also", "although", 8 | "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", 9 | "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", 10 | "are", "around", "as", "at", "back", "be", "became", "because", "become", 11 | "becomes", "becoming", "been", "before", "beforehand", "behind", "being", 12 | "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", 13 | "but", "by", "call", "can", "cannot", "cant", "can't", "co", "con", "could", 14 | "couldnt", "cry", "de", "describe", "detail", "do", "done", "don't", "down", 15 | "due", "during", "each", "easy", "eg", "eight", "either", "eleven", "else", 16 | "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", 17 | "everything", "everywhere", "except", "few", "fifteen", "fifty", "find", 18 | "fire", "first", "five", "for", "former", "formerly", "forty", "found", 19 | "four", "from", "further", "give", "had", "has", "hasnt", "have", "he", 20 | "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", 21 | "herself", "him", "himself", "his", "how", "however", "hundred", "i", "ie", 22 | "if", "i'm", "i'll", "i've", "in", "inc", "indeed", "interest", "is", "it", 23 | "it'll", "its", "it's", "itself", "just", "keep", "last", "latter", 24 | "latterly", "least", "less", "like", "ltd", "made", "many", "may", "me", 25 | "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", 26 | "much", "must", "my", "myself", "name", "namely", "neither", "never", 27 | "nevertheless", "next", "nine", 
"no", "nobody", "none", "noone", "nor", 28 | "not", "nothing", "now", "nowhere", "of", "off", "often", "ok", "okay", 29 | "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", 30 | "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", 31 | "please", "put", "rather", "re", "really", "same", "see", "seem", "seemed", 32 | "seeming", "seems", "serious", "several", "she", "should", "show", "side", 33 | "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", 34 | "something", "sometime", "sometimes", "somewhere", "still", "such", "take", 35 | "ten", "than", "thank", "thanks", "that", "that's", "the", "their", "them", 36 | "themselves", "then", "thence", "there", "thereafter", "thereby", 37 | "therefore", "therein", "thereupon", "these", "they", "third", "this", 38 | "those", "though", "three", "through", "throughout", "thru", "thus", "to", 39 | "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", 40 | "un", "until", "up", "upon", "us", "very", "via", "view", "viewing", 41 | "viewer", "was", "we", "we'll", "well", "welcome", "were", "what", 42 | "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", 43 | "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", 44 | "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", 45 | "with", "within", "without", "would", "wont", "won't", "yet", "you", "your", 46 | "yours", "you've", "you'll", "yourself", "yourselves", "youtube", "going", 47 | "want", "right", "you're", "we're", "know", "gonna", "need", "bit", "look", 48 | "yeah", "guys", "sure", "let's", "video", "oh", "let", "today", "they're", 49 | "did", "looks", "different", "great", "different", "say", "um", "probably", 50 | "kind", "doesn't", "does", "maybe", "hey", "we've", "better", "hope", 51 | "there's", "try" 52 | ]) -------------------------------------------------------------------------------- /hd-vila-100m/LICENSE: -------------------------------------------------------------------------------- 1 | Research Use of Data Agreement v1.0 2 | 3 | 4 | 5 | This is the Research Use of Data Agreement, Version 1.0 (the “R-UDA”). Capitalized terms are defined in Section 5. Data Provider and you agree as follows: 6 | 7 | 8 | 9 | 10 | 11 | 1. Provision of the Data 12 | 13 | 1.1. You may use, modify, and distribute the Data made available to you by the Data Provider under this R-UDA for Research Use if you follow the R-UDA’s terms. 14 | 15 | 1.2. Data Provider will not sue you or any Downstream Recipient for any claim arising out of the use, modification, or distribution of the Data provided you meet the terms of the R-UDA. 16 | 17 | 1.3. This R-UDA does not restrict your use, modification, or distribution of any portions of the Data that are in the public domain or that may be used, modified, or distributed under any other legal exception or limitation. 18 | 19 | 20 | 21 | 22 | 23 | 2. Restrictions 24 | 25 | 2.1. You agree that you will use the Data solely for Computational Use for non-commercial research. This restriction means that you may engage in non-commercial research activities (including non-commercial research undertaken by or funded via a commercial entity), but you may not use the Data or any Results in any commercial offering, including as part of a product or service (or to improve any product or service) you use or provide to others. 26 | 27 | 2.2. You may not receive money or other consideration in exchange for use or redistribution of Data. 
28 | 29 | 30 | 31 | 32 | 33 | 3. Redistribution of Data 34 | 35 | 3.1. You may redistribute the Data, so long as: 36 | 37 | 3.1.1. You include with any Data you redistribute all credit or attribution information that you received with the Data, and your terms require any Downstream Recipient to do the same; and 38 | 39 | 3.1.2. You bind each recipient to whom you redistribute the Data to the terms of the R-UDA. 40 | 41 | 42 | 43 | 44 | 45 | 4. No Warranty, Limitation of Liability 46 | 47 | 4.1. Data Provider does not represent or warrant that it has any rights whatsoever in the Data. 48 | 49 | 4.2. THE DATA IS PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 50 | 51 | 4.3. NEITHER DATA PROVIDER NOR ANY UPSTREAM DATA PROVIDER SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE DATA OR RESULTS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 52 | 53 | 54 | 55 | 56 | 57 | 5. Definitions 58 | 59 | 5.1. “Computational Use” means activities necessary to enable the use of Data (alone or along with other material) for analysis by a computer. 60 | 61 | 5.2. “Data” means the material you receive under the R-UDA in modified or unmodified form, but not including Results. 62 | 63 | 5.3. “Data Provider” means the source from which you receive the Data and with whom you enter into the R-UDA. 64 | 65 | 5.4. “Downstream Recipient” means any person or persons who receives the Data directly or indirectly from you in accordance with the R-UDA. 66 | 67 | 5.5. “Result” means anything that you develop or improve from your use of Data that does not include more than a de minimis portion of the Data on which the use is based. Results may include de minimis portions of the Data necessary to report on or explain use that has been conducted with the Data, such as figures in scientific papers, but do not include more. Artificial intelligence models trained on Data (and which do not include more than a de minimis portion of Data) are Results. 68 | 69 | 5.6. “Upstream Data Providers” means the source or sources from which the Data Provider directly or indirectly received, under the terms of the R-UDA, material that is included in the Data. 
70 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/pretrain_stage1.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | bert_config: "src/configs/bert_large_config.json" 15 | stage: 1 16 | type_vocab_size: 8 17 | num_local_layers: 8 18 | stage1_layers: 12 19 | bert_frozen_stage: -1 20 | 21 | log_tb: true 22 | 23 | 24 | WEIGHTS: 25 | model_weight: '' 26 | stage1_model_weight: '' 27 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 28 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 29 | pretrained_2d: True 30 | 31 | DATA: 32 | use_lmdb_train_data: True 33 | len_lmdb_train_data: 8523237 34 | BATCH_SIZE_per_gpu: 16 35 | NUM_WORKERS: 12 36 | PIN_MEMORY: True 37 | 38 | sample_frame: 32 39 | sample_clip: 4 40 | input_res: [192, 320] 41 | center_crop: 200 42 | 43 | DATASET_train: { 44 | 'name': 'PreTrainDataset-train', 45 | 'type': 'PreTrainDataset', 46 | 'metadata_dir': 'datasets/lfvila_data/pretrain/train_db', 47 | 'video_path': 'datasets/hdvila100m/video_clip_3fps' 48 | } 49 | 50 | DATASET_val: [{ 51 | 'name': 'RetrievalDataset-val', 52 | 'type': 'RetrievalDataset', 53 | 'metadata_dir': 'datasets/lfvila_data/task/actnet/val_s.jsonl', 54 | 'video_path': 'datasets/activitynet/actnet_video' 55 | }, 56 | { 57 | 'name': 'PreTrainDataset-val', 58 | 'type': 'PreTrainDataset', 59 | 'metadata_dir': 'datasets/lfvila_data/pretrain/val.jsonl', 60 | 'video_path': 'datasets/hdvila100m/video_clip_3fps' 61 | } 62 | ] 63 | 64 | 65 | TRAINING: 66 | BREAK_STEP: 10000000000 67 | EPOCHS: 10 68 | WARMUP_EPOCHS: 1 69 | WARMUP_LR: 0. 
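  # The warmup above presumably ramps the learning rate up from WARMUP_LR over WARMUP_EPOCHS;
  # the peak rate itself appears to come from deepspeed_config.optimizer.params.lr (5.0e-5) further below.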
70 | LR_SCHEDULER: { 71 | 'NAME': 'linear', 72 | 'DECAY_EPOCHS': 10, 73 | } 74 | 75 | use_mlm: false 76 | 77 | ct_global_loss_weight: 1 78 | 79 | 80 | use_time_match: true 81 | ct_time_loss_weight: 0.25 82 | num_key: 2 83 | num_value: 2 84 | num_other_neg: 3 85 | time_temp: 0.05 86 | use_mask_equal: false 87 | 88 | 89 | temp: 0.05 90 | weight_decay: 0.05 91 | 92 | save_dir: "project/lfvila/lfvila_save/pretrain_stage1" 93 | checkpoint_step: 4000 94 | save_step: 2000 95 | print_step: 100 96 | eval_step: 500 97 | 98 | deepspeed_config: { 99 | "train_micro_batch_size_per_gpu": 16, 100 | "gradient_accumulation_steps": 1, 101 | "steps_per_print": 500, 102 | 103 | 104 | "zero_optimization": { 105 | "stage": 2, 106 | "allgather_partitions": true, 107 | "allgather_bucket_size": 5.0e+8, 108 | "overlap_comm": false, 109 | "reduce_scatter": true, 110 | "reduce_bucket_size": 5.0e+8, 111 | "contiguous_gradients" : false, 112 | "stage3_gather_fp16_weights_on_model_save": true 113 | }, 114 | 115 | "fp16": { 116 | "enabled": true, 117 | "loss_scale": 0, 118 | "loss_scale_window": 1000, 119 | "initial_scale_power": 32, 120 | "hysteresis": 2, 121 | "min_loss_scale": 1 122 | }, 123 | 124 | "optimizer": { 125 | "type": "AdamW", 126 | "params": { 127 | "lr": 5.0e-5, 128 | "betas": [0.9, 0.98], 129 | "eps": 1.0e-8, 130 | "weight_decay": 5.0e-2 131 | } 132 | }, 133 | 134 | 135 | "sparse_attention": { 136 | "mode": "fixed", 137 | "block": 32, 138 | "different_layout_per_head": true, 139 | "num_local_blocks": 16, 140 | "num_global_blocks": 1, 141 | "attention": "bidirectional", 142 | "horizontal_global_attention": true, 143 | "num_different_global_patterns": 4 144 | } 145 | } 146 | 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # XPretrain 2 | 3 | This repo includes some recent research works in **multi-modality learning**, especially with **pre-training** method from [MSM group](https://www.microsoft.com/en-us/research/group/multimedia-search-and-mining/) of Microsoft Research. 
4 | 5 | ## Multi-modality Learning 6 | 7 | ### ***** Video & Language ***** 8 | 9 | #### Dataset 10 | 11 | > [**HD-VILA-100M dataset**](https://github.com/microsoft/XPretrain/tree/main/hd-vila-100m): high-resolution and diversified video-language dataset 12 | 13 | #### Pre-training model 14 | 15 | > [**HD-VILA (CVPR 2022)**](https://github.com/microsoft/XPretrain/tree/main/hd-vila): high-resolution and diversified video-language pre-training model 16 | 17 | > [**LF-VILA (NeurIPS 2022)**](https://github.com/microsoft/XPretrain/tree/main/LF-VILA): long-form video-language pre-training model 18 | 19 | > [**CLIP-ViP (ICLR 2023)**](https://github.com/microsoft/XPretrain/tree/main/CLIP-ViP): adapting an image-language pre-training model to video-language pre-training 20 | 21 | ### ***** Image & Language ***** 22 | 23 | #### Pre-training model 24 | 25 | > [**Pixel-BERT**](https://arxiv.org/pdf/2004.00849.pdf): end-to-end image and language pre-training model 26 | 27 | > [**SOHO (CVPR 2021 oral)**](https://github.com/researchmm/soho): improved end-to-end image and language pre-training model with quantized visual tokens 28 | 29 | > [**VisualParsing (NeurIPS 2021)**](https://github.com/microsoft/XPretrain/tree/main/visualparsing): Transformer-based end-to-end image and language pre-training model 30 | 31 | ## News 32 | - :smiley:**March, 2023: the code of [**CLIP-ViP**](https://github.com/microsoft/XPretrain/tree/main/CLIP-ViP) and [**LF-VILA**](https://github.com/microsoft/XPretrain/tree/main/LF-VILA) was released.** 33 | - January, 2023: our paper [**CLIP-ViP**](https://github.com/microsoft/XPretrain/tree/main/CLIP-ViP) on adapting an image-language pre-trained model to video-language pre-training was accepted by ICLR 2023. 34 | - September, 2022: our paper [**LF-VILA**](https://github.com/microsoft/XPretrain/tree/main/LF-VILA) on long-form video-language pre-training was accepted by NeurIPS 2022. 35 | - September, 2022: the code of [**HD-VILA**](https://github.com/microsoft/XPretrain/tree/main/hd-vila) was released. 36 | - March, 2022: [**HD-VILA-100M dataset**](https://github.com/microsoft/XPretrain/tree/main/hd-vila-100m) was released publicly. 37 | - March, 2022: [**HD-VILA**](https://github.com/microsoft/XPretrain/tree/main/hd-vila) was accepted by CVPR 2022. 38 | 39 | 40 | ## Contributing 41 | 42 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 43 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 44 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 45 | 46 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 47 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 48 | provided by the bot. You will only need to do this once across all repos using our CLA. 49 | 50 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 51 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 52 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 53 | 54 | ## Trademarks 55 | 56 | This project may contain trademarks or logos for projects, products, or services.
Authorized use of Microsoft 57 | trademarks or logos is subject to and must follow 58 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 59 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 60 | Any use of third-party trademarks or logos are subject to those third-party's policies. 61 | 62 | ## Contact Information 63 | 64 | For help or issues using the pre-trained models, please submit an issue. 65 | For other communications, please contact [Bei Liu]() (`bei.liu@microsoft.com`) and [Jianlong Fu]() (`jianf@microsoft.com`). 66 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/pretrain_stage2.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8, 11 | "frozen_stages": 6 12 | } 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 2 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: 12 21 | final_num_patches: 6 22 | 23 | log_tb: true 24 | 25 | 26 | WEIGHTS: 27 | model_weight: '' 28 | stage1_model_weight: 'project/lfvila/pretrained/lfvila_stage1.bin' 29 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 30 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 31 | pretrained_2d: True 32 | 33 | DATA: 34 | use_lmdb_train_data: true 35 | len_lmdb_train_data: 8523237 36 | BATCH_SIZE_per_gpu: 48 37 | NUM_WORKERS: 12 38 | PIN_MEMORY: True 39 | 40 | sample_frame: 32 41 | sample_clip: 4 42 | input_res: [192, 320] 43 | center_crop: 200 44 | 45 | 46 | DATASET_train: { 47 | 'name': 'PreTrainDataset-train', 48 | 'type': 'PreTrainDataset', 49 | 'metadata_dir': 'datasets/lfvila_data/pretrain/train_db', 50 | 'video_path': 'datasets/hdvila100m/video_clip_3fps' 51 | } 52 | 53 | DATASET_val: [{ 54 | 'name': 'RetrievalDataset-val', 55 | 'type': 'RetrievalDataset', 56 | 'metadata_dir': 'datasets/lfvila_data/task/actnet/val_s.jsonl', 57 | 'video_path': 'datasets/activitynet/actnet_video' 58 | }, 59 | { 60 | 'name': 'PreTrainDataset-val', 61 | 'type': 'PreTrainDataset', 62 | 'metadata_dir': 'datasets/lfvila_data/pretrain/val.jsonl', 63 | 'video_path': 'datasets/hdvila100m/video_clip_3fps' 64 | } 65 | ] 66 | 67 | 68 | TRAINING: 69 | BREAK_STEP: 10000000000 70 | EPOCHS: 10 71 | WARMUP_EPOCHS: 1 72 | WARMUP_LR: 0. 
73 | LR_SCHEDULER: { 74 | 'NAME': 'linear', 75 | 'DECAY_EPOCHS': 10, 76 | } 77 | 78 | use_mlm: true 79 | mlm_loss_weight: 1 80 | vtm_loss_weight: 10 81 | 82 | ct_global_loss_weight: 1 83 | 84 | 85 | use_time_match: true 86 | ct_time_loss_weight: 0.25 87 | num_key: 2 88 | num_value: 2 89 | num_other_neg: 3 90 | time_temp: 0.05 91 | use_mask_equal: false 92 | 93 | temp: 0.05 94 | weight_decay: 0.05 95 | 96 | save_dir: "project/lfvila/lfvila_save/pretrain_stage2" 97 | checkpoint_step: 2000 98 | save_step: 2000 99 | print_step: 50 100 | eval_step: 250 101 | 102 | deepspeed_config: { 103 | "train_micro_batch_size_per_gpu": 48, 104 | "gradient_accumulation_steps": 1, 105 | "steps_per_print": 500, 106 | 107 | 108 | "zero_optimization": { 109 | "stage": 2, 110 | "allgather_partitions": true, 111 | "allgather_bucket_size": 5.0e+8, 112 | "overlap_comm": false, 113 | "reduce_scatter": true, 114 | "reduce_bucket_size": 5.0e+8, 115 | "contiguous_gradients" : false, 116 | "stage3_gather_fp16_weights_on_model_save": true 117 | }, 118 | 119 | "fp16": { 120 | "enabled": true, 121 | "loss_scale": 0, 122 | "loss_scale_window": 1000, 123 | "initial_scale_power": 32, 124 | "hysteresis": 2, 125 | "min_loss_scale": 1 126 | }, 127 | 128 | "optimizer": { 129 | "type": "AdamW", 130 | "params": { 131 | "lr": 5.0e-5, 132 | "betas": [0.9, 0.98], 133 | "eps": 1.0e-8, 134 | "weight_decay": 5.0e-2 135 | } 136 | }, 137 | 138 | 139 | "sparse_attention": { 140 | "mode": "fixed", 141 | "block": 32, 142 | "different_layout_per_head": true, 143 | "num_local_blocks": 16, 144 | "num_global_blocks": 1, 145 | "attention": "bidirectional", 146 | "horizontal_global_attention": true, 147 | "num_different_global_patterns": 4 148 | } 149 | } 150 | 151 | 152 | 153 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /hd-vila-100m/src/cut_videos.py: -------------------------------------------------------------------------------- 1 | import jsonlines 2 | import os 3 | from tqdm import tqdm 4 | import logging 5 | import argparse 6 | import re 7 | import subprocess 8 | import multiprocessing 9 | from joblib import Parallel, delayed 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser(description='youtube video processing') 14 | parser.add_argument('--workdir', default='./hdvila_100m',type=str, help='Working Directory') 15 | parser.add_argument('--metafile', default='meta_part0.jsonl', type=str, help='youtube video meta') 16 | parser.add_argument('--resultfile', default='cut_part0.jsonl', type=str, help='processed videos') 17 | parser.add_argument('--log', default='log_part0.log', type=str, help='log') 18 | args = parser.parse_args() 19 | return args 20 | 21 | 22 | def check_dirs(dirs): 23 | if not os.path.exists(dirs): 24 | os.makedirs(dirs, exist_ok=True) 25 | 26 | 27 | class Cutvideos(): 28 | def __init__(self, metafile, workdir, resultfile): 29 | self.workdir = workdir 30 | self.metafile = metafile 31 | self.resultfile = resultfile 32 | self.metas = self.loadmetas() 33 | 34 | def loadmetas(self): 35 | metas = [] 36 | with open(self.metafile, 'r') as f: 37 | for l in jsonlines.Reader(f): 38 | metas.append(l) 39 | return metas 40 | 41 | def hhmmss(self, timestamp1, timestamp2): 42 | hh,mm,s = timestamp1.split(':') 43 | ss,ms = s.split('.') 44 | timems1 = 3600*1000*int((hh)) + 60*1000*int(mm) + 1000*int(ss) + int(ms) 45 | hh,mm,s = timestamp2.split(':') 46 | ss,ms = s.split('.') 47 | timems2 = 3600*1000*int((hh)) + 60*1000*int(mm) + 1000*int(ss) + int(ms) 48 | dur = (timems2 
- timems1)/1000 49 | return str(dur) 50 | 51 | def run(self, cmd): 52 | proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 53 | out, _ = proc.communicate() 54 | return out.decode('utf-8') 55 | 56 | def extract_single_clip(self,sb, in_filepath, out_filepath): 57 | cmd = ['ffmpeg', '-ss', sb[0], '-t', self.hhmmss(sb[0], sb[1]),'-accurate_seek', '-i', in_filepath, '-c', 'copy', 58 | '-avoid_negative_ts', '1', '-reset_timestamps', '1', 59 | '-y', '-hide_banner', '-loglevel', 'panic', '-map', '0',out_filepath] 60 | self.run(cmd) 61 | if not os.path.isfile(out_filepath): 62 | raise Exception(f"{out_filepath}: ffmpeg clip extraction failed") 63 | 64 | def extract_clips(self, meta): 65 | clips = meta['clip'] 66 | vid = meta['video_id'] 67 | outfolder = os.path.join(self.workdir,'video_clips', vid) 68 | check_dirs(outfolder) 69 | result = [] 70 | # try: 71 | for c in clips: 72 | self.extract_single_clip(c['span'], os.path.join(self.workdir,'download_videos', vid + '.mp4'), os.path.join(outfolder, c['clip_id'])) 73 | result.append(c['clip_id']) 74 | # except: 75 | # pass 76 | 77 | return result 78 | 79 | def extract_all_clip(self): 80 | results = [] 81 | for v in tqdm(self.metas): 82 | result = self.extract_clips(v) 83 | results.extend(result) 84 | 85 | logger.info(f"Number of clips processed: {len(results)}") 86 | with jsonlines.open(os.path.join(self.workdir, 'cut_video_results', self.resultfile), 'w') as f: 87 | for l in results: 88 | f.write(l) 89 | 90 | 91 | if __name__ == '__main__': 92 | args = parse_args() 93 | 94 | metafile = os.path.join(args.workdir, 'metafiles', args.metafile) 95 | logdir = os.path.join(args.workdir,'cut_video_log') 96 | 97 | check_dirs(os.path.join(args.workdir, 'video_clips')) 98 | check_dirs(os.path.join(args.workdir, 'cut_video_results')) 99 | check_dirs(logdir) 100 | 101 | logging.basicConfig(level=logging.INFO, 102 | filename=os.path.join(logdir, args.log), 103 | datefmt='%Y/%m/%d %H:%M:%S', 104 | format='%(asctime)s - %(name)s - %(levelname)s - %(lineno)d - %(module)s - %(message)s') 105 | 106 | logger = logging.getLogger(__name__) 107 | logger.info(args) 108 | 109 | cvd = Cutvideos(metafile, args.workdir, args.resultfile) 110 | cvd.extract_all_clip() -------------------------------------------------------------------------------- /LF-VILA/src/tasks/run_video_classification.py: -------------------------------------------------------------------------------- 1 | from poplib import LF 2 | import torch 3 | import torch.distributed as dist 4 | import deepspeed 5 | import argparse 6 | import os 7 | from mmcv import Config 8 | from src.models import LFVILA_Video_Classification 9 | 10 | from src.tools import Trainer_Video_Classification 11 | from src.datasets.dataloader import build_dataloader 12 | from src.optimization.lr_scheduler import build_scheduler 13 | from src.optimization.optimizer import build_optimizer_parameters 14 | 15 | from src.utils.logger import LOGGER, add_log_to_file 16 | from src.utils.dist import master_process 17 | from src.utils.misc import mkdirp, set_random_seed 18 | from src.utils.load import load_model_weights_with_mismatch 19 | 20 | def main(): 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--config', default='./src/configs/pretrain_test_stage2.yaml') 24 | parser.add_argument('--blob_mount_dir', default="/blob_mount") 25 | parser.add_argument('--deepspeed_sparse_attention',action='store_true') 26 | parser.add_argument('--local_rank', type=int, default=-1, help='local rank passed from 
distributed launcher') 27 | parser.add_argument('--fp16', action='store_true', help='enable fp16') 28 | parser.add_argument('--seed', type=int, default=42, help='random seed') 29 | parser.add_argument('--distributed',action='store_true') 30 | parser.add_argument('--resume', action='store_true') 31 | # Include DeepSpeed configuration arguments 32 | parser = deepspeed.add_config_arguments(parser) 33 | 34 | args = parser.parse_args() 35 | 36 | set_random_seed(args.seed) 37 | 38 | config = Config.fromfile(args.config) 39 | 40 | 41 | LOGGER.info(config) 42 | LOGGER.info(args) 43 | 44 | if not master_process(args): 45 | LOGGER.disabled = True 46 | if master_process(args): 47 | mkdirp(os.path.join(args.blob_mount_dir, config.TRAINING.save_dir,"log")) 48 | add_log_to_file(os.path.join(args.blob_mount_dir, config.TRAINING.save_dir,"log/log.txt")) 49 | 50 | model = LFVILA_Video_Classification(args, config) 51 | 52 | if config.WEIGHTS.model_weight != '': 53 | LOGGER.info(f"Loading model weights from {config.WEIGHTS.model_weight}") 54 | load_model_weights_with_mismatch(model, os.path.join(args.blob_mount_dir, config.WEIGHTS.model_weight)) 55 | 56 | else: 57 | if config.WEIGHTS.swin_weight != '': 58 | LOGGER.info(f"Loading video encoder weights from {config.WEIGHTS.swin_weight}") 59 | 60 | load_model_weights_with_mismatch(model.video_encoder, 61 | os.path.join(args.blob_mount_dir, config.WEIGHTS.swin_weight), 62 | load_swin=True, 63 | pretrained2d=config.WEIGHTS.pretrained_2d) 64 | 65 | parameter_group = build_optimizer_parameters(config, model) 66 | 67 | 68 | # init deepspeed 69 | 70 | if args.distributed: 71 | 72 | model_engine, optimizer, _, _ = deepspeed.initialize(args = args, 73 | model=model, 74 | model_parameters=parameter_group, 75 | config=config.deepspeed_config 76 | ) 77 | print(dist.get_rank()) 78 | 79 | 80 | LOGGER.info(f'Training with {dist.get_world_size()} gpus') 81 | 82 | 83 | dataset_trains, dataset_vals, dataloader_trains, dataloader_vals = build_dataloader(args, config) 84 | 85 | dataloader_train = dataloader_trains['VideoClassificationDataset-train'] 86 | steps_per_epoch = len(dataloader_train) 87 | scheduler = build_scheduler(config, optimizer, steps_per_epoch) 88 | 89 | args.fp16 = model_engine.fp16_enabled() 90 | if args.fp16: 91 | LOGGER.info('Enable fp16 Training') 92 | 93 | 94 | trainer = Trainer_Video_Classification(args, config, model_engine, optimizer, scheduler, dataloader_train, dataloader_vals['VideoClassificationDataset-val']) 95 | 96 | LOGGER.info('start first evaluate') 97 | 98 | trainer.evaluate(dataloader_vals['VideoClassificationDataset-val']) 99 | 100 | if not config.TRAINING.only_val: 101 | trainer.train(args.resume) 102 | 103 | if __name__ == '__main__': 104 | deepspeed.init_distributed() 105 | main() 106 | 107 | 108 | -------------------------------------------------------------------------------- /LF-VILA/src/datasets/actnet_qa_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import jsonlines 4 | import decord 5 | import lmdb 6 | from decord import VideoReader, cpu 7 | import numpy as np 8 | import torch 9 | from torch.utils.data import Dataset 10 | from torch.utils.data.dataloader import default_collate 11 | from src.utils.logger import LOGGER 12 | 13 | decord.bridge.set_bridge("torch") 14 | 15 | class ActnetQADataset(Dataset): 16 | def __init__(self, 17 | cfg, 18 | metadata_dir, 19 | video_path, 20 | sample_frame, 21 | sample_clip, 22 | tokenizer, 23 | transform=None, 24 | 
is_train=True, 25 | return_rawtext=False, 26 | return_index=False, 27 | **kwargs 28 | ): 29 | self.cfg = cfg 30 | self.metadata_dir = metadata_dir 31 | self.transform = transform 32 | self.video_path = video_path 33 | self.return_rawtext = return_rawtext 34 | self.return_index = return_index 35 | self.reliable_idx_list = [] 36 | self.sample_frame = sample_frame 37 | self.sample_clip = sample_clip 38 | 39 | self._load_metadata() 40 | self.tokenizer = tokenizer 41 | self.is_train = is_train 42 | 43 | def _load_metadata(self): 44 | data = [] 45 | with open(self.metadata_dir) as f: 46 | for l in jsonlines.Reader(f): 47 | data.append(l) 48 | self.metadata = data 49 | 50 | def _read_video(self, video_id, sample_frame_num): 51 | ''' 52 | read frames from long video 53 | args: 54 | video_id: str, 55 | sample_frame_num: frames used 56 | return: 57 | img_arrays: [num_frm, 3, H, W] 58 | chunk_mask: [num_frm, n_clip], , mask for indicating frames belong to each clip 59 | 60 | ''' 61 | video_path = os.path.join(self.video_path, video_id + '.mp4') 62 | vr = VideoReader(video_path, ctx=cpu(0)) 63 | num_frame = len(vr) 64 | 65 | if self.is_train: 66 | interval = int(num_frame / (sample_frame_num - 1)) 67 | start = np.random.randint(0, interval+1) 68 | end = np.random.randint(num_frame-1-interval, num_frame) 69 | frame_idx = np.linspace(start, end, num=sample_frame_num).astype(int) 70 | else: 71 | frame_idx = np.linspace(0, num_frame-1, num=sample_frame_num).astype(int) 72 | 73 | img_arrays = vr.get_batch(frame_idx) 74 | 75 | img_arrays = img_arrays.float() / 255 76 | 77 | img_arrays = img_arrays.permute(0, 3, 1, 2) # N,C,H,W 78 | 79 | return img_arrays 80 | 81 | def tokenize(self, text_q, max_length = 50): 82 | text_q = [text_q] 83 | 84 | encoded_qa = [self.tokenizer(x, padding='max_length', truncation=True, max_length=max_length) for x in text_q] 85 | 86 | text_ids = torch.tensor([x.input_ids for x in encoded_qa]) 87 | attention_mask = torch.tensor([x.attention_mask for x in encoded_qa]) 88 | return text_ids, attention_mask 89 | 90 | def __len__(self): 91 | return len(self.metadata) 92 | 93 | 94 | def __getitem__(self, index): 95 | num_retries = 10 96 | for j in range(num_retries): 97 | try: 98 | item = self.metadata[index] 99 | 100 | clip_id = item['video_name'] 101 | 102 | video = self._read_video(clip_id, self.sample_frame) 103 | 104 | rawtext_q = item['question'] 105 | label_a = item['answer'] 106 | 107 | text_ids, attention_mask = self.tokenize(rawtext_q) 108 | 109 | 110 | if self.transform is not None: 111 | video = self.transform(video) 112 | video = video.permute(1, 0, 2, 3) 113 | 114 | data = { 115 | 'video_frames': video, 116 | 'text_ids': text_ids, 117 | 'attention_mask': attention_mask, 118 | 'label': label_a 119 | } 120 | except: 121 | index = random.randint(0, len(self) - 1) 122 | continue 123 | else: 124 | break 125 | 126 | if self.return_rawtext: 127 | data['rawtext'] = rawtext_q 128 | 129 | if self.return_index: 130 | data['index'] = torch.tensor(index) 131 | 132 | return data 133 | 134 | 135 | -------------------------------------------------------------------------------- /LF-VILA/src/tasks/run_retrieval.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | import deepspeed 4 | import argparse 5 | import os 6 | from mmcv import Config 7 | from src.models import LFVILA_Retrieval 8 | 9 | from src.tools import Trainer_Retrieval 10 | from src.datasets.dataloader import build_dataloader 11 | from 
src.optimization.lr_scheduler import build_scheduler 12 | from src.optimization.optimizer import build_optimizer_parameters 13 | from src.utils.logger import LOGGER, add_log_to_file 14 | from src.utils.dist import master_process 15 | from src.utils.misc import mkdirp, set_random_seed 16 | from src.utils.load import load_model_weights_with_mismatch 17 | 18 | 19 | def main(): 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--config', default='./src/configs/pretrain_test_stage2.yaml') 23 | parser.add_argument('--blob_mount_dir', default="/blob_mount") 24 | parser.add_argument('--deepspeed_sparse_attention',action='store_true') 25 | parser.add_argument('--local_rank', type=int, default=-1, help='local rank passed from distributed launcher') 26 | parser.add_argument('--fp16', action='store_true', help='enable fp16') 27 | parser.add_argument('--seed', type=int, default=42, help='random seed') 28 | parser.add_argument('--distributed',action='store_true') 29 | parser.add_argument('--resume', action='store_true') 30 | parser.add_argument('--only_val', action='store_true') 31 | # Include DeepSpeed configuration arguments 32 | parser = deepspeed.add_config_arguments(parser) 33 | 34 | args = parser.parse_args() 35 | 36 | set_random_seed(args.seed) 37 | 38 | config = Config.fromfile(args.config) 39 | 40 | LOGGER.info(config) 41 | LOGGER.info(args) 42 | 43 | if not master_process(args): 44 | LOGGER.disabled = True 45 | if master_process(args): 46 | mkdirp(os.path.join(args.blob_mount_dir, config.TRAINING.save_dir,"log")) 47 | add_log_to_file(os.path.join(args.blob_mount_dir, config.TRAINING.save_dir,"log/log.txt")) 48 | 49 | model = LFVILA_Retrieval(args, config) 50 | 51 | if config.WEIGHTS.model_weight != '': 52 | LOGGER.info(f"Loading model weights from {config.WEIGHTS.model_weight}") 53 | load_model_weights_with_mismatch(model, os.path.join(args.blob_mount_dir, config.WEIGHTS.model_weight)) 54 | else: 55 | if config.WEIGHTS.swin_weight != '': 56 | LOGGER.info(f"Loading video encoder weights from {config.WEIGHTS.swin_weight}") 57 | 58 | load_model_weights_with_mismatch(model.video_encoder, 59 | os.path.join(args.blob_mount_dir, config.WEIGHTS.swin_weight), 60 | load_swin=True, 61 | pretrained2d=config.WEIGHTS.pretrained_2d) 62 | if config.WEIGHTS.bert_weight != '': 63 | LOGGER.info(f"Loading bert weights from {config.WEIGHTS.bert_weight}") 64 | load_model_weights_with_mismatch(model.text_encoder, os.path.join(args.blob_mount_dir, config.WEIGHTS.bert_weight),load_bert=True) 65 | model._init_sent_embedding() 66 | 67 | parameter_group = build_optimizer_parameters(config, model) 68 | 69 | # init deepspeed 70 | if args.distributed: 71 | 72 | model_engine, optimizer, _, _ = deepspeed.initialize(args = args, 73 | model=model, 74 | model_parameters=parameter_group, 75 | config=config.deepspeed_config 76 | ) 77 | print(dist.get_rank()) 78 | 79 | 80 | LOGGER.info(f'Training with {dist.get_world_size()} gpus') 81 | 82 | dataset_trains, dataset_vals, dataloader_trains, dataloader_vals = build_dataloader(args, config) 83 | 84 | dataloader_train = dataloader_trains['RetrievalDataset-train'] 85 | steps_per_epoch = len(dataloader_train) 86 | scheduler = build_scheduler(config, optimizer, steps_per_epoch) 87 | 88 | args.fp16 = model_engine.fp16_enabled() 89 | if args.fp16: 90 | LOGGER.info('Enable fp16 Training') 91 | 92 | trainer = Trainer_Retrieval(args, config, model_engine, optimizer, scheduler, dataloader_train, dataloader_vals['RetrievalDataset-val']) 93 | 94 | LOGGER.info('start first 
evaluate') 95 | 96 | trainer.evaluate(dataloader_vals['RetrievalDataset-val']) 97 | 98 | if not args.only_val: 99 | trainer.train(args.resume) 100 | 101 | if __name__ == '__main__': 102 | deepspeed.init_distributed() 103 | main() 104 | 105 | -------------------------------------------------------------------------------- /CLIP-ViP/src/optimization/adamw.py: -------------------------------------------------------------------------------- 1 | """ 2 | AdamW optimizer (weight decay fix) 3 | copied from hugginface 4 | """ 5 | import math 6 | 7 | import torch 8 | from torch.optim import Optimizer 9 | 10 | 11 | class AdamW(Optimizer): 12 | """ Implements Adam algorithm with weight decay fix. 13 | Parameters: 14 | lr (float): learning rate. Default 1e-3. 15 | betas (tuple of 2 floats): Adams beta parameters (b1, b2). 16 | Default: (0.9, 0.999) 17 | eps (float): Adams epsilon. Default: 1e-6 18 | weight_decay (float): Weight decay. Default: 0.0 19 | correct_bias (bool): can be set to False to avoid correcting bias 20 | in Adam (e.g. like in Bert TF repository). Default True. 21 | """ 22 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, 23 | weight_decay=0.0, correct_bias=True): 24 | if lr < 0.0: 25 | raise ValueError( 26 | "Invalid learning rate: {} - should be >= 0.0".format(lr)) 27 | if not 0.0 <= betas[0] < 1.0: 28 | raise ValueError("Invalid beta parameter: {} - " 29 | "should be in [0.0, 1.0[".format(betas[0])) 30 | if not 0.0 <= betas[1] < 1.0: 31 | raise ValueError("Invalid beta parameter: {} - " 32 | "should be in [0.0, 1.0[".format(betas[1])) 33 | if not 0.0 <= eps: 34 | raise ValueError("Invalid epsilon value: {} - " 35 | "should be >= 0.0".format(eps)) 36 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, 37 | correct_bias=correct_bias) 38 | super(AdamW, self).__init__(params, defaults) 39 | 40 | def step(self, closure=None): 41 | """Performs a single optimization step. 42 | Arguments: 43 | closure (callable, optional): A closure that reevaluates the model 44 | and returns the loss. 
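        Returns:
            The loss returned by `closure`, or None if no closure is given.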
45 | """ 46 | loss = None 47 | if closure is not None: 48 | loss = closure() 49 | 50 | for group in self.param_groups: 51 | for p in group['params']: 52 | if p.grad is None: 53 | continue 54 | grad = p.grad.data 55 | if grad.is_sparse: 56 | raise RuntimeError( 57 | 'Adam does not support sparse ' 58 | 'gradients, please consider SparseAdam instead') 59 | 60 | state = self.state[p] 61 | 62 | # State initialization 63 | if len(state) == 0: 64 | state['step'] = 0 65 | # Exponential moving average of gradient values 66 | state['exp_avg'] = torch.zeros_like(p.data) 67 | # Exponential moving average of squared gradient values 68 | state['exp_avg_sq'] = torch.zeros_like(p.data) 69 | 70 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 71 | beta1, beta2 = group['betas'] 72 | 73 | state['step'] += 1 74 | 75 | # Decay the first and second moment running average coefficient 76 | # In-place operations to update the averages at the same time 77 | exp_avg.mul_(beta1).add_(1.0 - beta1, grad) 78 | exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad) 79 | denom = exp_avg_sq.sqrt().add_(group['eps']) 80 | 81 | step_size = group['lr'] 82 | if group['correct_bias']: # No bias correction for Bert 83 | bias_correction1 = 1.0 - beta1 ** state['step'] 84 | bias_correction2 = 1.0 - beta2 ** state['step'] 85 | step_size = (step_size * math.sqrt(bias_correction2) 86 | / bias_correction1) 87 | 88 | p.data.addcdiv_(-step_size, exp_avg, denom) 89 | 90 | # Just adding the square of the weights to the loss function is 91 | # *not* the correct way of using L2 regularization/weight decay 92 | # with Adam, since that will interact with the m and v 93 | # parameters in strange ways. 94 | # 95 | # Instead we want to decay the weights in a manner that doesn't 96 | # interact with the m/v parameters. This is equivalent to 97 | # adding the square of the weights to the loss with plain 98 | # (non-momentum) SGD. 
99 | # Add weight decay at the end (fixed version) 100 | if group['weight_decay'] > 0.0: 101 | p.data.add_(-group['lr'] * group['weight_decay'], p.data) 102 | 103 | return loss 104 | -------------------------------------------------------------------------------- /CLIP-ViP/src/modeling/VidCLIP.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from functools import partial 4 | from transformers.models.clip.configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig 5 | from src.modeling.CLIP_ViP import CLIPModel, clip_loss 6 | from src.modeling.CLIP import CLIPModel as CLIP 7 | 8 | class VidCLIP(nn.Module): 9 | def __init__(self, args): 10 | super(VidCLIP, self).__init__() 11 | clipconfig = CLIPConfig.from_pretrained(args.clip_config) 12 | setattr(clipconfig, "vision_additional_config", args.clip_vision_additional_config) 13 | self.vision_additional_config = args.clip_vision_additional_config 14 | if args.clip_weights: 15 | if self.vision_additional_config.type == "ViP": 16 | self.clipmodel = CLIPModel.from_pretrained(args.clip_weights, config=clipconfig) 17 | else: 18 | self.clipmodel = CLIP.from_pretrained(args.clip_weights, config=clipconfig) 19 | else: 20 | if self.vision_additional_config.type == "ViP": 21 | self.clipmodel = CLIPModel(clipconfig) 22 | else: 23 | self.clipmodel = CLIP(clipconfig) 24 | 25 | # init logit scale from 26 | logit_scale_value = self.vision_additional_config.logit_scale_init_value 27 | self.clipmodel.logit_scale.data.fill_(logit_scale_value) 28 | 29 | def overload_logit_scale(self, overload_logit_scale): 30 | self.clipmodel.logit_scale.data.fill_(overload_logit_scale) 31 | 32 | def forward(self, video, text_input_ids, text_input_mask, \ 33 | image=None, caption_ids=None, caption_masks=None): 34 | """ 35 | video [B, n_clips*num_frms, C, H, W] 36 | text_input_ids [B, L] 37 | text_input_mask [B, L] 38 | image [B, img_num, C, H, W] 39 | caption_ids [B, img_num, L] 40 | caption_masks [B, img_num, L] 41 | """ 42 | B, N, C, H, W = video.shape 43 | 44 | if self.vision_additional_config.type == "ViP": 45 | inputs = {"input_ids": text_input_ids, 46 | "attention_mask": text_input_mask, 47 | "pixel_values": video, 48 | "return_loss": False} 49 | outputs = self.clipmodel(**inputs) 50 | results = {} 51 | results["text_features"] = outputs["text_embeds"] 52 | results["vis_features"] = outputs["image_embeds"] 53 | # results["loss"] = outputs["loss"] 54 | else: 55 | video = video.reshape(-1, C, H, W) 56 | inputs = {"input_ids": text_input_ids, 57 | "attention_mask": text_input_mask, 58 | "pixel_values": video} 59 | outputs = self.clipmodel(**inputs) 60 | vis_features = outputs["vision_model_output"][1] 61 | 62 | vis_features = self.clipmodel.visual_projection(vis_features) 63 | vis_features = vis_features / vis_features.norm(dim=-1, keepdim=True) 64 | vis_features = vis_features.reshape(B, N, -1).mean(1) 65 | vis_features = vis_features / vis_features.norm(dim=-1, keepdim=True) 66 | 67 | results = {} 68 | results["text_features"] = outputs["text_embeds"] 69 | results["vis_features"] = vis_features 70 | if image is not None: 71 | B, img_num, C, H, W = image.shape 72 | L = caption_ids.shape[-1] 73 | inputs = {"input_ids": caption_ids.reshape(-1, L), 74 | "attention_mask": caption_masks.reshape(-1, L), 75 | "pixel_values": image.reshape(-1, 1, C, H, W), 76 | "return_loss": False} 77 | outputs = self.clipmodel(**inputs) 78 | results["img_features"] = outputs["image_embeds"] 79 | 
results["cap_features"] = outputs["text_embeds"] 80 | 81 | return results 82 | 83 | def forward_video(self, video): 84 | inputs = {"pixel_values": video, 85 | "if_norm": True} 86 | video_features = self.clipmodel.get_image_features(**inputs) 87 | return video_features 88 | 89 | def forward_text(self, text_input_ids, text_input_mask): 90 | inputs = {"input_ids": text_input_ids, 91 | "attention_mask": text_input_mask, 92 | "if_norm": True} 93 | text_features = self.clipmodel.get_text_features(**inputs) 94 | return text_features 95 | 96 | def freeze_text_encoder(self, freeze_text_proj): 97 | freeze_list = [self.clipmodel.text_model] 98 | if freeze_text_proj: 99 | freeze_list.append(self.clipmodel.text_projection) 100 | for m in freeze_list: 101 | m.eval() 102 | for param in m.parameters(): 103 | param.requires_grad = False 104 | 105 | --------------------------------------------------------------------------------