├── LF-VILA ├── src │ ├── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ ├── video_classification_dataset.py │ │ └── actnet_qa_dataset.py │ ├── utils │ │ ├── __init__.py │ │ ├── metrics.py │ │ ├── misc.py │ │ ├── data.py │ │ ├── logger.py │ │ └── dist.py │ ├── optimization │ │ ├── __init__.py │ │ ├── optimizer.py │ │ └── lr_scheduler.py │ ├── models │ │ ├── __init__.py │ │ └── lfvila_video_classification.py │ ├── tools │ │ └── __init__.py │ ├── configs │ │ ├── bert_config.json │ │ ├── bert_large_config.json │ │ ├── queryd_ret.yaml │ │ ├── violin_qa.yaml │ │ ├── actnet_qa.yaml │ │ ├── didemo_ret.yaml │ │ ├── cmovie_ret.yaml │ │ ├── actnet_ret.yaml │ │ ├── coin_cls.yaml │ │ ├── lvu_scene_cls.yaml │ │ ├── lvu_relationship_cls.yaml │ │ ├── how2_qa.yaml │ │ ├── lvu_wayspeaking_cls.yaml │ │ ├── pretrain_stage1.yaml │ │ └── pretrain_stage2.yaml │ └── tasks │ │ ├── run_video_classification.py │ │ └── run_retrieval.py ├── figs │ ├── framework.png │ └── data_example.png ├── docker │ └── requirements.txt ├── setup.sh ├── scripts │ └── download_data.sh ├── launch_container.sh └── .gitignore ├── hd-vila ├── src │ ├── __init__.py │ ├── datasets │ │ └── __init__.py │ ├── modeling │ │ └── __init__.py │ ├── utils │ │ ├── __init__.py │ │ ├── metrics.py │ │ ├── misc.py │ │ ├── logger.py │ │ └── stop_words.py │ ├── optimization │ │ ├── __init__.py │ │ └── sched.py │ └── configs │ │ ├── base_model.json │ │ ├── base_model_large.json │ │ ├── pretrain_stage2.json │ │ ├── msrvtt_qa.json │ │ ├── pretrain_stage1.json │ │ ├── tgif_frame_qa.json │ │ ├── tgif_action_qa.json │ │ ├── tgif_transition_qa.json │ │ ├── lsmdc_retrieval.json │ │ ├── didemo_retrieval.json │ │ ├── actnet_retrieval.json │ │ └── msrvtt_retrieval.json ├── figs │ └── framework.png ├── setup.sh ├── scripts │ ├── process_raw_video │ │ ├── gif2mp4.py │ │ ├── decode_frames.py │ │ └── compress_video.py │ └── download_data.sh └── launch_container.sh ├── CLIP-ViP ├── src │ ├── __init__.py │ ├── datasets │ │ └── __init__.py │ ├── modeling │ │ ├── __init__.py │ │ └── VidCLIP.py │ ├── utils │ │ ├── __init__.py │ │ ├── misc.py │ │ ├── metrics.py │ │ ├── logger.py │ │ └── stop_words.py │ ├── optimization │ │ ├── __init__.py │ │ ├── sched.py │ │ └── adamw.py │ └── configs │ │ ├── lsmdc_retrieval │ │ ├── lsmdc_retrieval_vip_base_16.json │ │ └── lsmdc_retrieval_vip_base_32.json │ │ ├── didemo_retrieval │ │ ├── didemo_retrieval_vip_base_32.json │ │ └── didemo_retrieval_vip_base_16.json │ │ ├── msrvtt_retrieval │ │ ├── msrvtt_retrieval_vip_base_16.json │ │ └── msrvtt_retrieval_vip_base_32.json │ │ ├── actnet_retrieval │ │ ├── actnet_retrieval_vip_base_16.json │ │ └── actnet_retrieval_vip_base_32.json │ │ └── pretrain │ │ ├── pretrain_vip_base_16.json │ │ └── pretrain_vip_base_32.json ├── setup.sh ├── launch_container.sh └── LICENSE ├── hd-vila-100m ├── figs │ ├── statics.png │ └── examples.png ├── README.md ├── LICENSE └── src │ └── cut_videos.py ├── visualparsing ├── visualparsing.png └── README.md ├── CODE_OF_CONDUCT.md ├── SUPPORT.md ├── RAI.md ├── SECURITY.md └── README.md /LF-VILA/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hd-vila/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CLIP-ViP/src/__init__.py: -------------------------------------------------------------------------------- 1 
| -------------------------------------------------------------------------------- /CLIP-ViP/src/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CLIP-ViP/src/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CLIP-ViP/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LF-VILA/src/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LF-VILA/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hd-vila/src/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hd-vila/src/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hd-vila/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CLIP-ViP/src/optimization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LF-VILA/src/optimization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hd-vila/src/optimization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LF-VILA/figs/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/XPretrain/HEAD/LF-VILA/figs/framework.png -------------------------------------------------------------------------------- /hd-vila/figs/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/XPretrain/HEAD/hd-vila/figs/framework.png -------------------------------------------------------------------------------- /LF-VILA/figs/data_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/XPretrain/HEAD/LF-VILA/figs/data_example.png -------------------------------------------------------------------------------- /hd-vila-100m/figs/statics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/XPretrain/HEAD/hd-vila-100m/figs/statics.png -------------------------------------------------------------------------------- /hd-vila-100m/figs/examples.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/XPretrain/HEAD/hd-vila-100m/figs/examples.png -------------------------------------------------------------------------------- /visualparsing/visualparsing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/XPretrain/HEAD/visualparsing/visualparsing.png -------------------------------------------------------------------------------- /CLIP-ViP/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # source setup.sh 4 | export DIR_PWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 5 | export PYTHONPATH="$PYTHONPATH:$DIR_PWD" 6 | 7 | echo $PYTHONPATH 8 | -------------------------------------------------------------------------------- /hd-vila/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # source setup.sh 4 | export DIR_PWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 5 | export PYTHONPATH="$PYTHONPATH:$DIR_PWD" 6 | 7 | echo $PYTHONPATH 8 | -------------------------------------------------------------------------------- /LF-VILA/docker/requirements.txt: -------------------------------------------------------------------------------- 1 | jsonlines 2 | deepspeed==0.5.8 3 | transformers==4.30.0 4 | timm==0.4.12 5 | einops==0.3.2 6 | jsonlines==3.0.0 7 | tensorboardX==2.4.1 8 | decord==0.6.0 9 | easydict==1.9 10 | ruamel_yaml -------------------------------------------------------------------------------- /LF-VILA/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # source setup.sh 4 | export DIR_PWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 5 | export PYTHONPATH="$PYTHONPATH:$DIR_PWD" 6 | 7 | pip install lmdb 8 | 9 | echo $PYTHONPATH 10 | -------------------------------------------------------------------------------- /LF-VILA/src/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .lfvila_pretrain import LFVILA_Pretrain 2 | from .lfvila_qa_multichoice import LFVILA_QA_Multichoice 3 | from .lfvila_qa_classification import LFVILA_QA_Classification 4 | from .lfvila_retrieval import LFVILA_Retrieval 5 | from .lfvila_video_classification import LFVILA_Video_Classification -------------------------------------------------------------------------------- /LF-VILA/src/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .trainer_pretrain import Trainer_Pretrain 2 | from .trainer_qa_multichoice import Trainer_QA_Multichoice 3 | from .trainer_qa_classification import Trainer_QA_Classification 4 | from .trainer_retrieval import Trainer_Retrieval 5 | from .trainer_video_classification import Trainer_Video_Classification -------------------------------------------------------------------------------- /LF-VILA/scripts/download_data.sh: -------------------------------------------------------------------------------- 1 | # Download Models and Data: 2 | DOWNLOAD=$1 3 | 4 | BLOB='https://hdvila.blob.core.windows.net/dataset/lfvila_release.zip?sp=r&st=2023-03-16T05:01:27Z&se=2027-03-01T13:01:27Z&spr=https&sv=2021-12-02&sr=b&sig=lxR7bZ4i3Jpm4Z93u%2BgqhGvfF6DZ4hyRgPFwhwO9i78%3D' 5 | 6 | wget -nc $BLOB -O $DOWNLOAD/lfvila_release.zip 7 | unzip $DOWNLOAD/lfvila_release.zip -d $DOWNLOAD 
-------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /hd-vila/scripts/process_raw_video/gif2mp4.py: -------------------------------------------------------------------------------- 1 | import moviepy.editor as mp 2 | import os 3 | import glob 4 | 5 | def gif2mp4(gif_dir, mp4_dir): 6 | gifs = glob.glob(os.path.join(gif_dir, "*.gif")) 7 | for gif in gifs: 8 | clip = mp.VideoFileClip(gif) 9 | target_path = os.path.join(mp4_dir, os.path.basename(gif).replace(".gif", ".mp4")) 10 | clip.write_videofile(target_path) 11 | 12 | 13 | if __name__ == "__main__": 14 | gif2mp4("path/to/gifs", "path/to/mp4") -------------------------------------------------------------------------------- /CLIP-ViP/launch_container.sh: -------------------------------------------------------------------------------- 1 | DATA_DIR=$1 2 | 3 | if [ -z $CUDA_VISIBLE_DEVICES ]; then 4 | CUDA_VISIBLE_DEVICES='all' 5 | fi 6 | 7 | docker run --gpus '"'device=$CUDA_VISIBLE_DEVICES'"' --ipc=host --rm -it \ 8 | --mount src=$(pwd),dst=/VidCLIP,type=bind \ 9 | --mount src=$DATA_DIR,dst=/blob_mount,type=bind \ 10 | -e NVIDIA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \ 11 | -w /VidCLIP tiankaihang/azureml_docker:horovod \ 12 | bash -c "source /VidCLIP/setup.sh && export OMPI_MCA_btl_vader_single_copy_mechanism=none && bash" 13 | 14 | -------------------------------------------------------------------------------- /LF-VILA/launch_container.sh: -------------------------------------------------------------------------------- 1 | DATA_DIR=$1 2 | NAME=$2 3 | 4 | 5 | if [ -z $CUDA_VISIBLE_DEVICES ]; then 6 | CUDA_VISIBLE_DEVICES='all' 7 | fi 8 | 9 | docker run --gpus device=$CUDA_VISIBLE_DEVICES --ipc=host --rm -it \ 10 | --name $NAME \ 11 | --shm-size=128g \ 12 | --mount src=$(pwd),dst=/LF-VILA,type=bind \ 13 | --mount src=$DATA_DIR,dst=/blob_mount,type=bind \ 14 | -e NVIDIA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \ 15 | -w /LF-VILA ycsun1972/azureml_docker:horovod_deepspeed_v2 \ 16 | bash -c "source /LF-VILA/setup.sh && bash" 17 | -------------------------------------------------------------------------------- /hd-vila/launch_container.sh: -------------------------------------------------------------------------------- 1 | DATA_DIR=$1 2 | 3 | if [ -z $CUDA_VISIBLE_DEVICES ]; then 4 | CUDA_VISIBLE_DEVICES='all' 5 | fi 6 | 7 | docker run --gpus '"'device=$CUDA_VISIBLE_DEVICES'"' --ipc=host --rm -it \ 8 | --mount src=$(pwd),dst=/HD-VILA,type=bind \ 9 | --mount src=$DATA_DIR,dst=/data_mount,type=bind \ 10 | -e NVIDIA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \ 11 | -w /HD-VILA tiankaihang/azureml_docker:horovod \ 12 | bash -c "source /HD-VILA/setup.sh && export OMPI_MCA_btl_vader_single_copy_mechanism=none && bash" 13 | 14 | -------------------------------------------------------------------------------- /LF-VILA/src/utils/metrics.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def compute_rt_metrics(x): 5 | sx = np.sort(-x, axis=1) 6 | d = np.diag(-x) 7 | d = d[:, np.newaxis] 8 | ind = sx - d 9 | ind = np.where(ind == 0) 10 | ind = ind[1] 11 | r1 = float(np.sum(ind == 0)) / len(ind) 12 | r5 = float(np.sum(ind < 5)) / len(ind) 13 | r10 = float(np.sum(ind < 10)) / len(ind) 14 | r50 = float(np.sum(ind < 50)) / len(ind) 15 | medr = np.median(ind) + 1 16 | meanr = np.mean(ind) + 1 17 | return r1, r5, r10, r50, medr, meanr 18 | 19 | 20 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "gradient_checkpointing": false, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "hidden_size": 768, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 3072, 12 | "layer_norm_eps": 1e-12, 13 | "max_position_embeddings": 512, 14 | "model_type": "bert", 15 | "num_attention_heads": 12, 16 | "num_hidden_layers": 12, 17 | "pad_token_id": 0, 18 | "position_embedding_type": "absolute", 19 | "transformers_version": "4.6.0.dev0", 20 | "type_vocab_size": 2, 21 | "use_cache": true, 22 | "vocab_size": 30522 23 | } 24 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/bert_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertForMaskedLM" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "gradient_checkpointing": false, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "hidden_size": 1024, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 4096, 12 | "layer_norm_eps": 1e-12, 13 | "max_position_embeddings": 512, 14 | "model_type": "bert", 15 | "num_attention_heads": 16, 16 | "num_hidden_layers": 24, 17 | "pad_token_id": 0, 18 | "position_embedding_type": "absolute", 19 | "transformers_version": "4.6.0.dev0", 20 | "type_vocab_size": 2, 21 | "use_cache": true, 22 | "vocab_size": 30522 23 | } 24 | -------------------------------------------------------------------------------- /hd-vila/src/configs/base_model.json: -------------------------------------------------------------------------------- 1 | { 2 | "max_temporal_position_embeddings": 100, 3 | "backbone_channel_in_size": 2048, 4 | "max_grid_row_position_embeddings": 100, 5 | "max_grid_col_position_embeddings": 100, 6 | "attention_probs_dropout_prob": 0.1, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "hidden_size": 768, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 3072, 12 | "layer_norm_eps": 1e-12, 13 | "max_position_embeddings": 512, 14 | "model_type": "bert", 15 | "num_attention_heads": 12, 16 | "num_hidden_layers": 12, 17 | "pad_token_id": 0, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522 20 | } 21 | -------------------------------------------------------------------------------- /hd-vila/src/configs/base_model_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "max_temporal_position_embeddings": 100, 3 | "backbone_channel_in_size": 2048, 4 | "max_grid_row_position_embeddings": 100, 5 | "max_grid_col_position_embeddings": 100, 6 | "attention_probs_dropout_prob": 0.1, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 
0.1, 9 | "hidden_size": 1024, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 4096, 12 | "layer_norm_eps": 1e-12, 13 | "max_position_embeddings": 512, 14 | "model_type": "bert", 15 | "num_attention_heads": 16, 16 | "num_hidden_layers": 24, 17 | "pad_token_id": 0, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522 20 | 21 | } 22 | -------------------------------------------------------------------------------- /CLIP-ViP/src/utils/misc.py: -------------------------------------------------------------------------------- 1 | """ 2 | modified from UNITER 3 | """ 4 | import json 5 | import random 6 | import sys 7 | 8 | import torch 9 | import numpy as np 10 | 11 | 12 | class NoOp(object): 13 | """ useful for distributed training No-Ops """ 14 | def __getattr__(self, name): 15 | return self.noop 16 | 17 | def noop(self, *args, **kwargs): 18 | return 19 | 20 | 21 | def set_random_seed(seed): 22 | random.seed(seed) 23 | np.random.seed(seed) 24 | torch.manual_seed(seed) 25 | torch.cuda.manual_seed_all(seed) 26 | 27 | 28 | def zero_none_grad(model): 29 | for p in model.parameters(): 30 | if p.grad is None and p.requires_grad: 31 | p.grad = p.data.new(p.size()).zero_() 32 | -------------------------------------------------------------------------------- /hd-vila/scripts/download_data.sh: -------------------------------------------------------------------------------- 1 | # Download Models: 2 | # 1, pretrained model 3 | DOWNLOAD=$1 4 | 5 | BLOB='https://hdvila.blob.core.windows.net/dataset/pretrained.zip?sp=r&st=2022-09-13T08:25:54Z&se=2024-12-31T16:25:54Z&spr=https&sv=2021-06-08&sr=b&sig=Zt8vmQ%2F5wU35507Dar4i4Qsk3dqf15aEBQOS4QqUUrc%3D' 6 | 7 | # 1, pretrained model 8 | wget -nc $BLOB -O $DOWNLOAD/pretrained.zip 9 | unzip $DOWNLOAD/pretrained.zip -d $DOWNLOAD 10 | 11 | BLOB='https://hdvila.blob.core.windows.net/dataset/data.zip?sp=r&st=2022-09-13T02:35:13Z&se=2024-12-31T10:35:13Z&spr=https&sv=2021-06-08&sr=b&sig=BjQXSegSvllCLpx%2B%2FhDH6VwVDE0e2XHZ%2FqwAo5ZpyeQ%3D' 12 | 13 | # 2, downstream dataset 14 | wget -nc $BLOB -O $DOWNLOAD/data.zip 15 | unzip $DOWNLOAD/data.zip -d $DOWNLOAD 16 | mv $DOWNLOAD/downstream_data $DOWNLOAD/data 17 | -------------------------------------------------------------------------------- /hd-vila/src/utils/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def cal_cossim(feats1, feats2): 4 | sim_matrix = np.dot(feats1, feats2.T) 5 | return sim_matrix 6 | 7 | 8 | def compute_metrics(x): 9 | sx = np.sort(-x, axis=1) 10 | d = np.diag(-x) 11 | d = d[:, np.newaxis] 12 | ind = sx - d 13 | ind = np.where(ind == 0) 14 | ind = ind[1] 15 | r1 = float(np.sum(ind == 0)) / len(ind) 16 | r5 = float(np.sum(ind < 5)) / len(ind) 17 | r10 = float(np.sum(ind < 10)) / len(ind) 18 | r50 = float(np.sum(ind < 50)) / len(ind) 19 | medr = np.median(ind) + 1 20 | meanr = np.mean(ind) + 1 21 | return r1, r5, r10, r50, medr, meanr 22 | 23 | 24 | if __name__ == '__main__': 25 | 26 | sim_matrix = np.random.random((5,5)) 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /hd-vila/src/utils/misc.py: -------------------------------------------------------------------------------- 1 | """ 2 | modified from UNITER 3 | """ 4 | import json 5 | import random 6 | import sys 7 | 8 | import torch 9 | import numpy as np 10 | 11 | 12 | class NoOp(object): 13 | """ useful for distributed training No-Ops """ 14 | def __getattr__(self, name): 15 | return self.noop 16 | 17 | def noop(self, 
*args, **kwargs): 18 | return 19 | 20 | 21 | def set_random_seed(seed): 22 | random.seed(seed) 23 | np.random.seed(seed) 24 | torch.manual_seed(seed) 25 | torch.cuda.manual_seed_all(seed) 26 | 27 | 28 | def zero_none_grad(model): 29 | HAS_NAN = False 30 | for p in model.parameters(): 31 | if p.grad is None and p.requires_grad: 32 | HAS_NAN = True 33 | p.grad = p.data.new(p.size()).zero_() 34 | return HAS_NAN -------------------------------------------------------------------------------- /LF-VILA/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | 3 | # script 4 | tmp_all/script/ 5 | 6 | # Philly-realted # 7 | pt/ 8 | .ptconfig 9 | 10 | 11 | 12 | # Project-related # 13 | */*results*/ 14 | *results*/ 15 | tmp*/ 16 | cache/* 17 | */cache*/ 18 | tmp*.py 19 | *pickle 20 | 21 | # compiled files # 22 | *.pyc 23 | **/__pycache__/ 24 | 25 | # Packages # 26 | ############ 27 | # it's better to unpack these files and commit the raw source 28 | # git has its own built in compression methods 29 | *.7z 30 | *.dmg 31 | *.gz 32 | *.iso 33 | *.jar 34 | *.rar 35 | *.tar 36 | *.zip 37 | 38 | # Logs and databases # 39 | ###################### 40 | *.log 41 | *.sql 42 | *.sqlite 43 | .ipynb_checkpoints/ 44 | *.swp 45 | *.vscode/ 46 | *.idea/ 47 | 48 | # OS generated files # 49 | ###################### 50 | .DS_Store 51 | .DS_Store? 52 | ._* 53 | .Spotlight-V100 54 | .Trashes 55 | ehthumbs.db 56 | Thumbs.db 57 | -------------------------------------------------------------------------------- /LF-VILA/src/optimization/optimizer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | 4 | 5 | 6 | def build_optimizer_parameters(config, model): 7 | 8 | param_optimizer = list(model.named_parameters()) 9 | param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] 10 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'pos_embed','relative_position_bias_table'] 11 | 12 | if "weight_decay" in config.TRAINING.keys(): 13 | weight_decay = config.TRAINING["weight_decay"] 14 | else: 15 | weight_decay = 0.01 16 | 17 | 18 | optimizer_grouped_parameters = [{ 19 | 'params': [ 20 | p for n, p in param_optimizer 21 | if not any(nd in n for nd in no_decay) and p.requires_grad 22 | ], 23 | 'weight_decay': 24 | weight_decay 25 | }, { 26 | 'params': 27 | [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and p.requires_grad], 28 | 'weight_decay': 29 | 0.0 30 | }] 31 | 32 | return optimizer_grouped_parameters -------------------------------------------------------------------------------- /CLIP-ViP/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Jie Lei 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /LF-VILA/src/utils/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | import torch 5 | import einops 6 | 7 | def mkdirp(p): 8 | if not os.path.exists(p): 9 | os.makedirs(p) 10 | 11 | def set_random_seed(seed): 12 | random.seed(seed) 13 | np.random.seed(seed) 14 | torch.manual_seed(seed) 15 | torch.cuda.manual_seed_all(seed) 16 | 17 | def vector_gather(vectors, indices): 18 | """ 19 | Gathers (batched) vectors according to indices. 
20 | Arguments: 21 | vectors: Tensor[N, L, D] 22 | indices: Tensor[N, K] or Tensor[N] 23 | Returns: 24 | Tensor[N, K, D] or Tensor[N, D] 25 | """ 26 | N, L, D = vectors.shape 27 | squeeze = False 28 | if indices.ndim == 1: 29 | squeeze = True 30 | indices = indices.unsqueeze(-1) 31 | N2, K = indices.shape 32 | assert N == N2 33 | indices = einops.repeat(indices, "N K -> N K D", D=D) 34 | out = torch.gather(vectors, dim=1, index=indices) 35 | if squeeze: 36 | out = out.squeeze(1) 37 | return out 38 | 39 | class AverageMeter(object): 40 | """Computes and stores the average and current/max/min value""" 41 | def __init__(self): 42 | self.val = 0 43 | self.avg = 0 44 | self.sum = 0 45 | self.count = 0 46 | self.max = -1e10 47 | self.min = 1e10 48 | self.reset() 49 | 50 | def reset(self): 51 | self.val = 0 52 | self.avg = 0 53 | self.sum = 0 54 | self.count = 0 55 | self.max = -1e10 56 | self.min = 1e10 57 | 58 | def update(self, val, n=1): 59 | self.max = max(val, self.max) 60 | self.min = min(val, self.min) 61 | self.val = val 62 | self.sum += val * n 63 | self.count += n 64 | self.avg = self.sum / self.count -------------------------------------------------------------------------------- /visualparsing/README.md: -------------------------------------------------------------------------------- 1 | # Visual Parsing 2 | 3 | [Probing Inter-modality: Visual Parsing with Self-Attention for Vision-and-Language Pre-training](https://proceedings.neurips.cc/paper/2021/file/23fa71cc32babb7b91130824466d25a5-Paper.pdf) accepted by [NeurIPS 2021](https://nips.cc/Conferences/2021/). 4 | 5 | By [Hongwei Xue](https://hellwayxue.github.io/), [Yupan Huang](https://hypjudy.github.io/), [Bei Liu](https://www.microsoft.com/en-us/research/people/libei/), [Houwen Peng](https://www.microsoft.com/en-us/research/people/hopeng/), [Jianlong Fu](https://www.microsoft.com/en-us/research/people/jianf/), [Houqiang Li](http://staff.ustc.edu.cn/~lihq/en/), and [Jiebo Luo](https://www.cs.rochester.edu/u/jluo/). 6 | 7 | ## Introdution 8 | 9 | We propose a fully Transformer visual embedding for Vision-Language Pre-training (VLP) to 10 | better learn visual relation and further promote inter-modal alignment. Specifically, 11 | we propose a metric named Inter-Modality Flow (IMF) to measure the interaction 12 | between vision and language (i.e., inter-modality). We also design a novel masking 13 | optimization mechanism named Masked Feature Regression (MFR) in Transformer 14 | to further promote the inter-modality learning. 15 | 16 |

17 | ![framework](visualparsing.png) 18 | 19 | 20 | The framework of Visual Parsing. 21 |
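The introduction above names Inter-Modality Flow (IMF) as a measure of vision-language interaction but does not spell out its formula in this README. As a loose, hypothetical illustration only (not the paper's IMF; the function name, shapes, and the attention-share idea below are assumptions of this sketch), cross-modal interaction in a fused Transformer layer can be proxied by the share of self-attention mass that crosses between text and visual tokens:

```python
# Hypothetical sketch only: this is NOT the IMF formulation from the paper.
# Given row-normalized self-attention over a concatenated [text; visual]
# token sequence, report the fraction of attention that crosses modalities.
import numpy as np

def cross_modal_attention_share(attn: np.ndarray, num_text_tokens: int) -> float:
    """attn: (T, T) attention weights whose rows sum to 1; the first
    `num_text_tokens` positions are text tokens, the rest are visual tokens."""
    t = num_text_tokens
    text_to_visual = attn[:t, t:].sum()   # text queries attending to visual keys
    visual_to_text = attn[t:, :t].sum()   # visual queries attending to text keys
    return float((text_to_visual + visual_to_text) / attn.sum())

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    raw = rng.random((10, 10))
    attn = raw / raw.sum(axis=1, keepdims=True)  # rows sum to 1, like softmax
    print(cross_modal_attention_share(attn, num_text_tokens=4))
```

A larger value under this toy proxy simply means more attention is exchanged across modalities; consult the paper linked above for the actual IMF definition.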

22 | 23 | ## Citing Our Paper 24 | 25 | If you find our work useful for your research, please consider citing our paper. :blush: 26 | 27 | ```bibtex 28 | @article{xue2021probing, 29 | title={Probing Inter-modality: Visual Parsing with Self-Attention for Vision-and-Language Pre-training}, 30 | author={Xue, Hongwei and Huang, Yupan and Liu, Bei and Peng, Houwen and Fu, Jianlong and Li, Houqiang and Luo, Jiebo}, 31 | journal={Advances in Neural Information Processing Systems}, 32 | volume={34}, 33 | year={2021} 34 | } 35 | ``` 36 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/lsmdc_retrieval/lsmdc_retrieval_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "lsmdc-101k", 5 | "vis_format": "video", 6 | "txt": "clip_data/vis_db/lsmdc/train_101k_frame.jsonl", 7 | "vis": "datasets/lsmdc" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "lsmdc-1k", 13 | "vis_format": "video", 14 | "txt": "clip_data/vis_db/lsmdc/test_1k_frame.jsonl", 15 | "vis": "datasets/lsmdc" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "lsmdc-1k", 21 | "vis_format": "video", 22 | "txt": "clip_data/vis_db/lsmdc/test_1k_frame.jsonl", 23 | "vis": "datasets/lsmdc" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch16", 39 | "clip_config": "openai/clip-vit-base-patch16", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 10, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/lsmdc_retrieval/lsmdc_retrieval_vip_base_16", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/lsmdc_retrieval/lsmdc_retrieval_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "lsmdc-101k", 5 | "vis_format": "video", 6 | "txt": "clip_data/vis_db/lsmdc/train_101k_frame.jsonl", 7 | "vis": "datasets/lsmdc" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "lsmdc-1k", 13 | "vis_format": "video", 14 | "txt": "clip_data/vis_db/lsmdc/test_1k_frame.jsonl", 15 | "vis": "datasets/lsmdc" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "lsmdc-1k", 21 | "vis_format": "video", 22 | "txt": 
"clip_data/vis_db/lsmdc/test_1k_frame.jsonl", 23 | "vis": "datasets/lsmdc" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/32/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch32", 39 | "clip_config": "openai/clip-vit-base-patch32", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 10, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/lsmdc_retrieval/lsmdc_retrieval_vip_base_32", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/didemo_retrieval/didemo_retrieval_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "didemo-train", 5 | "vis_format": "video", 6 | "txt": "datasets/lfvideo_data/task/didemo/train.jsonl", 7 | "vis": "datasets/didemo/didemo_video_xfps/" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "didemo-val", 13 | "vis_format": "video", 14 | "txt": "datasets/lfvideo_data/task/didemo/val.jsonl", 15 | "vis": "datasets/didemo/didemo_video_xfps/" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "didemo-test", 21 | "vis_format": "video", 22 | "txt": "datasets/lfvideo_data/task/didemo/test.jsonl", 23 | "vis": "datasets/didemo/didemo_video_xfps/" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/32/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch32", 39 | "clip_config": "openai/clip-vit-base-patch32", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": 
"cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 20, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/didemo_retrieval/didemo_retrieval_vip_base_32", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/didemo_retrieval/didemo_retrieval_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "didemo-train", 5 | "vis_format": "video", 6 | "txt": "datasets/lfvideo_data/task/didemo/train.jsonl", 7 | "vis": "datasets/didemo/didemo_video_xfps/" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "didemo-val", 13 | "vis_format": "video", 14 | "txt": "datasets/lfvideo_data/task/didemo/val.jsonl", 15 | "vis": "datasets/didemo/didemo_video_xfps/" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "didemo-test", 21 | "vis_format": "video", 22 | "txt": "datasets/lfvideo_data/task/didemo/test.jsonl", 23 | "vis": "datasets/didemo/didemo_video_xfps/" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 70, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch16", 39 | "clip_config": "openai/clip-vit-base-patch16", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 10, 63 | "lr_mul_prefix": "logit_scale", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 20, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/didemo_retrieval/didemo_retrieval_vip_base_16", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /LF-VILA/src/utils/data.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def mask_batch_text_tokens( 4 | inputs, tokenizer, mlm_probability=0.15, is_train=True): 5 | """ modified from transformers.data.data_collator 6 | Args: 7 | inputs: (B, L), 2D torch.Tensor, does not work for 1D. It has already been padded. 8 | tokenizer: 9 | mlm_probability: float 10 | is_train: if True use random masking, else mask tokens at fixed position to remove randomness in evaluation. 11 | """ 12 | if tokenizer.mask_token is None: 13 | raise ValueError( 14 | "This tokenizer does not have a mask token which is necessary for masked language modeling. 
" 15 | "Remove the --mlm flag if you want to use this tokenizer." 16 | ) 17 | 18 | labels = inputs.clone() 19 | # We sample a few tokens in each sequence for masked-LM training 20 | # (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) 21 | probability_matrix = torch.full(labels.shape, mlm_probability) 22 | special_tokens_mask = [ 23 | tokenizer.get_special_tokens_mask( 24 | val, already_has_special_tokens=True) for val in labels.tolist() 25 | ] 26 | probability_matrix.masked_fill_(torch.tensor( 27 | special_tokens_mask, dtype=torch.bool), value=0.0) 28 | if tokenizer._pad_token is not None: 29 | padding_mask = labels.eq(tokenizer.pad_token_id) 30 | probability_matrix.masked_fill_(padding_mask, value=0.0) 31 | masked_indices = torch.bernoulli(probability_matrix).bool() 32 | labels[~masked_indices] = -100 # We only compute loss on masked tokens 33 | 34 | # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) 35 | indices_replaced = torch.bernoulli( 36 | torch.full(labels.shape, 0.8)).bool() & masked_indices 37 | inputs[indices_replaced] = tokenizer.convert_tokens_to_ids( 38 | tokenizer.mask_token) 39 | 40 | # 10% of the time, we replace masked input tokens with random word 41 | indices_random = torch.bernoulli( 42 | torch.full(labels.shape, 0.5) 43 | ).bool() & masked_indices & ~indices_replaced 44 | random_words = torch.randint( 45 | len(tokenizer), labels.shape, 46 | dtype=torch.long) # len(tokenizer) == #vocab 47 | inputs[indices_random] = random_words[indices_random] 48 | 49 | # The rest of the time (10% of the time) we keep the masked input tokens unchanged 50 | return inputs, labels -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/msrvtt_retrieval/msrvtt_retrieval_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "msrvtt-9k", 5 | "vis_format": "video", 6 | "txt": "clip_data/vis_db/msrvtt_video_clips/train9k.jsonl", 7 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "msrvtt-1ka", 13 | "vis_format": "video", 14 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 15 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "msrvtt-1ka", 21 | "vis_format": "video", 22 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 23 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch16", 39 | "clip_config": "openai/clip-vit-base-patch16", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | 
"lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 100, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/msrvtt_retrieval/msrvtt_retrieval_vip_base_16", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/msrvtt_retrieval/msrvtt_retrieval_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "msrvtt-9k", 5 | "vis_format": "video", 6 | "txt": "clip_data/vis_db/msrvtt_video_clips/train9k.jsonl", 7 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "msrvtt-1ka", 13 | "vis_format": "video", 14 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 15 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "msrvtt-1ka", 21 | "vis_format": "video", 22 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 23 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 12, 29 | "test_n_clips": 1, 30 | "test_num_frms": 12, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 50, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/32/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch32", 39 | "clip_config": "openai/clip-vit-base-patch32", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 100, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/msrvtt_retrieval/msrvtt_retrieval_vip_base_32", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/actnet_retrieval/actnet_retrieval_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "actnet-train", 5 | "vis_format": "frame", 6 | "txt": "clip_data/vis_db/anet_retrieval/train.jsonl", 7 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "actnet-test", 13 | "vis_format": "frame", 14 | "txt": "clip_data/vis_db/anet_retrieval/val1.jsonl", 15 | "vis": 
"datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "actnet-test", 21 | "vis_format": "frame", 22 | "txt": "clip_data/vis_db/anet_retrieval/val1.jsonl", 23 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 32, 29 | "test_n_clips": 1, 30 | "test_num_frms": 32, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 70, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/16/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch16", 39 | "clip_config": "openai/clip-vit-base-patch16", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 8, 49 | "test_batch_size": 8, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | "amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 20, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/actnet_retrieval/actnet_retrieval_vip_base_16", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/actnet_retrieval/actnet_retrieval_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "actnet-train", 5 | "vis_format": "frame", 6 | "txt": "clip_data/vis_db/anet_retrieval/train.jsonl", 7 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "actnet-test", 13 | "vis_format": "frame", 14 | "txt": "clip_data/vis_db/anet_retrieval/val1.jsonl", 15 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "actnet-test", 21 | "vis_format": "frame", 22 | "txt": "clip_data/vis_db/anet_retrieval/val1.jsonl", 23 | "vis": "datasets/activitynet/ActivityNetVideoData2020Nov/video_frames_lr" 24 | } 25 | ], 26 | 27 | "train_n_clips": 1, 28 | "train_num_frms": 32, 29 | "test_n_clips": 1, 30 | "test_num_frms": 32, 31 | "sample_rate": 0, 32 | "sample_jitter": 1, 33 | "video_res": [240, 320], 34 | "input_res": [224, 224], 35 | "max_txt_len": 70, 36 | 37 | "e2e_weights_path": "path/to/CLIP-ViP-B/32/checkpoint", 38 | "clip_weights": "openai/clip-vit-base-patch32", 39 | "clip_config": "openai/clip-vit-base-patch32", 40 | "clip_vision_additional_config": { 41 | "type": "ViP", 42 | "temporal_size": 12, 43 | "if_use_temporal_embed": 1, 44 | "logit_scale_init_value": 4.60, 45 | "add_cls_num": 3 46 | }, 47 | 48 | "train_batch_size": 16, 49 | "test_batch_size": 16, 50 | "max_n_example_per_group": 1, 51 | "gradient_accumulation_steps": 1, 52 | "n_workers": 8, 53 | "pin_mem": 1, 54 | "fp16": 1, 55 | 
"amp_level": "O2", 56 | "seed": 42, 57 | 58 | "optim": "adamw", 59 | "betas": [0.9, 0.98], 60 | "learning_rate": 1e-6, 61 | "weight_decay": 0.2, 62 | "lr_mul": 1, 63 | "lr_mul_prefix": "", 64 | "loss_config": { 65 | "loss_name": "NCELearnableTempLoss", 66 | "if_gather": 1 67 | }, 68 | "warmup_ratio": 0.01, 69 | "decay": "cosine", 70 | "grad_norm": 1.0, 71 | 72 | "num_train_epochs": 20, 73 | "min_valid_steps": 1, 74 | "num_valid": 1, 75 | "only_valid_steps": 100, 76 | "save_steps_ratio": 0.9, 77 | "output_dir": "vidclip_data/output/actnet_retrieval/actnet_retrieval_vip_base_32", 78 | "if_tb_log": 0, 79 | "if_model_saver": 1, 80 | "if_log2file": 1, 81 | "dummy_data": 0 82 | } 83 | -------------------------------------------------------------------------------- /hd-vila/src/configs/pretrain_stage2.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "hdvila_pretrain", 5 | "vis_format": "video", 6 | "txt": "data/hdvila/train_group/part1.jsonl", 7 | "vis": "data/hdvila/video_clips" 8 | } 9 | ], 10 | "val_datasets": [ 11 | { 12 | "name": "msrvtt", 13 | "vis_format": "video", 14 | "txt": "data/msrvtt_retrieval/test1ka.jsonl", 15 | "vis": "data/msrvtt_retrieval/videos" 16 | }, 17 | { 18 | "name": "hdvila_test_full", 19 | "vis_format": "video", 20 | "txt": "data/hdvila/test_full_1k.jsonl", 21 | "vis": "data/hdvila/video_clips" 22 | } 23 | ], 24 | 25 | "model_config": "src/configs/base_model_large.json", 26 | "e2e_weights_path": "data/output/pretrain_stage1/ckpt/model_step_1129660.pt", 27 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 28 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 29 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 30 | "output_dir": "data/output/pretrain_stage2", 31 | "resnet_depth": 50, 32 | "resnet_frozen_stage": -1, 33 | "timesformer_depth": 4, 34 | "timesformer_heads": 16, 35 | "max_txt_len": 50, 36 | "score_agg_func": "lse", 37 | "loss_type": "ce", 38 | "train_n_clips": 2, 39 | "num_frm": 7, 40 | "sample_rate": 12, 41 | "crop_size": [640, 1024], 42 | "out_size": [256, 128, 64, 3], 43 | "train_batch_size": 16, 44 | "val_batch_size": 16, 45 | "max_n_example_per_group": 1, 46 | "gradient_accumulation_steps": 2, 47 | "num_train_epochs": 10, 48 | "min_valid_steps": 1, 49 | "num_valid": 100, 50 | "only_valid_steps": 500, 51 | "save_steps_ratio": 0.01, 52 | "learning_rate": 5e-5, 53 | "decay": "linear", 54 | "optim": "adamw", 55 | "betas": [0.9, 0.98], 56 | "dropout": 0.1, 57 | "weight_decay": 1e-3, 58 | "grad_norm": 5.0, 59 | "cnn_learning_rate": 5e-5, 60 | "cnn_weight_decay": 1e-3, 61 | "cnn_lr_decay": "linear", 62 | "align_learning_rate": 5e-5, 63 | "align_weight_decay": 1e-3, 64 | "pixel_random_sampling_size": 160, 65 | "seed":24, 66 | "fp16": 1, 67 | "amp_level": "O2", 68 | "freeze_s1": 1, 69 | "use_itm": 0, 70 | "use_itc": 0, 71 | "use_mlm": 1, 72 | 73 | "bert_mean": 1, 74 | "n_workers": 8, 75 | 76 | "backbone_channels": [256, 512, 1024, 2048], 77 | "backbone_channel_in_size": 2048, 78 | "hidden_size": 1024, 79 | 80 | "temp": 0.05 81 | } 82 | -------------------------------------------------------------------------------- /CLIP-ViP/src/utils/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def cal_cossim(feats1, feats2): 4 | sim_matrix = np.dot(feats1, feats2.T) 5 | return sim_matrix 6 | 7 | def np_softmax(X, theta = 1.0, axis = None): 8 | """ 9 | Compute 
the softmax of each element along an axis of X. 10 | 11 | Parameters 12 | ---------- 13 | X: ND-Array. Probably should be floats. 14 | theta (optional): float parameter, used as a multiplier 15 | prior to exponentiation. Default = 1.0 16 | axis (optional): axis to compute values along. Default is the 17 | first non-singleton axis. 18 | 19 | Returns an array the same size as X. The result will sum to 1 20 | along the specified axis. 21 | """ 22 | # make X at least 2d 23 | y = np.atleast_2d(X) 24 | # find axis 25 | if axis is None: 26 | axis = next(j[0] for j in enumerate(y.shape) if j[1] > 1) 27 | # multiply y against the theta parameter, 28 | y = y * float(theta) 29 | # subtract the max for numerical stability 30 | y = y - np.expand_dims(np.max(y, axis = axis), axis) 31 | # exponentiate y 32 | y = np.exp(y) 33 | # take the sum along the specified axis 34 | ax_sum = np.expand_dims(np.sum(y, axis = axis), axis) 35 | # finally: divide elementwise 36 | p = y / ax_sum 37 | # flatten if X was 1D 38 | if len(X.shape) == 1: p = p.flatten() 39 | return p 40 | 41 | def compute_metrics(x): 42 | sx = np.sort(-x, axis=1) 43 | d = np.diag(-x) 44 | d = d[:, np.newaxis] 45 | ind = sx - d 46 | ind = np.where(ind == 0) 47 | ind = ind[1] 48 | r1 = float(np.sum(ind == 0)) / len(ind) 49 | r5 = float(np.sum(ind < 5)) / len(ind) 50 | r10 = float(np.sum(ind < 10)) / len(ind) 51 | medr = np.median(ind) + 1 52 | meanr = np.mean(ind) + 1 53 | return r1, r5, r10, medr, meanr 54 | 55 | def compute_metrics_multi(x, t2v_labels_list): 56 | sx = np.sort(-x, axis=1) 57 | t2v_labels_list = np.array(t2v_labels_list) 58 | arg = np.arange(x.shape[0]) 59 | d = -x[arg, t2v_labels_list] 60 | d = d[:, np.newaxis] 61 | ind = sx - d 62 | ind = np.where(ind == 0) 63 | ind = ind[1] 64 | r1 = float(np.sum(ind == 0)) / len(ind) 65 | r5 = float(np.sum(ind < 5)) / len(ind) 66 | r10 = float(np.sum(ind < 10)) / len(ind) 67 | medr = np.median(ind) + 1 68 | meanr = np.mean(ind) + 1 69 | return r1, r5, r10, medr, meanr 70 | 71 | 72 | if __name__ == '__main__': 73 | 74 | sim_matrix = np.random.random((5,5)) 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /RAI.md: -------------------------------------------------------------------------------- 1 | Responsible AI Considerations 2 | 3 | The proposed video-language dataset and pre-training model shows the capacity and generalization of learned VL representation which could benefit many applications of CV and NLP with a large range of uses across many domains. Each one of the uses has potential benefits and societal impacts. While we foresee that our technology could be used to find key information and improve efficiency and effectiveness for helpdesks, recommendation, retail and sales, we realize that it could also be used, in combination with new data, to fine-tune models to mislead, or otherwise harm people. We are also aware that this work uses considerable computation resources which itself, has environmental impacts. Therefore reducing the model size and computing effort is essential for future research. 4 | 5 | Machine learning systems can display unfair behavior for different individuals or groups. This is a multi-dimensional, socio-technical challenge and is not explicitly addressed or captured in the current accuracy metrics for this research technology. In general, standardized fairness measures have not yet been agreed upon in academia or industry. 
We see opportunities for more work in this area to develop methods and benchmarks for measuring fairness aspects. 6 | 7 | Given that user generated data is used, it is possible that certain demographic groups may not have enough representation. While we have balanced various video categories to mitigate for disparities, it is still likely that bias and fairness issues exist; this is an area of potential future work. There may be a Western heteronormative bias, stereotypical depictions of historically marginalized populations and/or lack of representation among some groups. Although we have filtered the input data for explicit and violent content, it is possible that it hasn’t been totally eliminated in the training data and could have impacts on the results. 8 | 9 | With visual generation techniques it is particularly important to do further work to prevent malicious use to misinform or harm people. 10 | 11 | While some mitigations for potential harms can be done in the base model, it’s important to recognize that considering risks for fine tuning data for particular scenarios is critical as well. Ultimately, choosing the application scenario of any final model used in a production system will require careful consideration of potential harms specific to the scenario. 12 | 13 | For help or issues using the pre-trained models, please submit an issue or contact Bei Liu (bei.liu@microsoft.com) and Jianlong Fu (jianf@microsoft.com). 14 | -------------------------------------------------------------------------------- /hd-vila/src/configs/msrvtt_qa.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "msrvtt_qa", 5 | "txt": { 6 | "msrvtt_qa": "data/msrvtt_qa/train.jsonl" 7 | }, 8 | "vis": "data/msrvtt_retrieval/videos_6fps" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "name": "msrvtt_qa", 14 | "txt": { 15 | "msrvtt_qa": "data/msrvtt_qa/val.jsonl" 16 | }, 17 | "vis": "data/msrvtt_retrieval/videos_6fps" 18 | } 19 | ], 20 | "ans2label_path": "data/msrvtt_qa/train_ans2label.json", 21 | "max_txt_len": 100, 22 | "max_img_size": 448, 23 | "fps": 2, 24 | "reshape_size": [180, 288], 25 | "crop_size": [160, 256], 26 | "sample_rate": 4, 27 | "num_frm": 7, 28 | "train_n_clips": 1, 29 | "score_agg_func": "lse", 30 | "max_n_example_per_group": 1, 31 | "model_config": "src/configs/base_model_large.json", 32 | "e2e_weights_path": "data/pretrained/hdvila_stage2.pt", 33 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 34 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 35 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 36 | "output_dir": "data/output/videoqa/msrvtt_qa", 37 | "train_batch_size": 16, 38 | "val_batch_size": 16, 39 | "gradient_accumulation_steps": 4, 40 | "num_train_epochs": 20, 41 | "min_valid_steps": 1, 42 | "num_valid": 20, 43 | "save_steps_ratio": 0.2, 44 | "learning_rate": 1e-5, 45 | "weight_decay": 0.3, 46 | "decay": "linear", 47 | "optim": "adamw", 48 | "betas": [0.9, 0.98], 49 | "dropout": 0.3, 50 | "grad_norm": 5.0, 51 | "cnn_learning_rate": 1e-5, 52 | "cnn_weight_decay": 0.3, 53 | "cnn_lr_decay": "linear", 54 | "align_learning_rate": 1e-5, 55 | "align_weight_decay": 0.3, 56 | "seed":66, 57 | "fp16": 1, 58 | "classifier": "mlp", 59 | "cls_hidden_scale": 2, 60 | "task": "msrvtt_qa", 61 | 62 | "resnet_depth": 50, 63 | "resnet_frozen_stage": -1, 64 | "timesformer_depth": 4, 65 | "timesformer_heads": 16, 66 | "backbone_channels": [256, 512, 1024, 
2048], 67 | "backbone_downsample": [4, 8, 16, 32], 68 | "backbone_channel_in_size": 2048, 69 | "hidden_size": 1024, 70 | 71 | "inference_model_step": 0, 72 | "inference_txt_db": "data/txt_db/msrvtt_qa/test.jsonl", 73 | "inference_img_db": "data/vis_db/msrvtt_video_clips/videos_6fps", 74 | "inference_batch_size": 4, 75 | "inference_n_clips": 8 76 | } -------------------------------------------------------------------------------- /hd-vila/src/configs/pretrain_stage1.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "hdvila_pretrain", 5 | "vis_format": "video", 6 | "txt": "data/hdvila/train_group/part1.jsonl", 7 | "vis": "data/hdvila/video_clips" 8 | } 9 | ], 10 | "val_datasets": [ 11 | { 12 | "name": "msrvtt", 13 | "vis_format": "video", 14 | "txt": "data/msrvtt_retrieval/test1ka.jsonl", 15 | "vis": "data/msrvtt_retrieval/videos" 16 | }, 17 | { 18 | "name": "hdvila_test_how2", 19 | "vis_format": "video", 20 | "txt": "data/hdvila/test_howto_1k.jsonl", 21 | "vis": "data/hdvila/video_clips" 22 | }, 23 | { 24 | "name": "hdvila_test_full", 25 | "vis_format": "video", 26 | "txt": "data/hdvila/test_full_1k.jsonl", 27 | "vis": "data/hdvila/video_clips" 28 | } 29 | ], 30 | 31 | "model_config": "src/configs/base_model_large.json", 32 | "e2e_weights_path": null, 33 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 34 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 35 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 36 | "output_dir": "data/output/pretrain_stage1/", 37 | "resnet_depth": 50, 38 | "resnet_frozen_stage": -1, 39 | "timesformer_depth": 4, 40 | "timesformer_heads": 16, 41 | "max_txt_len": 50, 42 | "score_agg_func": "lse", 43 | "loss_type": "ce", 44 | "train_n_clips": 2, 45 | "num_frm": 7, 46 | "sample_rate": 12, 47 | "crop_size": [640, 1024], 48 | "out_size": [256, 128, 64, 3], 49 | "train_batch_size": 8, 50 | "val_batch_size": 8, 51 | "max_n_example_per_group": 1, 52 | "gradient_accumulation_steps": 1, 53 | "num_train_epochs": 10, 54 | "min_valid_steps": 1, 55 | "num_valid": 200, 56 | "only_valid_steps": 1000, 57 | "save_steps_ratio": 0.005, 58 | "learning_rate": 5e-5, 59 | "decay": "linear", 60 | "optim": "adamw", 61 | "betas": [0.9, 0.98], 62 | "dropout": 0.1, 63 | "weight_decay": 1e-3, 64 | "grad_norm": 5.0, 65 | "cnn_learning_rate": 5e-5, 66 | "cnn_weight_decay": 1e-3, 67 | "cnn_lr_decay": "linear", 68 | "align_learning_rate": 5e-5, 69 | "align_weight_decay": 1e-3, 70 | "pixel_random_sampling_size": 160, 71 | "seed":24, 72 | "fp16": 1, 73 | "amp_level": "O2", 74 | "use_itm": 0, 75 | "use_itc": 1, 76 | "use_mlm": 0, 77 | 78 | "bert_mean": 1, 79 | "n_workers": 8, 80 | 81 | "backbone_channels": [256, 512, 1024, 2048], 82 | "backbone_channel_in_size": 2048, 83 | "hidden_size": 1024, 84 | 85 | "temp": 0.05 86 | } 87 | -------------------------------------------------------------------------------- /hd-vila/src/optimization/sched.py: -------------------------------------------------------------------------------- 1 | """ 2 | optimizer learning rate scheduling helpers 3 | """ 4 | from math import ceil 5 | from collections import Counter 6 | 7 | 8 | def noam_schedule(step, warmup_step=4000): 9 | if step <= warmup_step: 10 | return step / warmup_step 11 | return (warmup_step ** 0.5) * (step ** -0.5) 12 | 13 | 14 | def warmup_linear(step, warmup_step, tot_step): 15 | if step < warmup_step: 16 | return step / warmup_step 17 | return max(0, 
(tot_step-step)/(tot_step-warmup_step)) 18 | 19 | 20 | def multi_step_schedule(n_epoch, milestones, step, warmup_step,gamma=0.5): 21 | if step <= warmup_step: 22 | return step / warmup_step 23 | 24 | milestones = list(sorted(milestones)) 25 | for i, m in enumerate(milestones): 26 | if n_epoch < m: 27 | return gamma**i 28 | return gamma**(len(milestones)+1) 29 | 30 | class AutoStep(): 31 | def __init__(self, tolerance, gamma): 32 | self.tolerance = tolerance 33 | self.coeff_mem = 1 34 | self.gamma = gamma 35 | self.best_score = 0. 36 | self.count = 0 37 | 38 | def step(self, score): 39 | if score <= self.best_score: 40 | self.count += 1 41 | else: 42 | self.count = 0 43 | self.best_score = score 44 | if self.count > self.tolerance: 45 | self.count = 0 46 | self.coeff_mem = self.coeff_mem * self.gamma 47 | 48 | def get_lr(self, global_step, learning_rate, num_train_steps, warmup_ratio=0.1): 49 | warmup_steps = int(warmup_ratio * num_train_steps) 50 | if global_step <= warmup_steps: 51 | return learning_rate * global_step / warmup_steps 52 | 53 | return max(self.coeff_mem * learning_rate, 1e-8) 54 | 55 | 56 | def get_lr_sched(global_step, decay, learning_rate, 57 | num_train_steps, warmup_ratio=0.1, 58 | decay_epochs=[], multi_step_epoch=-1): 59 | warmup_steps = int(warmup_ratio*num_train_steps) 60 | if decay == 'linear': 61 | lr_this_step = learning_rate * warmup_linear( 62 | global_step, warmup_steps, num_train_steps) 63 | elif decay == 'invsqrt': 64 | lr_this_step = learning_rate * noam_schedule( 65 | global_step, warmup_steps) 66 | elif decay == 'constant': 67 | lr_this_step = learning_rate 68 | elif decay == "multi_step": 69 | assert multi_step_epoch >= 0 70 | lr_this_step = learning_rate * multi_step_schedule( 71 | multi_step_epoch, decay_epochs, global_step, warmup_steps) 72 | if lr_this_step <= 0: 73 | # save guard for possible miscalculation of train steps 74 | lr_this_step = 1e-8 75 | return lr_this_step 76 | -------------------------------------------------------------------------------- /LF-VILA/src/utils/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | references: UNITER 3 | """ 4 | 5 | import logging 6 | from tensorboardX import SummaryWriter 7 | import os 8 | 9 | _LOG_FMT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' 10 | _DATE_FMT = '%m/%d/%Y %H:%M:%S' 11 | logging.basicConfig(format=_LOG_FMT, datefmt=_DATE_FMT, level=logging.INFO) 12 | LOGGER = logging.getLogger('__main__') # this is the global logger 13 | 14 | 15 | def add_log_to_file(log_path): 16 | fh = logging.FileHandler(log_path) 17 | formatter = logging.Formatter(_LOG_FMT, datefmt=_DATE_FMT) 18 | fh.setFormatter(formatter) 19 | LOGGER.addHandler(fh) 20 | 21 | 22 | class TensorboardLogger(object): 23 | def __init__(self): 24 | self._logger = None 25 | self._global_step = 0 26 | 27 | def create(self, path): 28 | if "AZUREML_TB_PATH" in os.environ: 29 | self._logger = SummaryWriter(os.environ["AZUREML_TB_PATH"]) 30 | else: 31 | self._logger = SummaryWriter(path) 32 | 33 | def noop(self, *args, **kwargs): 34 | return 35 | 36 | def step(self): 37 | self._global_step += 1 38 | 39 | @property 40 | def global_step(self): 41 | return self._global_step 42 | 43 | @global_step.setter 44 | def global_step(self, step): 45 | self._global_step = step 46 | 47 | def log_scalar_dict(self, log_dict, prefix=''): 48 | """ log a dictionary of scalar values""" 49 | if self._logger is None: 50 | return 51 | if prefix: 52 | prefix = f'{prefix}_' 53 | for name, value in 
log_dict.items(): 54 | if isinstance(value, dict): 55 | self.log_scalar_dict(value, self._global_step, 56 | prefix=f'{prefix}{name}') 57 | else: 58 | self._logger.add_scalar(f'{prefix}{name}', value, 59 | self._global_step) 60 | 61 | def __getattr__(self, name): 62 | if self._logger is None: 63 | return self.noop 64 | return self._logger.__getattribute__(name) 65 | 66 | 67 | TB_LOGGER = TensorboardLogger() 68 | 69 | 70 | class RunningMeter(object): 71 | """ running meteor of a scalar value 72 | (useful for monitoring training loss) 73 | """ 74 | def __init__(self, name, val=None, smooth=0.99): 75 | self._name = name 76 | self._sm = smooth 77 | self._val = val 78 | 79 | def __call__(self, value): 80 | self._val = (value if self._val is None 81 | else value*(1-self._sm) + self._val*self._sm) 82 | 83 | def __str__(self): 84 | return f'{self._name}: {self._val:.4f}' 85 | 86 | @property 87 | def val(self): 88 | return self._val 89 | 90 | @property 91 | def name(self): 92 | return self._name 93 | -------------------------------------------------------------------------------- /hd-vila-100m/README.md: -------------------------------------------------------------------------------- 1 | # HD-VILA-100M Dataset 2 | 3 | ## What is HD-VILA-100M? 4 | HD-VILA-100M is a large-scale, high-resolution, and 5 | diversified video-language dataset to facilitate the multimodal representation learning. 6 | 7 |
8 | ![examples for hd-vila](figs/examples.png) 9 | 10 | 11 | Examples of video clips and ASR generated transcriptions in the proposed HD-VILA-100M dataset. 12 | 13 | 14 | ## Data statistics 15 | The dataset contains 3.3 million videos in total, which are of high quality and distributed in 15 categories in balance. 16 | 17 | ![statistics](figs/statics.png) 18 | 19 | 20 | The distribution of categories in HD-VILA-100M dataset. 21 | 
22 | 23 | The details of our dataset are presented in the table below. 24 | | Dataset | Domain | #Video clips | #Sentence | Avg len(sec) | Sent len | Duration(h) | Resolution 25 | | :-----| :---- | :---- | :---- | :---- | :---- | :---- | :---- | 26 | | HD-VILA-100M | open | 100M | 100M | 13.4 | 32.5 | 371.5K | 720p | 27 | 28 | 29 | ## Download 30 | 31 | You can download all the urls through this [link](https://hdvila.blob.core.windows.net/dataset/hdvila100m.zip?sp=r&st=2022-06-28T03:33:11Z&se=2026-01-01T11:33:11Z&spr=https&sv=2021-06-08&sr=b&sig=VaqQkLFDqKinfkaPNs1jJ1EQIYCB%2FUPYiqFqmjWye6Y%3D) (updated 6/28/2022). Together we also offer all the timestamps to divide the videos into clips. The format of the data is: 32 | ``` 33 | { 34 | 'video_id':'QMi8x8o55Ns', 35 | 'url': 'https://www.youtube.com/watch?v=QMi8x8o55Ns', 36 | 'clip': [ 37 | {'clip_id': 'QMi8x8o55Ns.1.mp4', 'span': ['00:00:17.759', '00:00:23.279']} 38 | ... 39 | {'clip_id': 'QMi8x8o55Ns.16.mp4', 'span': ['00:04:52.140', '00:05:03.350']} 40 | ], 41 | } 42 | ``` 43 | 44 | You can download the raw videos from YouTube and use [src/cut_videos.py](./src/cut_videos.py) to cut the videos to clips. 45 | 46 | 47 | ## License 48 | 49 | The license of the collected dataset is [here](./LICENSE). 50 | 51 | ## Citing HD-VILA 52 | 53 | If you find this dataset useful for your research, please consider citing our paper. :blush: 54 | 55 | ```bibtex 56 | @inproceedings{xue2022hdvila, 57 | title={Advancing High-Resolution Video-Language Representation with Large-Scale Video Transcriptions}, 58 | author={Xue, Hongwei and Hang, Tiankai and Zeng, Yanhong and Sun, Yuchong and Liu, Bei and Yang, Huan and Fu, Jianlong and Guo, Baining}, 59 | booktitle={International Conference on Computer Vision and Pattern Recognition (CVPR)}, 60 | year={2022} 61 | } 62 | ``` 63 | 64 | ## Contact Information 65 | 66 | For further request about dataset or problems using the dataset, you can contact [Bei Liu]() (`bei.liu@microsoft.com`). 
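As a usage note for the metadata format shown in the Download section above: once a raw video has been downloaded from its URL, the released `span` timestamps can be used to cut it into the listed clips. The snippet below is only a minimal sketch of that step (the maintained implementation is [src/cut_videos.py](./src/cut_videos.py), which may differ); it assumes `ffmpeg` is available on PATH, that a single metadata record like the example above has been saved to a JSON file, and that `raw_videos/` and `video_clips/` are placeholder directory names.

```python
import json
import os
import subprocess

def cut_clips(meta_path, raw_video_dir, out_dir):
    """Cut one downloaded video into clips using the released timestamps."""
    os.makedirs(out_dir, exist_ok=True)
    with open(meta_path) as f:
        meta = json.load(f)  # one record: {'video_id', 'url', 'clip': [...]}
    src = os.path.join(raw_video_dir, meta['video_id'] + '.mp4')
    for clip in meta['clip']:
        start, end = clip['span']  # e.g. '00:00:17.759', '00:00:23.279'
        dst = os.path.join(out_dir, clip['clip_id'])
        # -ss/-to select the span; re-encoding keeps the cut points frame-accurate
        subprocess.run(
            ['ffmpeg', '-y', '-i', src, '-ss', start, '-to', end,
             '-c:v', 'libx264', '-c:a', 'aac', dst],
            check=True)

if __name__ == '__main__':
    cut_clips('QMi8x8o55Ns.json', 'raw_videos', 'video_clips')
```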
67 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/pretrain/pretrain_vip_base_16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "hdvila", 5 | "vis_format": "videoframe", 6 | "txt": "datasets/hdvila/hdvila_subtitles_92m_db", 7 | "vis": "youtube_data/ytt180m/video_clips_3fps", 8 | "vid_cap_path": "datasets/hdvila/hdvila_captions_db", 9 | "vid_txt": "subtitle", 10 | "img_dir": "", 11 | "cap_path": "", 12 | "img_source": "", 13 | "img_ratio": 0 14 | } 15 | ], 16 | "val_datasets": [ 17 | { 18 | "name": "msrvtt", 19 | "vis_format": "video", 20 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 21 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 22 | }, 23 | { 24 | "name": "how2", 25 | "vis_format": "video", 26 | "txt": "clip_data/vis_db/pretrain_data/test_howto_1k.jsonl", 27 | "vis": "youtube_data/ytt180m/video_clips_3fps" 28 | }, 29 | { 30 | "name": "ours", 31 | "vis_format": "video", 32 | "txt": "clip_data/vis_db/pretrain_data/test_full_1k.jsonl", 33 | "vis": "youtube_data/ytt180m/video_clips_3fps" 34 | } 35 | ], 36 | 37 | "train_n_clips": 1, 38 | "train_num_frms": 12, 39 | "test_n_clips": 1, 40 | "test_num_frms": 12, 41 | "sample_rate": 0, 42 | "sample_jitter": 1, 43 | "video_res": [240, 320], 44 | "input_res": [224, 224], 45 | "max_txt_len": 70, 46 | 47 | "e2e_weights_path": null, 48 | "clip_weights": "openai/clip-vit-base-patch16", 49 | "clip_config": "openai/clip-vit-base-patch16", 50 | "clip_vision_additional_config": { 51 | "type": "ViP", 52 | "temporal_size": 12, 53 | "if_use_temporal_embed": 1, 54 | "logit_scale_init_value": 4.60, 55 | "add_cls_num": 3 56 | }, 57 | 58 | "train_batch_size": 16, 59 | "test_batch_size": 16, 60 | "max_n_example_per_group": 1, 61 | "gradient_accumulation_steps": 1, 62 | "n_workers": 8, 63 | "pin_mem": 1, 64 | "fp16": 1, 65 | "amp_level": "O2", 66 | "seed": 42, 67 | 68 | "optim": "adamw", 69 | "betas": [0.9, 0.98], 70 | "learning_rate": 5e-6, 71 | "weight_decay": 0.05, 72 | "lr_mul": 1, 73 | "lr_mul_prefix": "", 74 | "loss_config": { 75 | "loss_name": "NCELearnableTempLoss_vsc_fc", 76 | "if_gather": 1 77 | }, 78 | "warmup_ratio": 0.01, 79 | "decay": "cosine", 80 | "grad_norm": 5.0, 81 | 82 | "num_train_epochs": 5, 83 | "min_valid_steps": 1, 84 | "num_valid": 100, 85 | "only_valid_steps": 1000, 86 | "save_steps_ratio": 0.01, 87 | "output_dir": "vidclip_data/output/pretrain/pretrain_vip_base_16/", 88 | "if_tb_log": 1, 89 | "if_model_saver": 1, 90 | "if_log2file": 1, 91 | "dummy_data": 0 92 | } 93 | -------------------------------------------------------------------------------- /CLIP-ViP/src/configs/pretrain/pretrain_vip_base_32.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "hdvila", 5 | "vis_format": "videoframe", 6 | "txt": "datasets/hdvila/hdvila_subtitles_92m_db", 7 | "vis": "youtube_data/ytt180m/video_clips_3fps", 8 | "vid_cap_path": "datasets/hdvila/hdvila_captions_db", 9 | "vid_txt": "subtitle", 10 | "img_dir": "", 11 | "cap_path": "", 12 | "img_source": "", 13 | "img_ratio": 0 14 | } 15 | ], 16 | "val_datasets": [ 17 | { 18 | "name": "msrvtt", 19 | "vis_format": "video", 20 | "txt": "clip_data/vis_db/msrvtt_video_clips/test1ka.jsonl", 21 | "vis": "clip_data/vis_db/msrvtt_video_clips/videos_6fps" 22 | }, 23 | { 24 | "name": "how2", 25 | "vis_format": "video", 26 | "txt": 
"clip_data/vis_db/pretrain_data/test_howto_1k.jsonl", 27 | "vis": "youtube_data/ytt180m/video_clips_3fps" 28 | }, 29 | { 30 | "name": "ours", 31 | "vis_format": "video", 32 | "txt": "clip_data/vis_db/pretrain_data/test_full_1k.jsonl", 33 | "vis": "youtube_data/ytt180m/video_clips_3fps" 34 | } 35 | ], 36 | 37 | "train_n_clips": 1, 38 | "train_num_frms": 12, 39 | "test_n_clips": 1, 40 | "test_num_frms": 12, 41 | "sample_rate": 0, 42 | "sample_jitter": 1, 43 | "video_res": [240, 320], 44 | "input_res": [224, 224], 45 | "max_txt_len": 70, 46 | 47 | "e2e_weights_path": null, 48 | "clip_weights": "openai/clip-vit-base-patch32", 49 | "clip_config": "openai/clip-vit-base-patch32", 50 | "clip_vision_additional_config": { 51 | "type": "ViP", 52 | "temporal_size": 12, 53 | "if_use_temporal_embed": 1, 54 | "logit_scale_init_value": 4.60, 55 | "add_cls_num": 3 56 | }, 57 | 58 | "train_batch_size": 32, 59 | "test_batch_size": 32, 60 | "max_n_example_per_group": 1, 61 | "gradient_accumulation_steps": 1, 62 | "n_workers": 8, 63 | "pin_mem": 1, 64 | "fp16": 1, 65 | "amp_level": "O2", 66 | "seed": 42, 67 | 68 | "optim": "adamw", 69 | "betas": [0.9, 0.98], 70 | "learning_rate": 5e-6, 71 | "weight_decay": 0.05, 72 | "lr_mul": 1, 73 | "lr_mul_prefix": "", 74 | "loss_config": { 75 | "loss_name": "NCELearnableTempLoss_vsc_fc", 76 | "if_gather": 1 77 | }, 78 | "warmup_ratio": 0.01, 79 | "decay": "cosine", 80 | "grad_norm": 5.0, 81 | 82 | "num_train_epochs": 5, 83 | "min_valid_steps": 1, 84 | "num_valid": 100, 85 | "only_valid_steps": 1000, 86 | "save_steps_ratio": 0.01, 87 | "output_dir": "vidclip_data/output/pretrain/pretrain_vip_base_32/", 88 | "if_tb_log": 1, 89 | "if_model_saver": 1, 90 | "if_log2file": 1, 91 | "dummy_data": 0 92 | } 93 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /hd-vila/src/utils/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | references: UNITER 3 | """ 4 | 5 | import logging 6 | from tensorboardX import SummaryWriter 7 | import os 8 | 9 | _LOG_FMT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' 10 | _DATE_FMT = '%m/%d/%Y %H:%M:%S' 11 | logging.basicConfig(format=_LOG_FMT, datefmt=_DATE_FMT, level=logging.INFO) 12 | LOGGER = logging.getLogger('__main__') # this is the global logger 13 | 14 | 15 | def add_log_to_file(log_path): 16 | fh = logging.FileHandler(log_path) 17 | formatter = logging.Formatter(_LOG_FMT, datefmt=_DATE_FMT) 18 | fh.setFormatter(formatter) 19 | LOGGER.addHandler(fh) 20 | 21 | 22 | class TensorboardLogger(object): 23 | def __init__(self): 24 | self._logger = None 25 | self._global_step = 0 26 | 27 | def create(self, path): 28 | if "AZUREML_TB_PATH" in os.environ: 29 | self._logger = SummaryWriter(os.environ["AZUREML_TB_PATH"]) 30 | else: 31 | self._logger = SummaryWriter(path) 32 | 33 | def noop(self, *args, **kwargs): 34 | return 35 | 36 | def step(self): 37 | self._global_step += 1 38 | 39 | @property 40 | def global_step(self): 41 | return self._global_step 42 | 43 | @global_step.setter 44 | def global_step(self, step): 45 | self._global_step = step 46 | 47 | def log_scalar_dict(self, log_dict, prefix=''): 48 | """ log a dictionary of scalar values""" 49 | if self._logger is None: 50 | return 51 | if prefix: 52 | prefix = f'{prefix}_' 53 | for name, value in log_dict.items(): 54 | if isinstance(value, dict): 55 | self.log_scalar_dict(value, self._global_step, 56 | prefix=f'{prefix}{name}') 57 | else: 58 | self._logger.add_scalar(f'{prefix}{name}', value, 59 | self._global_step) 60 | 61 | def __getattr__(self, name): 62 | if self._logger is None: 63 | return self.noop 64 | return self._logger.__getattribute__(name) 65 | 66 | 67 | TB_LOGGER = TensorboardLogger() 68 | 69 | 70 | class RunningMeter(object): 71 | """ running meteor of a scalar value 72 | (useful for monitoring training loss) 73 | """ 74 | def __init__(self, name, val=None, smooth=0.99): 75 | self._name = name 76 | self._sm = smooth 77 | self._val = val 78 | 79 | def __call__(self, value): 80 | self._val = 
(value if self._val is None 81 | else value*(1-self._sm) + self._val*self._sm) 82 | 83 | def __str__(self): 84 | return f'{self._name}: {self._val:.4f}' 85 | 86 | @property 87 | def val(self): 88 | return self._val 89 | 90 | @property 91 | def name(self): 92 | return self._name 93 | -------------------------------------------------------------------------------- /CLIP-ViP/src/utils/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | references: UNITER 3 | """ 4 | 5 | import logging 6 | from tensorboardX import SummaryWriter 7 | import os 8 | 9 | _LOG_FMT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' 10 | _DATE_FMT = '%m/%d/%Y %H:%M:%S' 11 | logging.basicConfig(format=_LOG_FMT, datefmt=_DATE_FMT, level=logging.INFO) 12 | LOGGER = logging.getLogger('__main__') # this is the global logger 13 | 14 | 15 | def add_log_to_file(log_path): 16 | fh = logging.FileHandler(log_path) 17 | formatter = logging.Formatter(_LOG_FMT, datefmt=_DATE_FMT) 18 | fh.setFormatter(formatter) 19 | LOGGER.addHandler(fh) 20 | 21 | 22 | class TensorboardLogger(object): 23 | def __init__(self): 24 | self._logger = None 25 | self._global_step = 0 26 | 27 | def create(self, path): 28 | if "AZUREML_TB_PATH" in os.environ: 29 | self._logger = SummaryWriter(os.environ["AZUREML_TB_PATH"]) 30 | else: 31 | self._logger = SummaryWriter(path) 32 | 33 | def noop(self, *args, **kwargs): 34 | return 35 | 36 | def step(self): 37 | self._global_step += 1 38 | 39 | @property 40 | def global_step(self): 41 | return self._global_step 42 | 43 | @global_step.setter 44 | def global_step(self, step): 45 | self._global_step = step 46 | 47 | def log_scalar_dict(self, log_dict, prefix=''): 48 | """ log a dictionary of scalar values""" 49 | if self._logger is None: 50 | return 51 | if prefix: 52 | prefix = f'{prefix}_' 53 | for name, value in log_dict.items(): 54 | if isinstance(value, dict): 55 | self.log_scalar_dict(value, self._global_step, 56 | prefix=f'{prefix}{name}') 57 | else: 58 | self._logger.add_scalar(f'{prefix}{name}', value, 59 | self._global_step) 60 | 61 | def __getattr__(self, name): 62 | if self._logger is None: 63 | return self.noop 64 | return self._logger.__getattribute__(name) 65 | 66 | 67 | TB_LOGGER = TensorboardLogger() 68 | 69 | 70 | class RunningMeter(object): 71 | """ running meteor of a scalar value 72 | (useful for monitoring training loss) 73 | """ 74 | def __init__(self, name, val=None, smooth=0.99): 75 | self._name = name 76 | self._sm = smooth 77 | self._val = val 78 | 79 | def __call__(self, value): 80 | self._val = (value if self._val is None 81 | else value*(1-self._sm) + self._val*self._sm) 82 | 83 | def __str__(self): 84 | return f'{self._name}: {self._val:.4f}' 85 | 86 | @property 87 | def val(self): 88 | return self._val 89 | 90 | @property 91 | def name(self): 92 | return self._name 93 | -------------------------------------------------------------------------------- /LF-VILA/src/models/lfvila_video_classification.py: -------------------------------------------------------------------------------- 1 | from locale import LC_NUMERIC 2 | from src.models.bert import ( 3 | BertConfig, BertModel, BertOnlyMLMHead, BertOnlyNSPHead, BertForMaskedLM) 4 | from src.models.video_encoder import SwinTransformer3D 5 | from src.models.text_encoder import TextEncoderForPretraining 6 | import torch 7 | import torch.nn.functional as F 8 | from torch import nn 9 | import numpy as np 10 | import random 11 | import einops 12 | from src.utils.logger import LOGGER 13 | 
from timm.models.vision_transformer import Block 14 | 15 | 16 | class LFVILA_Video_Classification(nn.Module): 17 | def __init__(self, args, config): 18 | super().__init__() 19 | 20 | self.cfg = config 21 | self.video_encoder = SwinTransformer3D(**config.VideoEncoder) 22 | bert_config = BertConfig.from_json_file(config.bert_config) 23 | 24 | self.video_downsample = nn.MaxPool2d((2,3), stride=(1,1)) 25 | 26 | self.video_global_proj = nn.Linear(bert_config.hidden_size, bert_config.hidden_size) 27 | self.video_frame_proj = nn.Linear(bert_config.hidden_size, bert_config.hidden_size) 28 | 29 | self.classifier = nn.Linear(bert_config.hidden_size, self.cfg.DATA.classification_labels) 30 | 31 | 32 | def downsample_video_embd(self, video_embd): 33 | B, N, H, W, C = video_embd.size() # B, N, H, W, C 34 | video_embd = video_embd.permute(0,1,4,2,3) 35 | video_embd = self.video_downsample(video_embd.view(B*N, C, H, W)) 36 | video_embd = video_embd.permute(0,2,3,1) # B*N, H, W, C 37 | video_embd = video_embd.view(B, N, video_embd.size(-3), video_embd.size(-2),video_embd.size(-1)) 38 | video_embd = video_embd.flatten(2,3) # B, N, X, C 39 | 40 | video_feat = video_embd.mean(dim=[1, 2]) 41 | video_frame_feat = video_embd.mean(dim=2) 42 | 43 | return video_feat, video_frame_feat 44 | 45 | 46 | def forward(self, video_frames, labels = None): 47 | B, C, N, H, W = video_frames.size() 48 | video_global_embd, _ = self.video_encoder(video_frames) # B, N, H, W, C 49 | video_global_feat, video_frame_feat = self.downsample_video_embd(video_global_embd) 50 | 51 | video_global_feat = F.normalize(self.video_global_proj(video_global_feat),dim=-1) 52 | 53 | video_frame_feat = F.normalize(self.video_frame_proj(video_frame_feat),dim=-1) 54 | 55 | 56 | logits = self.classifier(video_global_feat) 57 | 58 | loss_fct = nn.CrossEntropyLoss() 59 | loss = loss_fct(logits, labels) 60 | 61 | acc = logits.max(dim=-1)[1] == labels 62 | acc = acc.float().mean(dim=0, keepdim=True) 63 | 64 | return dict(video_global_feat = video_global_feat, 65 | video_frame_feat = video_frame_feat, 66 | prediction = logits, 67 | loss = loss, 68 | acc = acc) 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /hd-vila/src/configs/tgif_frame_qa.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "tgif_qa", 5 | "txt": { 6 | "action": "data/tgif_qa/action_train.jsonl", 7 | "transition": "data/tgif_qa/transition_train.jsonl", 8 | "frameqa": "data/tgif_qa/frameqa_train.jsonl" 9 | }, 10 | "vis": "data/tgif_qa/videos_mp4" 11 | } 12 | ], 13 | "val_datasets": [ 14 | { 15 | "name": "tgif_qa", 16 | "txt": { 17 | "action": "data/tgif_qa/action_val.jsonl", 18 | "transition": "data/tgif_qa/transition_val.jsonl", 19 | "frameqa": "data/tgif_qa/frameqa_val.jsonl" 20 | }, 21 | "vis": "data/tgif_qa/videos_mp4" 22 | } 23 | ], 24 | "ans2label_path": "data/tgif_qa/frameqa_trainval_ans2label.json", 25 | "max_txt_len": 30, 26 | "max_img_size": 192, 27 | "sample_rate": 4, 28 | "reshape_size": [180, 288], 29 | "crop_size": [160, 256], 30 | "pad_value": 1, 31 | "img_pixel_mean": [123.675, 116.28, 103.53], 32 | "img_pixel_std": [1.0, 1.0, 1.0], 33 | "img_input_format": "BGR", 34 | "fps": 2, 35 | "num_frm": 7, 36 | "train_n_clips": 1, 37 | "max_n_example_per_group": 1, 38 | "model_config": "src/configs/base_model_large.json", 39 | "e2e_weights_path": "data/pretrained/hdvila_stage2.pt", 40 | "mmdetection_weights_path": 
"data/pretrained/res50_mmdetection.pth", 41 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 42 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 43 | "output_dir": "data/output/videoqa/tgif_qa_frame", 44 | "train_batch_size": 14, 45 | "val_batch_size": 14, 46 | "gradient_accumulation_steps": 4, 47 | "num_train_epochs": 40, 48 | "min_valid_steps": 1, 49 | "num_valid": 40, 50 | "save_steps_ratio": 0.2, 51 | "learning_rate": 4e-5, 52 | "weight_decay": 0.3, 53 | "decay": "multi_step", 54 | "step_decay_epochs":[10,15,20,25,30,35], 55 | "optim": "adamw", 56 | "betas": [0.9, 0.98], 57 | "dropout": 0.1, 58 | "grad_norm": 5.0, 59 | "cnn_learning_rate": 4e-5, 60 | "cnn_weight_decay": 0.3, 61 | "cnn_lr_decay": "multi_step", 62 | "cnn_step_decay_epochs":[10,15,20,25,30,35], 63 | "align_learning_rate": 4e-5, 64 | "align_weight_decay": 0.3, 65 | "seed": 66, 66 | "fp16": 1, 67 | "classifier": "mlp", 68 | "cls_hidden_scale": 2, 69 | "task": "frameqa", 70 | "n_workers": 4, 71 | 72 | "resnet_depth": 50, 73 | "resnet_frozen_stage": -1, 74 | "timesformer_depth": 4, 75 | "timesformer_heads": 16, 76 | "backbone_channels": [256, 512, 1024, 2048], 77 | "backbone_downsample": [4, 8, 16, 32], 78 | "backbone_channel_in_size": 2048, 79 | "hidden_size": 1024, 80 | 81 | 82 | "inference_model_step": 0, 83 | "inference_txt_db": "data/tgif_qa/frameqa_test.jsonl", 84 | "inference_img_db": "data/tgif_qa/videos_mp4", 85 | "inference_batch_size": 4, 86 | "inference_n_clips": 8 87 | } -------------------------------------------------------------------------------- /hd-vila/src/configs/tgif_action_qa.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "tgif_qa", 5 | "txt": { 6 | "action": "data/tgif_qa/action_train.jsonl", 7 | "transition": "data/tgif_qa/transition_train.jsonl", 8 | "frameqa": "data/tgif_qa/frameqa_train.jsonl" 9 | }, 10 | "vis": "data/tgif_qa/videos_mp4" 11 | } 12 | ], 13 | "val_datasets": [ 14 | { 15 | "name": "tgif_qa", 16 | "txt": { 17 | "action": "data/tgif_qa/action_val.jsonl", 18 | "transition": "data/tgif_qa/transition_val.jsonl", 19 | "frameqa": "data/tgif_qa/frameqa_val.jsonl" 20 | }, 21 | "vis": "data/tgif_qa/videos_mp4" 22 | } 23 | ], 24 | "ans2label_path": "data/tgif_qa/frameqa_trainval_ans2label.json", 25 | "max_txt_len": 30, 26 | "max_img_size": 192, 27 | "sample_rate": 4, 28 | "reshape_size": [180, 288], 29 | "crop_size": [160, 256], 30 | "pad_value": 1, 31 | "img_pixel_mean": [123.675, 116.28, 103.53], 32 | "img_pixel_std": [1.0, 1.0, 1.0], 33 | "img_input_format": "BGR", 34 | "fps": 2, 35 | "num_frm": 7, 36 | "train_n_clips": 1, 37 | "max_n_example_per_group": 1, 38 | "model_config": "src/configs/base_model_large.json", 39 | "e2e_weights_path": "data/pretrained/hdvila_stage2.pt", 40 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 41 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 42 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 43 | "output_dir": "data/output/videoqa/tgif_qa_action", 44 | "train_batch_size": 12, 45 | "val_batch_size": 12, 46 | "gradient_accumulation_steps": 4, 47 | "num_train_epochs": 80, 48 | "min_valid_steps": 1, 49 | "num_valid": 80, 50 | "save_steps_ratio": 0.2, 51 | "learning_rate": 5e-5, 52 | "weight_decay": 1e-1, 53 | "decay": "multi_step", 54 | "step_decay_epochs":[10,20,30,40,50,60,70], 55 | "optim": "adamw", 56 | "betas": [0.9, 0.98], 57 | "dropout": 0.1, 58 | "grad_norm": 5.0, 59 | 
"cnn_learning_rate": 5e-5, 60 | "cnn_weight_decay": 1e-1, 61 | "cnn_lr_decay": "multi_step", 62 | "cnn_step_decay_epochs":[10,20,30,40,50,60,70], 63 | "align_learning_rate": 5e-5, 64 | "align_weight_decay": 1e-1, 65 | "seed": 66, 66 | "fp16": 1, 67 | "classifier": "mlp", 68 | "cls_hidden_scale": 2, 69 | "task": "action", 70 | "n_workers": 4, 71 | 72 | "resnet_depth": 50, 73 | "resnet_frozen_stage": -1, 74 | "timesformer_depth": 4, 75 | "timesformer_heads": 16, 76 | "backbone_channels": [256, 512, 1024, 2048], 77 | "backbone_downsample": [4, 8, 16, 32], 78 | "backbone_channel_in_size": 2048, 79 | "hidden_size": 1024, 80 | 81 | 82 | "inference_model_step": 0, 83 | "inference_txt_db": "data/tgif_qa/action_test.jsonl", 84 | "inference_img_db": "data/tgif_qa/videos_mp4", 85 | "inference_batch_size": 4, 86 | "inference_n_clips": 8 87 | } -------------------------------------------------------------------------------- /hd-vila/src/configs/tgif_transition_qa.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "name": "tgif_qa", 5 | "txt": { 6 | "action": "data/tgif_qa/action_train.jsonl", 7 | "transition": "data/tgif_qa/transition_train.jsonl", 8 | "frameqa": "data/tgif_qa/frameqa_train.jsonl" 9 | }, 10 | "vis": "data/tgif_qa/videos_mp4" 11 | } 12 | ], 13 | "val_datasets": [ 14 | { 15 | "name": "tgif_qa", 16 | "txt": { 17 | "action": "data/tgif_qa/action_val.jsonl", 18 | "transition": "data/tgif_qa/transition_val.jsonl", 19 | "frameqa": "data/tgif_qa/frameqa_val.jsonl" 20 | }, 21 | "vis": "data/tgif_qa/videos_mp4" 22 | } 23 | ], 24 | "ans2label_path": "data/tgif_qa/frameqa_trainval_ans2label.json", 25 | "max_txt_len": 30, 26 | "max_img_size": 192, 27 | "sample_rate": 4, 28 | "reshape_size": [180, 288], 29 | "crop_size": [160, 256], 30 | "pad_value": 1, 31 | "img_pixel_mean": [123.675, 116.28, 103.53], 32 | "img_pixel_std": [1.0, 1.0, 1.0], 33 | "img_input_format": "BGR", 34 | "fps": 2, 35 | "num_frm": 7, 36 | "train_n_clips": 1, 37 | "max_n_example_per_group": 1, 38 | "model_config": "src/configs/base_model_large.json", 39 | "e2e_weights_path": "data/pretrained/hdvila_stage2.pt", 40 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 41 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 42 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 43 | "output_dir": "data/output/videoqa/tgif_qa_transition/", 44 | "train_batch_size": 12, 45 | "val_batch_size": 12, 46 | "gradient_accumulation_steps": 4, 47 | "num_train_epochs": 80, 48 | "min_valid_steps": 1, 49 | "num_valid": 80, 50 | "save_steps_ratio": 0.2, 51 | "learning_rate": 5e-5, 52 | "weight_decay": 1e-1, 53 | "decay": "multi_step", 54 | "step_decay_epochs":[10,20,30,40,50,60,70], 55 | "optim": "adamw", 56 | "betas": [0.9, 0.98], 57 | "dropout": 0.1, 58 | "grad_norm": 5.0, 59 | "cnn_learning_rate": 5e-5, 60 | "cnn_weight_decay": 1e-1, 61 | "cnn_lr_decay": "multi_step", 62 | "cnn_step_decay_epochs":[10,20,30,40,50,60,70], 63 | "align_learning_rate": 5e-5, 64 | "align_weight_decay": 1e-1, 65 | "seed": 66, 66 | "fp16": 1, 67 | "classifier": "mlp", 68 | "cls_hidden_scale": 2, 69 | "task": "transition", 70 | "n_workers": 4, 71 | 72 | "resnet_depth": 50, 73 | "resnet_frozen_stage": -1, 74 | "timesformer_depth": 4, 75 | "timesformer_heads": 16, 76 | "backbone_channels": [256, 512, 1024, 2048], 77 | "backbone_downsample": [4, 8, 16, 32], 78 | "backbone_channel_in_size": 2048, 79 | "hidden_size": 1024, 80 | 81 | 82 | 
"inference_model_step": 0, 83 | "inference_txt_db": "data/tgif_qa/transition_test.jsonl", 84 | "inference_img_db": "data/tgif_qa/videos_mp4", 85 | "inference_batch_size": 4, 86 | "inference_n_clips": 8 87 | } -------------------------------------------------------------------------------- /CLIP-ViP/src/optimization/sched.py: -------------------------------------------------------------------------------- 1 | """ 2 | optimizer learning rate scheduling helpers 3 | """ 4 | import math 5 | from math import ceil 6 | from collections import Counter 7 | 8 | 9 | def noam_schedule(step, warmup_step=4000): 10 | if step <= warmup_step: 11 | return step / warmup_step 12 | return (warmup_step ** 0.5) * (step ** -0.5) 13 | 14 | 15 | def warmup_linear(step, warmup_step, tot_step): 16 | if step < warmup_step: 17 | return step / warmup_step 18 | return max(0, (tot_step-step)/(tot_step-warmup_step)) 19 | 20 | def warmup_cosine(step, warmup_step, tot_step): 21 | if step < warmup_step: 22 | return step / warmup_step 23 | progress = (step - warmup_step) / (tot_step - warmup_step) 24 | return 0.5 * (1.0 + math.cos(math.pi * progress)) 25 | 26 | def multi_step_schedule(n_epoch, milestones, step, warmup_step,gamma=0.5): 27 | if step <= warmup_step: 28 | return step / warmup_step 29 | 30 | milestones = list(sorted(milestones)) 31 | for i, m in enumerate(milestones): 32 | if n_epoch < m: 33 | return gamma**i 34 | return gamma**(len(milestones)+1) 35 | 36 | class AutoStep(): 37 | def __init__(self, tolerance, gamma): 38 | self.tolerance = tolerance 39 | self.coeff_mem = 1 40 | self.gamma = gamma 41 | self.best_score = 0. 42 | self.count = 0 43 | 44 | def step(self, score): 45 | if score <= self.best_score: 46 | self.count += 1 47 | else: 48 | self.count = 0 49 | self.best_score = score 50 | if self.count > self.tolerance: 51 | self.count = 0 52 | self.coeff_mem = self.coeff_mem * self.gamma 53 | 54 | def get_lr(self, global_step, learning_rate, num_train_steps, warmup_ratio=0.1): 55 | warmup_steps = int(warmup_ratio * num_train_steps) 56 | if global_step <= warmup_steps: 57 | return learning_rate * global_step / warmup_steps 58 | 59 | return max(self.coeff_mem * learning_rate, 1e-8) 60 | 61 | 62 | def get_lr_sched(global_step, decay, learning_rate, 63 | num_train_steps, warmup_ratio=0.1, 64 | decay_epochs=[], multi_step_epoch=-1): 65 | warmup_steps = int(warmup_ratio*num_train_steps) 66 | if decay == 'linear': 67 | lr_this_step = learning_rate * warmup_linear( 68 | global_step, warmup_steps, num_train_steps) 69 | elif decay == 'cosine': 70 | lr_this_step = learning_rate * warmup_cosine( 71 | global_step, warmup_steps, num_train_steps) 72 | elif decay == 'invsqrt': 73 | lr_this_step = learning_rate * noam_schedule( 74 | global_step, warmup_steps) 75 | elif decay == 'constant': 76 | lr_this_step = learning_rate 77 | elif decay == "multi_step": 78 | assert multi_step_epoch >= 0 79 | lr_this_step = learning_rate * multi_step_schedule( 80 | multi_step_epoch, decay_epochs, global_step, warmup_steps) 81 | if lr_this_step <= 0: 82 | # save guard for possible miscalculation of train steps 83 | lr_this_step = 1e-8 84 | return lr_this_step 85 | -------------------------------------------------------------------------------- /hd-vila/src/configs/lsmdc_retrieval.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "lsmdc-101k", 5 | "vis_format": "frame", 6 | "txt": "data/lsmdc_retrieval/train_101k_frame.jsonl", 7 | "vis": 
"data/lsmdc_retrieval/video_frames" 8 | }, 9 | "val_datasets": [ 10 | { 11 | "name": "lsmdc-1k", 12 | "vis_format": "frame", 13 | "txt": "data/lsmdc_retrieval/test_1k_frame.jsonl", 14 | "vis": "data/lsmdc_retrieval/video_frames" 15 | } 16 | ], 17 | "inference_datasets": [ 18 | { 19 | "name": "lsmdc-1k", 20 | "vis_format": "frame", 21 | "txt": "data/lsmdc_retrieval/test_1k_frame.jsonl", 22 | "vis": "data/lsmdc_retrieval/video_frames" 23 | } 24 | ], 25 | "img_pixel_mean": [123.675, 116.28, 103.53], 26 | "img_pixel_std": [58.395, 57.12, 57.375], 27 | "model_config": "src/configs/base_model_large.json", 28 | "e2e_weights_path": "data/pretrained/hdvila_stage2.pt", 29 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 30 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 31 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 32 | "output_dir": "data/output/retrieval/lsmdc_retrieval", 33 | "vis_steps":0, 34 | "warmup_ratio":0.01, 35 | "resnet_depth": 50, 36 | "resnet_frozen_stage": -1, 37 | "bert_frozen_stage": -1, 38 | "bert_mean":1, 39 | "timesformer_type": "new", 40 | "timesformer_depth": 4, 41 | "timesformer_heads": 16, 42 | "max_txt_len": 50, 43 | "score_agg_func": "lse", 44 | "loss_type": "ce", 45 | "train_n_clips": 2, 46 | "inference_n_clips": 4, 47 | "num_frm": 11, 48 | "sample_rate": 3, 49 | "crop_size": [160,256], 50 | "out_size": [256, 128, 64, 3], 51 | "train_batch_size": 8, 52 | "val_batch_size": 8, 53 | "max_n_example_per_group": 1, 54 | "gradient_accumulation_steps": 1, 55 | "num_train_epochs": 20, 56 | "min_valid_steps": 1, 57 | "num_valid": 20, 58 | "only_valid_steps": 500, 59 | "save_steps_ratio": 0.05, 60 | "learning_rate": 5e-6, 61 | "decay": "multi_step", 62 | "step_decay_epochs":[4,8,16], 63 | "cnn_step_decay_epochs":[4,8,16], 64 | "optim": "adamw", 65 | "betas": [0.9, 0.98], 66 | "dropout": 0.1, 67 | "weight_decay": 1e-3, 68 | "grad_norm": 5.0, 69 | "cnn_learning_rate": 5e-6, 70 | "cnn_weight_decay": 1e-3, 71 | "cnn_lr_decay": "multi_step", 72 | "align_learning_rate": 5e-6, 73 | "align_weight_decay": 1e-3, 74 | "generator_learning_rate": 5e-3, 75 | "generator_weight_decay": 0.0, 76 | "low_level_tasks": ["none"], 77 | "pixel_random_sampling_size": 160, 78 | "seed":24, 79 | "fp16": 1, 80 | "amp_level": "O2", 81 | "use_itm": 0, 82 | "use_itc": 1, 83 | "use_mlm": 0, 84 | 85 | "n_workers": 4, 86 | 87 | "hframe":1, 88 | "lframe":11, 89 | 90 | 91 | "backbone_channels": [256, 512, 1024, 2048], 92 | "backbone_downsample": [4, 8, 16, 32], 93 | "backbone_channel_in_size": 2048, 94 | "hidden_size": 1024, 95 | 96 | "temp": 0.08, 97 | "loss_config":{ 98 | "loss_name":"NCEContrastiveLoss", 99 | "temp":0.08 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /hd-vila/src/configs/didemo_retrieval.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "didemo-train", 5 | "vis_format": "frame", 6 | "txt": "data/didemo_retrieval/train_frame.jsonl", 7 | "vis": "data/didemo_retrieval/video_frames" 8 | }, 9 | "val_datasets": [ 10 | { 11 | "name": "didemo-test", 12 | "vis_format": "frame", 13 | "txt": "data/didemo_retrieval/test_frame.jsonl", 14 | "vis": "data/didemo_retrieval/video_frames" 15 | } 16 | ], 17 | "inference_datasets": [ 18 | { 19 | "name": "didemo-test", 20 | "vis_format": "frame", 21 | "txt": "data/didemo_retrieval/test_frame.jsonl", 22 | "vis": "data/didemo_retrieval/video_frames" 23 | } 24 | ], 25 | 
"img_pixel_mean": [123.675, 116.28, 103.53], 26 | "img_pixel_std": [58.395, 57.12, 57.375], 27 | "model_config": "src/configs/base_model_large.json", 28 | "e2e_weights_path": "data/pretrained/hdvila_stage2.pt", 29 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 30 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 31 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 32 | "output_dir": "data/output/retrieval/didemo_retrieval", 33 | "vis_steps":0, 34 | "warmup_ratio":0.1, 35 | "resnet_depth": 50, 36 | "resnet_frozen_stage": -1, 37 | "bert_frozen_stage": -1, 38 | "bert_mean":1, 39 | "timesformer_type": "new", 40 | "timesformer_depth": 4, 41 | "timesformer_heads": 16, 42 | "max_txt_len": 50, 43 | "score_agg_func": "lse", 44 | "loss_type": "ce", 45 | "train_n_clips": 4, 46 | "inference_n_clips": 8, 47 | "num_frm": 11, 48 | "sample_rate": 2, 49 | "crop_size": [160,256], 50 | "out_size": [256, 128, 64, 3], 51 | "train_batch_size": 4, 52 | "val_batch_size": 8, 53 | "max_n_example_per_group": 1, 54 | "gradient_accumulation_steps": 1, 55 | "num_train_epochs": 100, 56 | "min_valid_steps": 1, 57 | "num_valid": 20, 58 | "only_valid_steps": 500, 59 | "save_steps_ratio": 0.05, 60 | "learning_rate": 5e-6, 61 | "decay": "multi_step", 62 | "step_decay_epochs":[8,16,32,64], 63 | "cnn_step_decay_epochs":[8,16,32,64], 64 | "optim": "adamw", 65 | "betas": [0.9, 0.98], 66 | "dropout": 0.1, 67 | "weight_decay": 1e-3, 68 | "grad_norm": 5.0, 69 | "cnn_learning_rate": 5e-6, 70 | "cnn_weight_decay": 1e-1, 71 | "cnn_lr_decay": "multi_step", 72 | "align_learning_rate": 5e-6, 73 | "align_weight_decay": 1e-1, 74 | "generator_learning_rate": 5e-3, 75 | "generator_weight_decay": 0.0, 76 | "low_level_tasks": ["none"], 77 | "pixel_random_sampling_size": 160, 78 | "seed":24, 79 | "fp16": 1, 80 | "amp_level": "O2", 81 | "use_itm": 0, 82 | "use_itc": 1, 83 | "use_mlm": 0, 84 | 85 | "n_workers": 4, 86 | 87 | "hframe":1, 88 | "lframe":11, 89 | 90 | 91 | "backbone_channels": [256, 512, 1024, 2048], 92 | "backbone_downsample": [4, 8, 16, 32], 93 | "backbone_channel_in_size": 2048, 94 | "hidden_size": 1024, 95 | 96 | "temp": 0.08, 97 | "loss_config":{ 98 | "loss_name":"NCEContrastiveLoss", 99 | "temp":0.08 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /hd-vila/src/configs/actnet_retrieval.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "actnet-train", 5 | "vis_format": "frame", 6 | "txt": "data/activitynet_retrieval/train.jsonl", 7 | "vis": "data/activitynet_retrieval/video_frames" 8 | }, 9 | "val_datasets": [ 10 | { 11 | "name": "actnet-test", 12 | "vis_format": "frame", 13 | "txt": "data/activitynet_retrieval/val1.jsonl", 14 | "vis": "data/activitynet_retrieval/video_frames" 15 | } 16 | ], 17 | "inference_datasets": [ 18 | { 19 | "name": "actnet-test", 20 | "vis_format": "frame", 21 | "txt": "data/activitynet_retrieval/val1.jsonl", 22 | "vis": "data/activitynet_retrieval/video_frames" 23 | } 24 | ], 25 | "img_pixel_mean": [123.675, 116.28, 103.53], 26 | "img_pixel_std": [58.395, 57.12, 57.375], 27 | "model_config": "src/configs/base_model_large.json", 28 | "e2e_weights_path": "data/pretrained/hdvila_stage2.pt", 29 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 30 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 31 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 32 | 
"output_dir": "data/output/retrieval/actnet_retrieval", 33 | "vis_steps":0, 34 | "warmup_ratio":0.1, 35 | "resnet_depth": 50, 36 | "resnet_frozen_stage": -1, 37 | "bert_frozen_stage": -1, 38 | "bert_mean":1, 39 | "timesformer_type": "new", 40 | "timesformer_depth": 4, 41 | "timesformer_heads": 16, 42 | "max_txt_len": 50, 43 | "score_agg_func": "lse", 44 | "loss_type": "ce", 45 | "train_n_clips": 4, 46 | "inference_n_clips": 8, 47 | "num_frm": 13, 48 | "sample_rate":4, 49 | "crop_size": [160,256], 50 | "out_size": [256, 128, 64, 3], 51 | "train_batch_size": 4, 52 | "val_batch_size": 8, 53 | "max_n_example_per_group": 1, 54 | "gradient_accumulation_steps": 1, 55 | "num_train_epochs": 100, 56 | "min_valid_steps": 1, 57 | "num_valid": 20, 58 | "only_valid_steps": 500, 59 | "save_steps_ratio": 0.05, 60 | "learning_rate": 5e-6, 61 | "decay": "multi_step", 62 | "step_decay_epochs":[8,16,32,64], 63 | "cnn_step_decay_epochs":[8,16,32,64], 64 | "optim": "adamw", 65 | "betas": [0.9, 0.98], 66 | "dropout": 0.1, 67 | "weight_decay": 1e-3, 68 | "grad_norm": 5.0, 69 | "cnn_learning_rate": 5e-6, 70 | "cnn_weight_decay": 1e-3, 71 | "cnn_lr_decay": "multi_step", 72 | "align_learning_rate": 5e-6, 73 | "align_weight_decay": 1e-3, 74 | "generator_learning_rate": 5e-3, 75 | "generator_weight_decay": 0.0, 76 | "low_level_tasks": ["none"], 77 | "pixel_random_sampling_size": 160, 78 | "seed":24, 79 | "fp16": 1, 80 | "amp_level": "O2", 81 | "use_itm": 0, 82 | "use_itc": 1, 83 | "use_mlm": 0, 84 | 85 | "n_workers": 4, 86 | 87 | "hframe":1, 88 | "lframe":11, 89 | 90 | 91 | "backbone_channels": [256, 512, 1024, 2048], 92 | "backbone_downsample": [4, 8, 16, 32], 93 | "backbone_channel_in_size": 2048, 94 | "hidden_size": 1024, 95 | 96 | "temp": 0.08, 97 | "loss_config":{ 98 | "loss_name":"NCEContrastiveLoss", 99 | "temp":0.08 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /hd-vila/src/configs/msrvtt_retrieval.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": 3 | { 4 | "name": "msrvtt-9k", 5 | "vis_format": "video", 6 | "txt": "data/msrvtt_retrieval/train9k.jsonl", 7 | "vis": "data/msrvtt_retrieval/videos_6fps" 8 | }, 9 | "val_datasets": [ 10 | 11 | { 12 | "name": "msrvtt-1ka", 13 | "vis_format": "video", 14 | "txt": "data/msrvtt_retrieval/test1ka.jsonl", 15 | "vis": "data/msrvtt_retrieval/videos_6fps" 16 | } 17 | ], 18 | "inference_datasets": [ 19 | { 20 | "name": "msrvtt-1ka", 21 | "vis_format": "video", 22 | "txt": "data/msrvtt_retrieval/test1ka.jsonl", 23 | "vis": "data/msrvtt_retrieval/videos_6fps" 24 | } 25 | ], 26 | "img_pixel_mean": [123.675, 116.28, 103.53], 27 | "img_pixel_std": [58.395, 57.12, 57.375], 28 | "model_config": "src/configs/base_model_large.json", 29 | "e2e_weights_path": "data/pretrained/hdvila_stage2.pt", 30 | "mmdetection_weights_path": "data/pretrained/res50_mmdetection.pth", 31 | "bert_weights_path": "data/pretrained/bert-large-uncased/pytorch_model.bin", 32 | "tokenizer_dir": "data/pretrained/bert-base-uncased/", 33 | "output_dir": "data/output/retrieval/msrvtt_retrieval", 34 | "vis_steps":0, 35 | 36 | "resnet_depth": 50, 37 | "resnet_frozen_stage": -1, 38 | "bert_frozen_stage": -1, 39 | "bert_mean":1, 40 | "timesformer_depth": 4, 41 | "timesformer_heads": 16, 42 | "timesformer_type": "new", 43 | "max_txt_len": 50, 44 | "score_agg_func": "lse", 45 | "loss_type": "ce", 46 | "train_n_clips": 2, 47 | "inference_n_clips": 4, 48 | 49 | "crop_size": [160,256], 50 | "out_size": 
[256, 128, 64, 3], 51 | "train_batch_size": 8, 52 | "val_batch_size": 8, 53 | "max_n_example_per_group": 1, 54 | "gradient_accumulation_steps": 1, 55 | "num_train_epochs": 200, 56 | "min_valid_steps": 1, 57 | "num_valid": 10, 58 | "only_valid_steps": 100, 59 | "save_steps_ratio": 0.1, 60 | "learning_rate": 1e-5, 61 | "decay": "multi_step", 62 | "step_decay_epochs":[32, 64, 128, 256], 63 | "cnn_step_decay_epochs":[32, 64, 128, 256], 64 | "optim": "adamw", 65 | "betas": [0.9, 0.98], 66 | "dropout": 0.1, 67 | "weight_decay": 1e-4, 68 | "grad_norm": 5.0, 69 | "cnn_learning_rate": 1e-5, 70 | "cnn_weight_decay": 1e-4, 71 | "cnn_lr_decay": "multi_step", 72 | "align_learning_rate": 5e-6, 73 | "align_weight_decay": 1e-3, 74 | "generator_learning_rate": 5e-3, 75 | "generator_weight_decay": 0.0, 76 | "low_level_tasks": ["none"], 77 | "pixel_random_sampling_size": 160, 78 | "seed":24, 79 | "fp16": 1, 80 | "amp_level": "O2", 81 | "use_itm": 0, 82 | "use_itc": 1, 83 | "use_mlm": 0, 84 | 85 | "n_workers": 4, 86 | 87 | "pos_num":1, 88 | 89 | "backbone_channels": [256, 512, 1024, 2048], 90 | "backbone_downsample": [4, 8, 16, 32], 91 | "backbone_channel_in_size": 2048, 92 | "hidden_size": 1024, 93 | 94 | "hframe":1, 95 | "lframe":11, 96 | 97 | 98 | "num_frm": 7, 99 | "sample_rate": 4, 100 | "warmup_ratio":0.01, 101 | 102 | "temp": 0.1, 103 | "loss_config":{ 104 | "loss_name":"NCEContrastiveLoss", 105 | "temp":0.08 106 | } 107 | 108 | } 109 | -------------------------------------------------------------------------------- /hd-vila/scripts/process_raw_video/decode_frames.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | os.system('pip install Pillow') 4 | os.system('pip install decord') 5 | import jsonlines 6 | from tqdm import tqdm 7 | import time 8 | from PIL import Image 9 | import decord 10 | import multiprocessing 11 | from joblib import Parallel, delayed 12 | from glob import glob 13 | import numpy as np 14 | 15 | def parse_args(): 16 | parser = argparse.ArgumentParser(description='decode frames') 17 | parser.add_argument('--workdir', default='/data',type=str, help='workdir') 18 | parser.add_argument('--inputfile', default='train.jsonl', type=str, help='inputfile') 19 | parser.add_argument("--outputfile",type=str, default="train_result.jsonl", help="outputfile") 20 | 21 | args = parser.parse_args() 22 | return args 23 | 24 | def check_dirs(dirs): 25 | if not os.path.exists(dirs): 26 | os.makedirs(dirs) 27 | 28 | 29 | def load_clip_text(args): 30 | p = os.path.join(args.workdir,'lsmdc/', args.inputfile) 31 | data = [] 32 | with open(p,'r') as f: 33 | for l in jsonlines.Reader(f): 34 | data.append(l) 35 | return data 36 | 37 | def extract_single_clip(clip_text): 38 | try: 39 | clip_id = clip_text['clip_id'] 40 | clip_path = os.path.join(args.workdir, 'lsmdc/videos/{}.avi'.format(clip_id)) 41 | if os.path.exists(clip_path): 42 | 43 | out_folder = os.path.join(os.path.join(args.workdir, 'lsmdc/video_frames',clip_id)) 44 | out_folder_lr = os.path.join(os.path.join(args.workdir, 'lsmdc/video_frames_lr',clip_id)) 45 | os.system('rm -rf {}'.format(out_folder)) 46 | 47 | check_dirs(out_folder) 48 | 49 | vr = decord.VideoReader(clip_path, ctx=decord.cpu(0)) 50 | fps = vr.get_avg_fps() 51 | sample_id = np.round(np.linspace(0, len(vr)-1, round(len(vr)/fps*6))).astype(int) 52 | if len(sample_id)<=20: 53 | sample_id = np.round(np.linspace(0, len(vr)-1, 20)).astype(int) 54 | 55 | 56 | for i in range(len(sample_id)): 57 | frame = 
vr[sample_id[i]].asnumpy() 58 | img = Image.fromarray(frame).convert("RGB") 59 | img.save(os.path.join(out_folder, clip_id.split('/')[-1]+'_{0:03d}.jpg'.format(i))) 60 | 61 | img = img.resize((288,180),Image.BICUBIC) 62 | img.save(os.path.join(out_folder_lr, clip_id.split('/')[-1]+'_{0:03d}.jpg'.format(i))) 63 | 64 | return {'clip_id':clip_id, 'num_frame':len(sample_id)} 65 | else: 66 | return None 67 | except: 68 | return None 69 | 70 | def main(args): 71 | 72 | clip_texts = load_clip_text(args) 73 | 74 | 75 | num_cores = multiprocessing.cpu_count() 76 | print(num_cores) 77 | results = Parallel(n_jobs=2)(delayed(extract_single_clip)(c) for c in tqdm(clip_texts)) 78 | results = [x for x in results if x is not None] 79 | 80 | print(len(results)) 81 | check_dirs(os.path.join(args.workdir,'lsmdc/decode_results')) 82 | save_path = os.path.join(args.workdir,'lsmdc/decode_results',args.outputfile) 83 | print(save_path) 84 | with jsonlines.open(save_path, 'w') as f: 85 | for i in tqdm(range(len(results))): 86 | f.write(results[i]) 87 | print('write done') 88 | 89 | 90 | if __name__ =='__main__': 91 | args = parse_args() 92 | 93 | print(args.workdir) 94 | main(args) 95 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/queryd_ret.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 1 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | 22 | WEIGHTS: 23 | model_weight: 'project/lfvila/pretrained/lfvila_stage1.bin' 24 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 25 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 26 | pretrained_2d: True 27 | 28 | DATA: 29 | BATCH_SIZE_per_gpu: 16 30 | NUM_WORKERS: 12 31 | PIN_MEMORY: True 32 | 33 | sample_frame: 32 34 | sample_clip: 4 35 | input_res: [192, 320] 36 | center_crop: 200 37 | 38 | 39 | DATASET_train: { 40 | 'name': 'RetrievalDataset-train', 41 | 'type': 'RetrievalDataset', 42 | 'metadata_dir': 'datasets/lfvila_data/task/querydret/train.jsonl', 43 | 'video_path': 'datasets/queryd/queryd_video' 44 | } 45 | 46 | DATASET_val: [{ 47 | 'name': 'RetrievalDataset-val', 48 | 'type': 'RetrievalDataset', 49 | 'metadata_dir': 'datasets/lfvila_data/task/querydret/test.jsonl', 50 | 'video_path': 'datasets/queryd/queryd_video' 51 | } 52 | ] 53 | 54 | 55 | TRAINING: 56 | save_feats: 0 57 | do_eval2: false 58 | EPOCHS: 20 59 | WARMUP_EPOCHS: 1 60 | WARMUP_LR: 0. 
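# LR_SCHEDULER.NAME selects the schedule built by build_scheduler in src/optimization/lr_scheduler.py:
# 'step' multiplies the LR by DECAY_RATE every DECAY_EPOCHS epochs, after a linear warmup of
# WARMUP_EPOCHS epochs starting from WARMUP_LR. An illustrative (not used here) cosine alternative
# would look like:
#   LR_SCHEDULER: { 'NAME': 'cosine' }
#   MIN_LR: 1.0e-8   # cosine annealing decays down to MIN_LR (CosineLRScheduler in build_scheduler)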
61 | LR_SCHEDULER: { 62 | 'NAME': 'step', 63 | 'DECAY_EPOCHS': 5, 64 | 'DECAY_RATE': 0.25 65 | } 66 | 67 | use_mlm: false 68 | 69 | ct_global_loss_weight: 1 70 | 71 | 72 | temp: 0.05 73 | weight_decay: 0.05 74 | save_dir: "project/lfvila/lfvila_save/querydret" 75 | checkpoint_step: 20000 76 | save_step: 10000 77 | print_step: 25 78 | eval_step: 25 79 | 80 | deepspeed_config: { 81 | "train_micro_batch_size_per_gpu": 16, 82 | "gradient_accumulation_steps": 1, 83 | "steps_per_print": 500, 84 | 85 | 86 | "zero_optimization": { 87 | "stage": 2, 88 | "allgather_partitions": true, 89 | "allgather_bucket_size": 5.0e+8, 90 | "overlap_comm": false, 91 | "reduce_scatter": true, 92 | "reduce_bucket_size": 5.0e+8, 93 | "contiguous_gradients" : false, 94 | "stage3_gather_fp16_weights_on_model_save": true 95 | }, 96 | 97 | "fp16": { 98 | "enabled": true, 99 | "loss_scale": 0, 100 | "loss_scale_window": 1000, 101 | "initial_scale_power": 32, 102 | "hysteresis": 2, 103 | "min_loss_scale": 1 104 | }, 105 | 106 | "optimizer": { 107 | "type": "AdamW", 108 | "params": { 109 | "lr": 5.0e-5, 110 | "betas": [0.9, 0.98], 111 | "eps": 1.0e-8, 112 | "weight_decay": 5.0e-2 113 | } 114 | }, 115 | 116 | 117 | "sparse_attention": { 118 | "mode": "fixed", 119 | "block": 32, 120 | "different_layout_per_head": true, 121 | "num_local_blocks": 16, 122 | "num_global_blocks": 1, 123 | "attention": "bidirectional", 124 | "horizontal_global_attention": true, 125 | "num_different_global_patterns": 4 126 | } 127 | } 128 | 129 | 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/violin_qa.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 2 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | final_num_patches: 6 22 | 23 | qa_type: 'classification' 24 | 25 | WEIGHTS: 26 | model_weight: 'project/lfvila/pretrained/lfvila_stage2.bin' 27 | stage1_model_weight: '' 28 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 29 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 30 | pretrained_2d: True 31 | 32 | DATA: 33 | BATCH_SIZE_per_gpu: 12 34 | NUM_WORKERS: 8 35 | PIN_MEMORY: True 36 | 37 | sample_frame: 32 38 | sample_clip: 4 39 | input_res: [192, 320] 40 | center_crop: 200 41 | 42 | max_num_subtitle: 4 43 | 44 | classification_labels: 2 45 | 46 | DATASET_train: { 47 | 'name': 'QADataset-train', 48 | 'type': 'ViolinDataset', 49 | 'metadata_dir': 'datasets/lfvila_data/task/violin/violin_train.jsonl', 50 | 'video_path': 'datasets/violin/violin_video' 51 | } 52 | 53 | DATASET_val: [{ 54 | 'name': 'QADataset-val', 55 | 'type': 'ViolinDataset', 56 | 'metadata_dir': 'datasets/lfvila_data/task/violin/violin_test.jsonl', 57 | 'video_path': 'datasets/violin/violin_video' 58 | }] 59 | 60 | 61 | TRAINING: 62 | EPOCHS: 100 63 | WARMUP_EPOCHS: 10 64 | WARMUP_LR: 0. 
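# With 'NAME': 'linear', build_scheduler in src/optimization/lr_scheduler.py uses LinearLRScheduler,
# which decays the base LR linearly to lr_min_rate (1%) of its initial value over the full run;
# DECAY_EPOCHS is only read by the 'step' schedule and has no effect here.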
65 | LR_SCHEDULER: { 66 | 'NAME': 'linear', 67 | 'DECAY_EPOCHS': 10, 68 | } 69 | 70 | 71 | use_mlm: false 72 | 73 | weight_decay: 0.1 74 | 75 | save_dir: "project/lfvila/lfvila_save/violin" 76 | checkpoint_step: 10000 77 | save_step: 5000 78 | print_step: 100 79 | eval_step: 500 80 | 81 | deepspeed_config: { 82 | "train_micro_batch_size_per_gpu": 12, 83 | "gradient_accumulation_steps": 1, 84 | "steps_per_print": 500, 85 | 86 | 87 | "zero_optimization": { 88 | "stage": 2, 89 | "allgather_partitions": true, 90 | "allgather_bucket_size": 5.0e+8, 91 | "overlap_comm": false, 92 | "reduce_scatter": true, 93 | "reduce_bucket_size": 5.0e+8, 94 | "contiguous_gradients" : false, 95 | "stage3_gather_fp16_weights_on_model_save": true 96 | }, 97 | 98 | "fp16": { 99 | "enabled": true, 100 | "loss_scale": 0, 101 | "loss_scale_window": 1000, 102 | "initial_scale_power": 32, 103 | "hysteresis": 2, 104 | "min_loss_scale": 1 105 | }, 106 | 107 | "optimizer": { 108 | "type": "AdamW", 109 | "params": { 110 | "lr": 5.0e-5, 111 | "betas": [0.9, 0.98], 112 | "eps": 1.0e-8, 113 | "weight_decay": 5.0e-2 114 | } 115 | }, 116 | 117 | 118 | "sparse_attention": { 119 | "mode": "fixed", 120 | "block": 32, 121 | "different_layout_per_head": true, 122 | "num_local_blocks": 16, 123 | "num_global_blocks": 1, 124 | "attention": "bidirectional", 125 | "horizontal_global_attention": true, 126 | "num_different_global_patterns": 4 127 | } 128 | } 129 | 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/actnet_qa.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 2 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | final_num_patches: 6 22 | 23 | 24 | qa_type: 'classification' 25 | 26 | WEIGHTS: 27 | model_weight: 'project/lfvila/pretrained/lfvila_stage2.bin' 28 | stage1_model_weight: '' 29 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 30 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 31 | pretrained_2d: True 32 | 33 | DATA: 34 | BATCH_SIZE_per_gpu: 16 35 | NUM_WORKERS: 4 36 | PIN_MEMORY: True 37 | 38 | sample_frame: 32 39 | sample_clip: 4 40 | input_res: [192, 320] 41 | center_crop: 200 42 | 43 | 44 | classification_labels: 1654 45 | 46 | DATASET_train: { 47 | 'name': 'QADataset-train', 48 | 'type': 'ActnetQADataset', 49 | 'metadata_dir': 'datasets/lfvila_data/task/actnet_qa/train.jsonl', 50 | 'video_path': 'datasets/activitynet/actnet_video' 51 | } 52 | 53 | DATASET_val: [{ 54 | 'name': 'QADataset-val', 55 | 'type': 'ActnetQADataset', 56 | 'metadata_dir': 'datasets/lfvila_data/task/actnet_qa/test.jsonl', 57 | 'video_path': 'datasets/activitynet/actnet_video' 58 | }] 59 | 60 | 61 | TRAINING: 62 | EPOCHS: 100 63 | WARMUP_EPOCHS: 10 64 | WARMUP_LR: 0. 
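# Cosine schedule: after the 10-epoch warmup from WARMUP_LR, the LR is annealed from its base value
# down to MIN_LR below over the remaining epochs of the 100-epoch run (CosineLRScheduler in
# src/optimization/lr_scheduler.py, with t_in_epochs=False, i.e. per-step updates).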
65 | MIN_LR: 1.0e-8 66 | LR_SCHEDULER: { 67 | 'NAME': 'cosine', 68 | 'DECAY_EPOCHS': 10 69 | } 70 | 71 | use_mlm: false 72 | 73 | weight_decay: 0.1 74 | 75 | save_dir: "project/lfvila/lfvila_save/actnetqa" 76 | checkpoint_step: 10000 77 | save_step: 5000 78 | print_step: 100 79 | eval_step: 500 80 | 81 | deepspeed_config: { 82 | "train_micro_batch_size_per_gpu": 16, 83 | "gradient_accumulation_steps": 1, 84 | "steps_per_print": 500, 85 | 86 | 87 | "zero_optimization": { 88 | "stage": 2, 89 | "allgather_partitions": true, 90 | "allgather_bucket_size": 5.0e+8, 91 | "overlap_comm": false, 92 | "reduce_scatter": true, 93 | "reduce_bucket_size": 5.0e+8, 94 | "contiguous_gradients" : false, 95 | "stage3_gather_fp16_weights_on_model_save": true 96 | }, 97 | 98 | "fp16": { 99 | "enabled": true, 100 | "loss_scale": 0, 101 | "loss_scale_window": 1000, 102 | "initial_scale_power": 32, 103 | "hysteresis": 2, 104 | "min_loss_scale": 1 105 | }, 106 | 107 | "optimizer": { 108 | "type": "AdamW", 109 | "params": { 110 | "lr": 5.0e-5, 111 | "betas": [0.9, 0.98], 112 | "eps": 1.0e-8, 113 | "weight_decay": 5.0e-2 114 | } 115 | }, 116 | 117 | 118 | "sparse_attention": { 119 | "mode": "fixed", 120 | "block": 32, 121 | "different_layout_per_head": true, 122 | "num_local_blocks": 16, 123 | "num_global_blocks": 1, 124 | "attention": "bidirectional", 125 | "horizontal_global_attention": true, 126 | "num_different_global_patterns": 4 127 | } 128 | } 129 | 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /LF-VILA/src/optimization/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from timm.scheduler.cosine_lr import CosineLRScheduler 3 | from timm.scheduler.step_lr import StepLRScheduler 4 | from timm.scheduler.scheduler import Scheduler 5 | 6 | 7 | def build_scheduler(config, optimizer, n_iter_per_epoch): 8 | num_steps = int(config.TRAINING.EPOCHS * n_iter_per_epoch) 9 | warmup_steps = int(config.TRAINING.WARMUP_EPOCHS * n_iter_per_epoch) 10 | decay_steps = int(config.TRAINING.LR_SCHEDULER.DECAY_EPOCHS * n_iter_per_epoch) 11 | 12 | lr_scheduler = None 13 | if config.TRAINING.LR_SCHEDULER.NAME == 'cosine': 14 | lr_scheduler = CosineLRScheduler( 15 | optimizer, 16 | t_initial=num_steps, 17 | t_mul=1., 18 | lr_min=config.TRAINING.MIN_LR, 19 | warmup_lr_init=config.TRAINING.WARMUP_LR, 20 | warmup_t=warmup_steps, 21 | cycle_limit=1, 22 | t_in_epochs=False, 23 | ) 24 | elif config.TRAINING.LR_SCHEDULER.NAME == 'linear': 25 | lr_scheduler = LinearLRScheduler( 26 | optimizer, 27 | t_initial=num_steps, 28 | lr_min_rate=0.01, 29 | warmup_lr_init=config.TRAINING.WARMUP_LR, 30 | warmup_t=warmup_steps, 31 | t_in_epochs=False, 32 | ) 33 | elif config.TRAINING.LR_SCHEDULER.NAME == 'step': 34 | lr_scheduler = StepLRScheduler( 35 | optimizer, 36 | decay_t=decay_steps, 37 | decay_rate=config.TRAINING.LR_SCHEDULER.DECAY_RATE, 38 | warmup_lr_init=config.TRAINING.WARMUP_LR, 39 | warmup_t=warmup_steps, 40 | t_in_epochs=False, 41 | ) 42 | 43 | return lr_scheduler 44 | 45 | 46 | class LinearLRScheduler(Scheduler): 47 | def __init__(self, 48 | optimizer: torch.optim.Optimizer, 49 | t_initial: int, 50 | lr_min_rate: float, 51 | warmup_t=0, 52 | warmup_lr_init=0., 53 | t_in_epochs=True, 54 | noise_range_t=None, 55 | noise_pct=0.67, 56 | noise_std=1.0, 57 | noise_seed=42, 58 | initialize=True, 59 | ) -> None: 60 | super().__init__( 61 | optimizer, param_group_field="lr", 62 | noise_range_t=noise_range_t, 
noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed, 63 | initialize=initialize) 64 | 65 | self.t_initial = t_initial 66 | self.lr_min_rate = lr_min_rate 67 | self.warmup_t = warmup_t 68 | self.warmup_lr_init = warmup_lr_init 69 | self.t_in_epochs = t_in_epochs 70 | if self.warmup_t: 71 | self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values] 72 | super().update_groups(self.warmup_lr_init) 73 | else: 74 | self.warmup_steps = [1 for _ in self.base_values] 75 | 76 | def _get_lr(self, t): 77 | if t < self.warmup_t: 78 | lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps] 79 | else: 80 | t = t - self.warmup_t 81 | total_t = self.t_initial - self.warmup_t 82 | lrs = [v - ((v - v * self.lr_min_rate) * (t / total_t)) for v in self.base_values] 83 | return lrs 84 | 85 | def get_epoch_values(self, epoch: int): 86 | if self.t_in_epochs: 87 | return self._get_lr(epoch) 88 | else: 89 | return None 90 | 91 | def get_update_values(self, num_updates: int): 92 | if not self.t_in_epochs: 93 | return self._get_lr(num_updates) 94 | else: 95 | return None -------------------------------------------------------------------------------- /LF-VILA/src/configs/didemo_ret.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 1 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | 22 | WEIGHTS: 23 | model_weight: 'project/lfvila/pretrained/lfvila_stage1.bin' 24 | stage1_model_weight: '' 25 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 26 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 27 | pretrained_2d: True 28 | 29 | DATA: 30 | BATCH_SIZE_per_gpu: 16 31 | NUM_WORKERS: 12 32 | PIN_MEMORY: True 33 | 34 | sample_frame: 32 35 | sample_clip: 4 36 | input_res: [192, 320] 37 | center_crop: 200 38 | 39 | 40 | DATASET_train: { 41 | 'name': 'RetrievalDataset-train', 42 | 'type': 'RetrievalDataset', 43 | 'metadata_dir': 'datasets/lfvila_data/task/didemo/train.jsonl', 44 | 'video_path': 'datasets/didemo/didemo_video' 45 | } 46 | 47 | DATASET_val: [{ 48 | 'name': 'RetrievalDataset-val', 49 | 'type': 'RetrievalDataset', 50 | 'metadata_dir': 'datasets/lfvila_data/task/didemo/test.jsonl', 51 | 'video_path': 'datasets/didemo/didemo_video' 52 | } 53 | ] 54 | 55 | 56 | TRAINING: 57 | save_feats: 0 58 | do_eval2: false 59 | EPOCHS: 20 60 | WARMUP_EPOCHS: 1 61 | WARMUP_LR: 0. 
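# Step schedule: the LR is scaled by DECAY_RATE (0.25) every DECAY_EPOCHS (5) of the 20-epoch
# fine-tune, after a 1-epoch warmup. Note that DATA.BATCH_SIZE_per_gpu (16) mirrors
# train_micro_batch_size_per_gpu in the deepspeed_config block further down.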
62 | LR_SCHEDULER: { 63 | 'NAME': 'step', 64 | 'DECAY_EPOCHS': 5, 65 | 'DECAY_RATE': 0.25 66 | } 67 | 68 | use_mlm: false 69 | 70 | ct_global_loss_weight: 1 71 | 72 | temp: 0.05 73 | weight_decay: 0.05 74 | save_dir: "project/lfvila/lfvila_save/didemoret" 75 | checkpoint_step: 20000 76 | save_step: 10000 77 | print_step: 25 78 | eval_step: 25 79 | 80 | deepspeed_config: { 81 | "train_micro_batch_size_per_gpu": 16, 82 | "gradient_accumulation_steps": 1, 83 | "steps_per_print": 500, 84 | 85 | 86 | "zero_optimization": { 87 | "stage": 2, 88 | "allgather_partitions": true, 89 | "allgather_bucket_size": 5.0e+8, 90 | "overlap_comm": false, 91 | "reduce_scatter": true, 92 | "reduce_bucket_size": 5.0e+8, 93 | "contiguous_gradients" : false, 94 | "stage3_gather_fp16_weights_on_model_save": true 95 | }, 96 | 97 | "fp16": { 98 | "enabled": true, 99 | "loss_scale": 0, 100 | "loss_scale_window": 1000, 101 | "initial_scale_power": 32, 102 | "hysteresis": 2, 103 | "min_loss_scale": 1 104 | }, 105 | 106 | "optimizer": { 107 | "type": "AdamW", 108 | "params": { 109 | "lr": 5.0e-5, 110 | "betas": [0.9, 0.98], 111 | "eps": 1.0e-8, 112 | "weight_decay": 5.0e-2 113 | } 114 | }, 115 | 116 | 117 | "sparse_attention": { 118 | "mode": "fixed", 119 | "block": 32, 120 | "different_layout_per_head": true, 121 | "num_local_blocks": 16, 122 | "num_global_blocks": 1, 123 | "attention": "bidirectional", 124 | "horizontal_global_attention": true, 125 | "num_different_global_patterns": 4 126 | } 127 | } 128 | 129 | 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /LF-VILA/src/utils/dist.py: -------------------------------------------------------------------------------- 1 | import torch.distributed as dist 2 | import torch 3 | import math 4 | 5 | def master_process(args): 6 | return (dist.get_rank() == 0) or (args.distributed == False) 7 | 8 | @torch.no_grad() 9 | def concat_all_gather(tensor): 10 | """ 11 | Performs all_gather operation on the provided tensors. 12 | *** Warning ***: torch.distributed.all_gather has no gradient. 13 | """ 14 | tensors_gather = [torch.ones_like(tensor) 15 | for _ in range(torch.distributed.get_world_size())] 16 | dist.all_gather(tensors_gather, tensor, async_op=False) 17 | 18 | output = torch.cat(tensors_gather, dim=0) 19 | return output 20 | 21 | class SyncFunction(torch.autograd.Function): 22 | 23 | @staticmethod 24 | def forward(ctx, tensor): 25 | ctx.batch_size = tensor.shape[0] 26 | 27 | gathered_tensor = [torch.zeros_like(tensor) for _ in range(torch.distributed.get_world_size())] 28 | 29 | torch.distributed.all_gather(gathered_tensor, tensor) 30 | gathered_tensor = torch.cat(gathered_tensor, 0) 31 | 32 | return gathered_tensor 33 | 34 | @staticmethod 35 | def backward(ctx, grad_output): 36 | grad_input = grad_output.clone() 37 | torch.distributed.all_reduce(grad_input, op=torch.distributed.ReduceOp.SUM, async_op=False) 38 | 39 | idx_from = torch.distributed.get_rank() * ctx.batch_size 40 | idx_to = (torch.distributed.get_rank() + 1) * ctx.batch_size 41 | return grad_input[idx_from:idx_to] 42 | 43 | 44 | class SequentialDistributedSampler(torch.utils.data.sampler.Sampler): 45 | """ 46 | Distributed Sampler that subsamples indices sequentially, making it easier to collate all results at the end. 47 | 48 | Even though we only use this sampler for eval and predict (no training), which means that the model params won't 49 | have to be synced (i.e. 
will not hang for synchronization even if varied number of forward passes), we still add 50 | extra samples to the sampler to make it evenly divisible (like in `DistributedSampler`) to make it easy to `gather` 51 | or `reduce` resulting tensors at the end of the loop. 52 | """ 53 | 54 | def __init__(self, dataset, num_replicas=None, rank=None): 55 | if num_replicas is None: 56 | if not torch.distributed.is_available(): 57 | raise RuntimeError("Requires distributed package to be available") 58 | num_replicas = torch.distributed.get_world_size() 59 | if rank is None: 60 | if not torch.distributed.is_available(): 61 | raise RuntimeError("Requires distributed package to be available") 62 | rank = torch.distributed.get_rank() 63 | self.dataset = dataset 64 | self.num_replicas = num_replicas 65 | self.rank = rank 66 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 67 | self.total_size = self.num_samples * self.num_replicas 68 | 69 | def __iter__(self): 70 | indices = list(range(len(self.dataset))) 71 | 72 | # add extra samples to make it evenly divisible 73 | indices += indices[: (self.total_size - len(indices))] 74 | assert ( 75 | len(indices) == self.total_size 76 | ), f"Indices length {len(indices)} and total size {self.total_size} mismatched" 77 | 78 | # subsample 79 | indices = indices[self.rank * self.num_samples : (self.rank + 1) * self.num_samples] 80 | assert ( 81 | len(indices) == self.num_samples 82 | ), f"Indices length {len(indices)} and sample number {self.num_samples} mismatched" 83 | 84 | return iter(indices) 85 | 86 | def __len__(self): 87 | return self.num_samples -------------------------------------------------------------------------------- /LF-VILA/src/configs/cmovie_ret.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 1 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | 22 | 23 | WEIGHTS: 24 | model_weight: 'project/lfvila/pretrained/lfvila_stage1.bin' 25 | stage1_model_weight: '' 26 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 27 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 28 | pretrained_2d: True 29 | 30 | DATA: 31 | BATCH_SIZE_per_gpu: 16 32 | NUM_WORKERS: 12 33 | PIN_MEMORY: True 34 | 35 | sample_frame: 32 36 | sample_clip: 1 37 | input_res: [192, 320] 38 | center_crop: 200 39 | 40 | 41 | DATASET_train: { 42 | 'name': 'RetrievalDataset-train', 43 | 'type': 'RetrievalDataset', 44 | 'metadata_dir': 'datasets/lfvila_data/task/cmovie/train.jsonl', 45 | 'video_path': 'datasets/CondensedMovies/cmovie_video' 46 | } 47 | 48 | DATASET_val: [{ 49 | 'name': 'RetrievalDataset-val', 50 | 'type': 'RetrievalDataset', 51 | 'metadata_dir': 'datasets/lfvila_data/task/cmovie/val.jsonl', 52 | 'video_path': 'datasets/CondensedMovies/cmovie_video' 53 | } 54 | ] 55 | 56 | 57 | TRAINING: 58 | save_feats: 0 59 | do_eval2: false 60 | EPOCHS: 20 61 | WARMUP_EPOCHS: 1 62 | WARMUP_LR: 0. 
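# Aggressive step decay for this dataset: with DECAY_EPOCHS: 1 and DECAY_RATE: 0.5 the LR is halved
# every epoch after the 1-epoch warmup, i.e. roughly base_lr * 0.5^k after k epochs.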
63 | LR_SCHEDULER: { 64 | 'NAME': 'step', 65 | 'DECAY_EPOCHS': 1, 66 | 'DECAY_RATE': 0.5 67 | } 68 | 69 | use_mlm: false 70 | 71 | ct_global_loss_weight: 1 72 | 73 | temp: 0.05 74 | weight_decay: 0.05 75 | save_dir: "project/lfvila/lfvila_save/cmovieret" 76 | checkpoint_step: 20000 77 | save_step: 10000 78 | print_step: 100 79 | eval_step: 100 80 | 81 | deepspeed_config: { 82 | "train_micro_batch_size_per_gpu": 16, 83 | "gradient_accumulation_steps": 1, 84 | "steps_per_print": 500, 85 | 86 | 87 | "zero_optimization": { 88 | "stage": 2, 89 | "allgather_partitions": true, 90 | "allgather_bucket_size": 5.0e+8, 91 | "overlap_comm": false, 92 | "reduce_scatter": true, 93 | "reduce_bucket_size": 5.0e+8, 94 | "contiguous_gradients" : false, 95 | "stage3_gather_fp16_weights_on_model_save": true 96 | }, 97 | 98 | "fp16": { 99 | "enabled": true, 100 | "loss_scale": 0, 101 | "loss_scale_window": 1000, 102 | "initial_scale_power": 32, 103 | "hysteresis": 2, 104 | "min_loss_scale": 1 105 | }, 106 | 107 | "optimizer": { 108 | "type": "AdamW", 109 | "params": { 110 | "lr": 5.0e-5, 111 | "betas": [0.9, 0.98], 112 | "eps": 1.0e-8, 113 | "weight_decay": 5.0e-2 114 | } 115 | }, 116 | 117 | 118 | "sparse_attention": { 119 | "mode": "fixed", 120 | "block": 32, 121 | "different_layout_per_head": true, 122 | "num_local_blocks": 16, 123 | "num_global_blocks": 1, 124 | "attention": "bidirectional", 125 | "horizontal_global_attention": true, 126 | "num_different_global_patterns": 4 127 | } 128 | } 129 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/actnet_ret.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 1 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | 22 | WEIGHTS: 23 | model_weight: 'project/lfvila/pretrained/lfvila_stage1.bin' 24 | stage1_model_weight: '' 25 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 26 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 27 | pretrained_2d: True 28 | 29 | DATA: 30 | BATCH_SIZE_per_gpu: 16 31 | NUM_WORKERS: 6 32 | NUM_WORKERS: 6 33 | PIN_MEMORY: True 34 | 35 | sample_frame: 32 36 | sample_clip: 4 37 | input_res: [192, 320] 38 | center_crop: 200 39 | 40 | 41 | DATASET_train: { 42 | 'name': 'RetrievalDataset-train', 43 | 'type': 'RetrievalDataset', 44 | 'metadata_dir': 'datasets/lfvila_data/task/actnet/train.jsonl', 45 | 'video_path': 'datasets/activitynet/actnet_video' 46 | } 47 | 48 | DATASET_val: [{ 49 | 'name': 'RetrievalDataset-val', 50 | 'type': 'RetrievalDataset', 51 | 'metadata_dir': 'datasets/lfvila_data/task/actnet/val1.jsonl', 52 | 'video_path': 'datasets/activitynet/actnet_video' 53 | } 54 | ] 55 | 56 | 57 | TRAINING: 58 | save_feats: 0 59 | do_eval2: false 60 | EPOCHS: 20 61 | WARMUP_EPOCHS: 1 62 | WARMUP_LR: 0. 
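# Essentially the same stage-1 retrieval recipe as queryd_ret.yaml and didemo_ret.yaml
# (step decay x0.25 every 5 of 20 epochs after a 1-epoch warmup); mainly the dataset paths,
# save_dir and the print/eval cadence differ.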
63 | LR_SCHEDULER: { 64 | 'NAME': 'step', 65 | 'DECAY_EPOCHS': 5, 66 | 'DECAY_RATE': 0.25 67 | } 68 | 69 | use_mlm: false 70 | 71 | ct_global_loss_weight: 1 72 | 73 | temp: 0.05 74 | weight_decay: 0.05 75 | save_dir: "project/lfvila/lfvila_save/actnetret" 76 | checkpoint_step: 20000 77 | save_step: 10000 78 | print_step: 100 79 | eval_step: 100 80 | 81 | deepspeed_config: { 82 | "train_micro_batch_size_per_gpu": 16, 83 | "gradient_accumulation_steps": 1, 84 | "steps_per_print": 500, 85 | 86 | 87 | "zero_optimization": { 88 | "stage": 2, 89 | "allgather_partitions": true, 90 | "allgather_bucket_size": 5.0e+8, 91 | "overlap_comm": false, 92 | "reduce_scatter": true, 93 | "reduce_bucket_size": 5.0e+8, 94 | "contiguous_gradients" : false, 95 | "stage3_gather_fp16_weights_on_model_save": true 96 | }, 97 | 98 | "fp16": { 99 | "enabled": true, 100 | "loss_scale": 0, 101 | "loss_scale_window": 1000, 102 | "initial_scale_power": 32, 103 | "hysteresis": 2, 104 | "min_loss_scale": 1 105 | }, 106 | 107 | "optimizer": { 108 | "type": "AdamW", 109 | "params": { 110 | "lr": 5.0e-5, 111 | "betas": [0.9, 0.98], 112 | "eps": 1.0e-8, 113 | "weight_decay": 5.0e-2 114 | } 115 | }, 116 | 117 | 118 | "sparse_attention": { 119 | "mode": "fixed", 120 | "block": 32, 121 | "different_layout_per_head": true, 122 | "num_local_blocks": 16, 123 | "num_global_blocks": 1, 124 | "attention": "bidirectional", 125 | "horizontal_global_attention": true, 126 | "num_different_global_patterns": 4 127 | } 128 | } 129 | 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/coin_cls.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 1 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | 22 | 23 | WEIGHTS: 24 | model_weight: 'project/lfvila/pretrained/lfvila_stage1.bin' 25 | stage1_model_weight: '' 26 | bert_weight: '' 27 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 28 | pretrained_2d: True 29 | 30 | DATA: 31 | BATCH_SIZE_per_gpu: 16 32 | NUM_WORKERS: 12 33 | PIN_MEMORY: True 34 | 35 | sample_frame: 32 36 | sample_clip: 4 37 | input_res: [192, 320] 38 | center_crop: 200 39 | 40 | classification_labels: 180 41 | 42 | tokenizer_dir: 'project/lfvideo/pretrained/bert-large-uncased/' 43 | 44 | DATASET_train: { 45 | 'name': 'VideoClassificationDataset-train', 46 | 'type': 'VideoClassificationDataset', 47 | 'metadata_dir': 'datasets/lfvila_data/task/COIN/coin_train.jsonl', 48 | 'video_path': 'datasets/COIN/coin_video' 49 | } 50 | 51 | DATASET_val: [{ 52 | 'name': 'VideoClassificationDataset-val', 53 | 'type': 'VideoClassificationDataset', 54 | 'metadata_dir': 'datasets/lfvila_data/task/COIN/coin_test.jsonl', 55 | 'video_path': 'datasets/COIN/coin_video' 56 | } 57 | ] 58 | 59 | 60 | TRAINING: 61 | save_feats: 0 62 | only_val: 0 63 | EPOCHS: 500 64 | WARMUP_EPOCHS: 1 65 | WARMUP_LR: 0. 
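# VideoClassificationDataset (src/datasets/video_classification_dataset.py) reads each sample's
# integer class id from the 'recipe_type' field of the metadata jsonl, so every line is expected
# to look roughly like (hypothetical values):
#   {"video_id": "xxxx", "recipe_type": 17}
# with recipe_type in [0, classification_labels) for the 180 classes configured above.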
66 | LR_SCHEDULER: { 67 | 'NAME': 'linear', 68 | 'DECAY_EPOCHS': 10, 69 | } 70 | 71 | use_mlm: false 72 | 73 | 74 | temp: 0.05 75 | weight_decay: 0.05 76 | save_dir: "project/lfvideo/lfvideo_save/coin" 77 | checkpoint_step: 20000 78 | save_step: 10000 79 | print_step: 50 80 | eval_step: 200 81 | 82 | deepspeed_config: { 83 | "train_micro_batch_size_per_gpu": 16, 84 | "gradient_accumulation_steps": 1, 85 | "steps_per_print": 500, 86 | 87 | 88 | "zero_optimization": { 89 | "stage": 2, 90 | "allgather_partitions": true, 91 | "allgather_bucket_size": 5.0e+8, 92 | "overlap_comm": false, 93 | "reduce_scatter": true, 94 | "reduce_bucket_size": 5.0e+8, 95 | "contiguous_gradients" : false, 96 | "stage3_gather_fp16_weights_on_model_save": true 97 | }, 98 | 99 | "fp16": { 100 | "enabled": true, 101 | "loss_scale": 0, 102 | "loss_scale_window": 1000, 103 | "initial_scale_power": 32, 104 | "hysteresis": 2, 105 | "min_loss_scale": 1 106 | }, 107 | 108 | "optimizer": { 109 | "type": "AdamW", 110 | "params": { 111 | "lr": 5.0e-5, 112 | "betas": [0.9, 0.98], 113 | "eps": 1.0e-8, 114 | "weight_decay": 5.0e-2 115 | } 116 | }, 117 | 118 | 119 | "sparse_attention": { 120 | "mode": "fixed", 121 | "block": 32, 122 | "different_layout_per_head": true, 123 | "num_local_blocks": 16, 124 | "num_global_blocks": 1, 125 | "attention": "bidirectional", 126 | "horizontal_global_attention": true, 127 | "num_different_global_patterns": 4 128 | } 129 | } 130 | 131 | 132 | 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/lvu_scene_cls.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 1 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | 22 | 23 | WEIGHTS: 24 | model_weight: 'project/lfvila/saved_model/lfvila_stage1.bin' 25 | stage1_model_weight: '' 26 | bert_weight: '' 27 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 28 | pretrained_2d: True 29 | 30 | DATA: 31 | BATCH_SIZE_per_gpu: 16 32 | NUM_WORKERS: 12 33 | PIN_MEMORY: True 34 | 35 | sample_frame: 32 36 | sample_clip: 4 37 | input_res: [192, 320] 38 | center_crop: 200 39 | 40 | classification_labels: 6 41 | 42 | tokenizer_dir: 'project/lfvila/pretrained/bert-large-uncased/' 43 | 44 | DATASET_train: { 45 | 'name': 'VideoClassificationDataset-train', 46 | 'type': 'VideoClassificationDataset', 47 | 'metadata_dir': 'datasets/lfvila_data/task/LVU_movieclips/scene_train.jsonl', 48 | 'video_path': 'datasets/LVU_movieclips/lvu_movieclips_video' 49 | } 50 | 51 | DATASET_val: [{ 52 | 'name': 'VideoClassificationDataset-val', 53 | 'type': 'VideoClassificationDataset', 54 | 'metadata_dir': 'datasets/lfvila_data/task/LVU_movieclips/scene_test.jsonl', 55 | 'video_path': 'datasets/LVU_movieclips/lvu_movieclips_video' 56 | } 57 | ] 58 | 59 | 60 | TRAINING: 61 | save_feats: 0 62 | only_val: 0 63 | EPOCHS: 500 64 | WARMUP_EPOCHS: 1 65 | WARMUP_LR: 0. 
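# Judging from the flag name, only_val: 0 above runs normal training, while 1 would run evaluation
# only. With the linear schedule, the LR ends at 1% of its base value at the end of the 500-epoch
# budget (LinearLRScheduler, lr_min_rate=0.01).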
66 | LR_SCHEDULER: { 67 | 'NAME': 'linear', 68 | 'DECAY_EPOCHS': 10, 69 | } 70 | 71 | use_mlm: false 72 | 73 | temp: 0.05 74 | weight_decay: 0.05 75 | save_dir: "project/lfvila/lfvila_save/lvu_scene" 76 | checkpoint_step: 20000 77 | save_step: 10000 78 | print_step: 10 79 | eval_step: 50 80 | 81 | deepspeed_config: { 82 | "train_micro_batch_size_per_gpu": 16, 83 | "gradient_accumulation_steps": 1, 84 | "steps_per_print": 500, 85 | 86 | 87 | "zero_optimization": { 88 | "stage": 2, 89 | "allgather_partitions": true, 90 | "allgather_bucket_size": 5.0e+8, 91 | "overlap_comm": false, 92 | "reduce_scatter": true, 93 | "reduce_bucket_size": 5.0e+8, 94 | "contiguous_gradients" : false, 95 | "stage3_gather_fp16_weights_on_model_save": true 96 | }, 97 | 98 | "fp16": { 99 | "enabled": true, 100 | "loss_scale": 0, 101 | "loss_scale_window": 1000, 102 | "initial_scale_power": 32, 103 | "hysteresis": 2, 104 | "min_loss_scale": 1 105 | }, 106 | 107 | "optimizer": { 108 | "type": "AdamW", 109 | "params": { 110 | "lr": 5.0e-5, 111 | "betas": [0.9, 0.98], 112 | "eps": 1.0e-8, 113 | "weight_decay": 5.0e-2 114 | } 115 | }, 116 | 117 | 118 | "sparse_attention": { 119 | "mode": "fixed", 120 | "block": 32, 121 | "different_layout_per_head": true, 122 | "num_local_blocks": 16, 123 | "num_global_blocks": 1, 124 | "attention": "bidirectional", 125 | "horizontal_global_attention": true, 126 | "num_different_global_patterns": 4 127 | } 128 | } 129 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/lvu_relationship_cls.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 1 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | 22 | 23 | WEIGHTS: 24 | model_weight: 'project/lfvila/saved_model/lfvila_stage1.bin' 25 | stage1_model_weight: '' 26 | bert_weight: '' 27 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 28 | pretrained_2d: True 29 | 30 | DATA: 31 | BATCH_SIZE_per_gpu: 16 32 | NUM_WORKERS: 12 33 | PIN_MEMORY: True 34 | 35 | sample_frame: 32 36 | sample_clip: 4 37 | input_res: [192, 320] 38 | center_crop: 200 39 | 40 | classification_labels: 4 41 | 42 | tokenizer_dir: 'project/lfvila/pretrained/bert-large-uncased/' 43 | 44 | DATASET_train: { 45 | 'name': 'VideoClassificationDataset-train', 46 | 'type': 'VideoClassificationDataset', 47 | 'metadata_dir': 'datasets/lfvila_data/task/LVU_movieclips/relationship_train.jsonl', 48 | 'video_path': 'datasets/LVU_movieclips/lvu_movieclips_video' 49 | } 50 | 51 | DATASET_val: [{ 52 | 'name': 'VideoClassificationDataset-val', 53 | 'type': 'VideoClassificationDataset', 54 | 'metadata_dir': 'datasets/lfvila_data/task/LVU_movieclips/relationship_test.jsonl', 55 | 'video_path': 'datasets/LVU_movieclips/lvu_movieclips_video' 56 | } 57 | ] 58 | 59 | 60 | TRAINING: 61 | save_feats: 0 62 | only_val: 0 63 | EPOCHS: 500 64 | WARMUP_EPOCHS: 1 65 | WARMUP_LR: 0. 
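# Same LVU classification setup as lvu_scene_cls.yaml / lvu_wayspeaking_cls.yaml; the differences
# are classification_labels (4 relationship classes), the metadata/save paths, and the very
# frequent logging/eval cadence (print_step and eval_step of 5 below).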
66 | LR_SCHEDULER: { 67 | 'NAME': 'linear', 68 | 'DECAY_EPOCHS': 10, 69 | } 70 | 71 | use_mlm: false 72 | 73 | temp: 0.05 74 | weight_decay: 0.05 75 | save_dir: "project/lfvila/lfvila_save/lvu_relation" 76 | checkpoint_step: 20000 77 | save_step: 10000 78 | print_step: 5 79 | eval_step: 5 80 | 81 | deepspeed_config: { 82 | "train_micro_batch_size_per_gpu": 16, 83 | "gradient_accumulation_steps": 1, 84 | "steps_per_print": 500, 85 | 86 | 87 | "zero_optimization": { 88 | "stage": 2, 89 | "allgather_partitions": true, 90 | "allgather_bucket_size": 5.0e+8, 91 | "overlap_comm": false, 92 | "reduce_scatter": true, 93 | "reduce_bucket_size": 5.0e+8, 94 | "contiguous_gradients" : false, 95 | "stage3_gather_fp16_weights_on_model_save": true 96 | }, 97 | 98 | "fp16": { 99 | "enabled": true, 100 | "loss_scale": 0, 101 | "loss_scale_window": 1000, 102 | "initial_scale_power": 32, 103 | "hysteresis": 2, 104 | "min_loss_scale": 1 105 | }, 106 | 107 | "optimizer": { 108 | "type": "AdamW", 109 | "params": { 110 | "lr": 5.0e-5, 111 | "betas": [0.9, 0.98], 112 | "eps": 1.0e-8, 113 | "weight_decay": 5.0e-2 114 | } 115 | }, 116 | 117 | 118 | "sparse_attention": { 119 | "mode": "fixed", 120 | "block": 32, 121 | "different_layout_per_head": true, 122 | "num_local_blocks": 16, 123 | "num_global_blocks": 1, 124 | "attention": "bidirectional", 125 | "horizontal_global_attention": true, 126 | "num_different_global_patterns": 4 127 | } 128 | } 129 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/how2_qa.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | bert_config: "src/configs/bert_large_config.json" 15 | stage: 2 16 | type_vocab_size: 8 17 | num_local_layers: 8 18 | stage1_layers: 12 19 | bert_frozen_stage: -1 20 | final_num_patches: 6 21 | use_simple_merge_qas: false 22 | 23 | qa_type: 'multichoice' 24 | 25 | WEIGHTS: 26 | model_weight: 'project/lfvila/pretrained/lfvila_stage2.bin' 27 | stage1_model_weight: '' 28 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 29 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 30 | pretrained_2d: True 31 | 32 | subtitle_fuse: false 33 | 34 | DATA: 35 | BATCH_SIZE_per_gpu: 4 36 | NUM_WORKERS: 8 37 | PIN_MEMORY: True 38 | 39 | sample_frame: 32 40 | sample_clip: 4 41 | input_res: [192, 320] 42 | center_crop: 200 43 | 44 | use_subtitle: true 45 | max_num_subtitle: 6 46 | max_text_lenght: 50 47 | 48 | DATASET_train: { 49 | 'name': 'QADataset-train', 50 | 'type': 'How2QADataset', 51 | 'metadata_dir': 'datasets/lfvila_data/task/how2qa/how2qa_train.jsonl', 52 | 'video_path': 'datasets/how2qa/how2qa_video' 53 | } 54 | 55 | DATASET_val: [{ 56 | 'name': 'QADataset-val', 57 | 'type': 'How2QADataset', 58 | 'metadata_dir': 'datasets/lfvila_data/task/how2qa/how2qa_val.jsonl', 59 | 'video_path': 'datasets/how2qa/how2qa_video' 60 | }] 61 | 62 | 63 | TRAINING: 64 | EPOCHS: 30 65 | WARMUP_EPOCHS: 10 66 | WARMUP_LR: 0. 
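# Multiple-choice QA schedule: 30 epochs with a comparatively long 10-epoch warmup. The span keys
# below (use_span_loss / span_loss_weight) enable an auxiliary span loss with weight 1.0 alongside
# the QA objective.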
67 | LR_SCHEDULER: { 68 | 'NAME': 'linear', 69 | 'DECAY_EPOCHS': 10, 70 | } 71 | 72 | use_span_loss: true 73 | span_loss_weight: 1.0 74 | 75 | use_mlm: false 76 | 77 | weight_decay: 0.05 78 | 79 | save_dir: "project/lfvila/lfvila_save/how2qa" 80 | checkpoint_step: 10000 81 | save_step: 5000 82 | print_step: 100 83 | eval_step: 500 84 | 85 | deepspeed_config: { 86 | "train_micro_batch_size_per_gpu": 4, 87 | "gradient_accumulation_steps": 1, 88 | "steps_per_print": 500, 89 | 90 | 91 | "zero_optimization": { 92 | "stage": 2, 93 | "allgather_partitions": true, 94 | "allgather_bucket_size": 5.0e+8, 95 | "overlap_comm": false, 96 | "reduce_scatter": true, 97 | "reduce_bucket_size": 5.0e+8, 98 | "contiguous_gradients" : false, 99 | "stage3_gather_fp16_weights_on_model_save": true 100 | }, 101 | 102 | "fp16": { 103 | "enabled": true, 104 | "loss_scale": 0, 105 | "loss_scale_window": 1000, 106 | "initial_scale_power": 32, 107 | "hysteresis": 2, 108 | "min_loss_scale": 1 109 | }, 110 | 111 | "optimizer": { 112 | "type": "AdamW", 113 | "params": { 114 | "lr": 5.0e-5, 115 | "betas": [0.9, 0.98], 116 | "eps": 1.0e-8, 117 | "weight_decay": 5.0e-2 118 | } 119 | }, 120 | 121 | 122 | "sparse_attention": { 123 | "mode": "fixed", 124 | "block": 32, 125 | "different_layout_per_head": true, 126 | "num_local_blocks": 16, 127 | "num_global_blocks": 1, 128 | "attention": "bidirectional", 129 | "horizontal_global_attention": true, 130 | "num_different_global_patterns": 4 131 | } 132 | } 133 | 134 | 135 | 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/lvu_wayspeaking_cls.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 1 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: -1 21 | 22 | 23 | WEIGHTS: 24 | model_weight: 'project/lfvila/saved_model/lfvila_stage1.bin' 25 | stage1_model_weight: '' 26 | bert_weight: '' 27 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 28 | pretrained_2d: True 29 | 30 | DATA: 31 | BATCH_SIZE_per_gpu: 16 32 | NUM_WORKERS: 12 33 | PIN_MEMORY: True 34 | 35 | sample_frame: 32 36 | sample_clip: 4 37 | input_res: [192, 320] 38 | center_crop: 200 39 | 40 | classification_labels: 5 41 | 42 | tokenizer_dir: 'project/lfvila/pretrained/bert-large-uncased/' 43 | 44 | DATASET_train: { 45 | 'name': 'VideoClassificationDataset-train', 46 | 'type': 'VideoClassificationDataset', 47 | 'metadata_dir': 'datasets/lfvila_data/task/LVU_movieclips/wayspeaking_train.jsonl', 48 | 'video_path': 'datasets/LVU_movieclips/lvu_movieclips_video' 49 | } 50 | 51 | DATASET_val: [{ 52 | 'name': 'VideoClassificationDataset-val', 53 | 'type': 'VideoClassificationDataset', 54 | 'metadata_dir': 'datasets/lfvila_data/task/LVU_movieclips/wayspeaking_test.jsonl', 55 | 'video_path': 'datasets/LVU_movieclips/lvu_movieclips_video' 56 | } 57 | ] 58 | 59 | 60 | TRAINING: 61 | save_feats: 0 62 | only_val: 0 63 | EPOCHS: 500 64 | WARMUP_EPOCHS: 1 65 | WARMUP_LR: 0. 
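# Mixed precision: the deepspeed_config block below enables fp16 with dynamic loss scaling
# ("loss_scale": 0 means dynamic in DeepSpeed) and ZeRO stage 2, which partitions optimizer states
# and gradients across data-parallel ranks.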
66 | LR_SCHEDULER: { 67 | 'NAME': 'linear', 68 | 'DECAY_EPOCHS': 10, 69 | } 70 | 71 | use_mlm: false 72 | 73 | temp: 0.05 74 | weight_decay: 0.05 75 | save_dir: "project/lfvila/lfvila_save/lvu_wayspeaking" 76 | checkpoint_step: 20000 77 | save_step: 10000 78 | print_step: 10 79 | eval_step: 10 80 | 81 | deepspeed_config: { 82 | "train_micro_batch_size_per_gpu": 16, 83 | "gradient_accumulation_steps": 1, 84 | "steps_per_print": 500, 85 | 86 | 87 | "zero_optimization": { 88 | "stage": 2, 89 | "allgather_partitions": true, 90 | "allgather_bucket_size": 5.0e+8, 91 | "overlap_comm": false, 92 | "reduce_scatter": true, 93 | "reduce_bucket_size": 5.0e+8, 94 | "contiguous_gradients" : false, 95 | "stage3_gather_fp16_weights_on_model_save": true 96 | }, 97 | 98 | "fp16": { 99 | "enabled": true, 100 | "loss_scale": 0, 101 | "loss_scale_window": 1000, 102 | "initial_scale_power": 32, 103 | "hysteresis": 2, 104 | "min_loss_scale": 1 105 | }, 106 | 107 | "optimizer": { 108 | "type": "AdamW", 109 | "params": { 110 | "lr": 5.0e-5, 111 | "betas": [0.9, 0.98], 112 | "eps": 1.0e-8, 113 | "weight_decay": 5.0e-2 114 | } 115 | }, 116 | 117 | 118 | "sparse_attention": { 119 | "mode": "fixed", 120 | "block": 32, 121 | "different_layout_per_head": true, 122 | "num_local_blocks": 16, 123 | "num_global_blocks": 1, 124 | "attention": "bidirectional", 125 | "horizontal_global_attention": true, 126 | "num_different_global_patterns": 4 127 | } 128 | } 129 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /hd-vila/scripts/process_raw_video/compress_video.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import subprocess 4 | import time 5 | from multiprocessing import cpu_count 6 | import subprocess 7 | import multiprocessing 8 | from joblib import Parallel, delayed 9 | import jsonlines 10 | from tqdm import tqdm 11 | from multiprocessing import Pool 12 | from glob import glob 13 | from decord import VideoReader, cpu 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser(description='video processing') 18 | parser.add_argument('--workdir', default='/data',type=str, help='work dir') 19 | parser.add_argument('--inputdir', default='datasets/msrvtt/videos', type=str, help='inputdir') 20 | parser.add_argument('--outputdir', default='datasets/msrvtt/videos_6fps', type=str, help='outputdir') 21 | parser.add_argument('--vidfile', default='datasets/msrvtt/train.jsonl', type=str, help='video id') 22 | args = parser.parse_args() 23 | return args 24 | 25 | def check_dirs(dirs): 26 | if not os.path.exists(dirs): 27 | print(dirs) 28 | os.makedirs(dirs, exist_ok=True) 29 | 30 | 31 | class CompressVideo(): 32 | def __init__(self, vidfile, workdir, inputdir, outputdir): 33 | self.workdir = workdir 34 | self.vidfile = vidfile 35 | self.inputdir = inputdir 36 | self.outputdir = outputdir 37 | self.vids = self.loadvids() 38 | 39 | def loadvids(self): 40 | vids = [] 41 | with open(os.path.join(self.workdir,self.vidfile), 'r') as f: 42 | for l in jsonlines.Reader(f): 43 | vids.append(l) 44 | return vids 45 | 46 | def run(self, cmd): 47 | proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 48 | out, _ = proc.communicate() 49 | return out.decode('utf-8') 50 | 51 | def compress_single_clip(self,data): 52 | vid = data['clip_id'] 53 | 54 | input_video_path = os.path.join(self.workdir, self.inputdir, '{}.mp4'.format(vid)) 55 | 56 | vr = VideoReader(input_video_path, 
ctx=cpu(0)) 57 | time = len(vr) * vr.get_avg_fps() 58 | 59 | output_video_path = os.path.join(self.workdir,self.outputdir, vid+'.mp4') 60 | check_dirs(os.path.join(self.workdir,self.outputdir)) 61 | 62 | cmd = ['ffmpeg', 63 | '-y', # (optional) overwrite output file if it exists 64 | '-i', input_video_path, 65 | '-max_muxing_queue_size', '9999', 66 | '-r', '6', 67 | output_video_path] 68 | 69 | 70 | self.run(cmd) 71 | 72 | if os.path.isfile(output_video_path): 73 | return vid + '*' + str(len(vr)) 74 | else: 75 | return None 76 | 77 | 78 | def compress_clips(self): 79 | 80 | results = [] 81 | print('start process') 82 | for vid in tqdm(self.vids): 83 | result = self.compress_single_clip(vid) 84 | results.append(result) 85 | print(len(results)) 86 | 87 | 88 | def compress_clips_parallel(self): 89 | num_cores = multiprocessing.cpu_count() 90 | print(num_cores) 91 | print('start process') 92 | results = Parallel(n_jobs=20, backend = 'threading')(delayed(self.compress_single_clip)(v) for v in tqdm(self.vids)) 93 | 94 | results = [x for x in results if x is not None] 95 | 96 | print(len(results)) 97 | 98 | 99 | if __name__ == "__main__": 100 | 101 | args = parse_args() 102 | print(args) 103 | 104 | cpv = CompressVideo(args.vidfile, args.blob_mount_dir, args.inputdir, args.outputdir) 105 | cpv.compress_clips_parallel() 106 | -------------------------------------------------------------------------------- /LF-VILA/src/datasets/video_classification_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import jsonlines 4 | import decord 5 | import lmdb 6 | from decord import VideoReader, cpu 7 | import numpy as np 8 | import torch 9 | from torch.utils.data import Dataset 10 | from torch.utils.data.dataloader import default_collate 11 | from src.utils.logger import LOGGER 12 | 13 | decord.bridge.set_bridge("torch") 14 | 15 | class VideoClassificationDataset(Dataset): 16 | def __init__(self, 17 | cfg, 18 | metadata_dir, 19 | video_path, 20 | sample_frame, 21 | sample_clip, 22 | tokenizer, 23 | transform=None, 24 | return_index=False, 25 | is_train=True, 26 | **kwargs 27 | ): 28 | self.cfg = cfg 29 | self.metadata_dir = metadata_dir 30 | self.transform = transform 31 | self.video_path = video_path 32 | self.return_index = return_index 33 | self.reliable_idx_list = [] 34 | self.sample_frame = sample_frame 35 | 36 | self._load_metadata() 37 | self.is_train = is_train 38 | 39 | def _load_metadata(self): 40 | data = [] 41 | with open(self.metadata_dir) as f: 42 | for l in jsonlines.Reader(f): 43 | data.append(l) 44 | self.metadata = data 45 | 46 | def _read_video(self, video_id, sample_frame_num): 47 | ''' 48 | read frames from long video 49 | args: 50 | video_id: str, 51 | sample_frame_num: frames used 52 | return: 53 | img_arrays: [num_frm, 3, H, W] 54 | chunk_mask: [num_frm, n_clip], , mask for indicating frames belong to each clip 55 | 56 | ''' 57 | 58 | video_path = os.path.join(self.video_path, video_id + '.mp4') 59 | vr = VideoReader(video_path, ctx=cpu(0)) 60 | num_frame = len(vr) 61 | 62 | if self.is_train: 63 | interval = int(num_frame / (sample_frame_num - 1)) 64 | start = np.random.randint(0, interval+1) 65 | end = np.random.randint(num_frame-1-interval, num_frame) 66 | frame_idx = np.linspace(start, end, num=sample_frame_num).astype(int) 67 | else: 68 | frame_idx = np.linspace(0, num_frame-1, num=sample_frame_num).astype(int) 69 | 70 | img_arrays = vr.get_batch(frame_idx) 71 | 72 | img_arrays = img_arrays.float() / 255 
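        # get_batch returns a uint8 tensor of shape (num_frm, H, W, C) (decord's torch bridge is set
        # at module import); dividing by 255 scales pixels to [0, 1] before the permute below
        # rearranges the frames to (num_frm, C, H, W).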
73 | 74 | img_arrays = img_arrays.permute(0, 3, 1, 2) # N,C,H,W 75 | 76 | return img_arrays 77 | 78 | def __len__(self): 79 | return len(self.metadata) 80 | 81 | def __getitem__(self, index): 82 | num_retries = 10 83 | for j in range(num_retries): 84 | try: 85 | item = self.metadata[index] 86 | 87 | video_id = item['video_id'] 88 | 89 | video = self._read_video(video_id, self.sample_frame) 90 | 91 | 92 | label = int(item['recipe_type']) 93 | 94 | if self.transform is not None: 95 | video = self.transform(video) # N, C, H, W 96 | video = video.permute(1, 0, 2, 3) # C, N, H, W 97 | 98 | data = { 99 | 'video_frames': video, # C, N, H, W 100 | 'label': torch.tensor(label) 101 | } 102 | except: 103 | index = random.randint(0, len(self) - 1) 104 | continue 105 | else: 106 | break 107 | 108 | if self.return_index: 109 | data['index'] = torch.tensor(index) 110 | 111 | return data 112 | 113 | 114 | -------------------------------------------------------------------------------- /CLIP-ViP/src/utils/stop_words.py: -------------------------------------------------------------------------------- 1 | """List of stop words.""" 2 | # This list of English stop words is taken from the "Glasgow Information 3 | # Retrieval Group". The original list can be found at 4 | # http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words 5 | ENGLISH_STOP_WORDS = frozenset([ 6 | "a", "about", "above", "across", "actually", "after", "afterwards", "again", 7 | "against", "all", "almost", "alone", "along", "already", "also", "although", 8 | "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", 9 | "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", 10 | "are", "around", "as", "at", "back", "be", "became", "because", "become", 11 | "becomes", "becoming", "been", "before", "beforehand", "behind", "being", 12 | "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", 13 | "but", "by", "call", "can", "cannot", "cant", "can't", "co", "con", "could", 14 | "couldnt", "cry", "de", "describe", "detail", "do", "done", "don't", "down", 15 | "due", "during", "each", "easy", "eg", "eight", "either", "eleven", "else", 16 | "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", 17 | "everything", "everywhere", "except", "few", "fifteen", "fifty", "find", 18 | "fire", "first", "five", "for", "former", "formerly", "forty", "found", 19 | "four", "from", "further", "give", "had", "has", "hasnt", "have", "he", 20 | "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", 21 | "herself", "him", "himself", "his", "how", "however", "hundred", "i", "ie", 22 | "if", "i'm", "i'll", "i've", "in", "inc", "indeed", "interest", "is", "it", 23 | "it'll", "its", "it's", "itself", "just", "keep", "last", "latter", 24 | "latterly", "least", "less", "like", "ltd", "made", "many", "may", "me", 25 | "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", 26 | "much", "must", "my", "myself", "name", "namely", "neither", "never", 27 | "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", 28 | "not", "nothing", "now", "nowhere", "of", "off", "often", "ok", "okay", 29 | "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", 30 | "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", 31 | "please", "put", "rather", "re", "really", "same", "see", "seem", "seemed", 32 | "seeming", "seems", "serious", "several", "she", "should", "show", "side", 33 | "since", "sincere", "six", "sixty", "so", 
"some", "somehow", "someone", 34 | "something", "sometime", "sometimes", "somewhere", "still", "such", "take", 35 | "ten", "than", "thank", "thanks", "that", "that's", "the", "their", "them", 36 | "themselves", "then", "thence", "there", "thereafter", "thereby", 37 | "therefore", "therein", "thereupon", "these", "they", "third", "this", 38 | "those", "though", "three", "through", "throughout", "thru", "thus", "to", 39 | "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", 40 | "un", "until", "up", "upon", "us", "very", "via", "view", "viewing", 41 | "viewer", "was", "we", "we'll", "well", "welcome", "were", "what", 42 | "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", 43 | "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", 44 | "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", 45 | "with", "within", "without", "would", "wont", "won't", "yet", "you", "your", 46 | "yours", "you've", "you'll", "yourself", "yourselves", "youtube", "going", 47 | "want", "right", "you're", "we're", "know", "gonna", "need", "bit", "look", 48 | "yeah", "guys", "sure", "let's", "video", "oh", "let", "today", "they're", 49 | "did", "looks", "different", "great", "different", "say", "um", "probably", 50 | "kind", "doesn't", "does", "maybe", "hey", "we've", "better", "hope", 51 | "there's", "try" 52 | ]) -------------------------------------------------------------------------------- /hd-vila/src/utils/stop_words.py: -------------------------------------------------------------------------------- 1 | """List of stop words.""" 2 | # This list of English stop words is taken from the "Glasgow Information 3 | # Retrieval Group". The original list can be found at 4 | # http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words 5 | ENGLISH_STOP_WORDS = frozenset([ 6 | "a", "about", "above", "across", "actually", "after", "afterwards", "again", 7 | "against", "all", "almost", "alone", "along", "already", "also", "although", 8 | "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", 9 | "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", 10 | "are", "around", "as", "at", "back", "be", "became", "because", "become", 11 | "becomes", "becoming", "been", "before", "beforehand", "behind", "being", 12 | "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", 13 | "but", "by", "call", "can", "cannot", "cant", "can't", "co", "con", "could", 14 | "couldnt", "cry", "de", "describe", "detail", "do", "done", "don't", "down", 15 | "due", "during", "each", "easy", "eg", "eight", "either", "eleven", "else", 16 | "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", 17 | "everything", "everywhere", "except", "few", "fifteen", "fifty", "find", 18 | "fire", "first", "five", "for", "former", "formerly", "forty", "found", 19 | "four", "from", "further", "give", "had", "has", "hasnt", "have", "he", 20 | "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", 21 | "herself", "him", "himself", "his", "how", "however", "hundred", "i", "ie", 22 | "if", "i'm", "i'll", "i've", "in", "inc", "indeed", "interest", "is", "it", 23 | "it'll", "its", "it's", "itself", "just", "keep", "last", "latter", 24 | "latterly", "least", "less", "like", "ltd", "made", "many", "may", "me", 25 | "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", 26 | "much", "must", "my", "myself", "name", "namely", "neither", "never", 27 | "nevertheless", "next", "nine", 
"no", "nobody", "none", "noone", "nor", 28 | "not", "nothing", "now", "nowhere", "of", "off", "often", "ok", "okay", 29 | "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", 30 | "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", 31 | "please", "put", "rather", "re", "really", "same", "see", "seem", "seemed", 32 | "seeming", "seems", "serious", "several", "she", "should", "show", "side", 33 | "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", 34 | "something", "sometime", "sometimes", "somewhere", "still", "such", "take", 35 | "ten", "than", "thank", "thanks", "that", "that's", "the", "their", "them", 36 | "themselves", "then", "thence", "there", "thereafter", "thereby", 37 | "therefore", "therein", "thereupon", "these", "they", "third", "this", 38 | "those", "though", "three", "through", "throughout", "thru", "thus", "to", 39 | "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", 40 | "un", "until", "up", "upon", "us", "very", "via", "view", "viewing", 41 | "viewer", "was", "we", "we'll", "well", "welcome", "were", "what", 42 | "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", 43 | "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", 44 | "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", 45 | "with", "within", "without", "would", "wont", "won't", "yet", "you", "your", 46 | "yours", "you've", "you'll", "yourself", "yourselves", "youtube", "going", 47 | "want", "right", "you're", "we're", "know", "gonna", "need", "bit", "look", 48 | "yeah", "guys", "sure", "let's", "video", "oh", "let", "today", "they're", 49 | "did", "looks", "different", "great", "different", "say", "um", "probably", 50 | "kind", "doesn't", "does", "maybe", "hey", "we've", "better", "hope", 51 | "there's", "try" 52 | ]) -------------------------------------------------------------------------------- /hd-vila-100m/LICENSE: -------------------------------------------------------------------------------- 1 | Research Use of Data Agreement v1.0 2 | 3 | 4 | 5 | This is the Research Use of Data Agreement, Version 1.0 (the “R-UDA”). Capitalized terms are defined in Section 5. Data Provider and you agree as follows: 6 | 7 | 8 | 9 | 10 | 11 | 1. Provision of the Data 12 | 13 | 1.1. You may use, modify, and distribute the Data made available to you by the Data Provider under this R-UDA for Research Use if you follow the R-UDA’s terms. 14 | 15 | 1.2. Data Provider will not sue you or any Downstream Recipient for any claim arising out of the use, modification, or distribution of the Data provided you meet the terms of the R-UDA. 16 | 17 | 1.3. This R-UDA does not restrict your use, modification, or distribution of any portions of the Data that are in the public domain or that may be used, modified, or distributed under any other legal exception or limitation. 18 | 19 | 20 | 21 | 22 | 23 | 2. Restrictions 24 | 25 | 2.1. You agree that you will use the Data solely for Computational Use for non-commercial research. This restriction means that you may engage in non-commercial research activities (including non-commercial research undertaken by or funded via a commercial entity), but you may not use the Data or any Results in any commercial offering, including as part of a product or service (or to improve any product or service) you use or provide to others. 26 | 27 | 2.2. You may not receive money or other consideration in exchange for use or redistribution of Data. 
28 | 29 | 30 | 31 | 32 | 33 | 3. Redistribution of Data 34 | 35 | 3.1. You may redistribute the Data, so long as: 36 | 37 | 3.1.1. You include with any Data you redistribute all credit or attribution information that you received with the Data, and your terms require any Downstream Recipient to do the same; and 38 | 39 | 3.1.2. You bind each recipient to whom you redistribute the Data to the terms of the R-UDA. 40 | 41 | 42 | 43 | 44 | 45 | 4. No Warranty, Limitation of Liability 46 | 47 | 4.1. Data Provider does not represent or warrant that it has any rights whatsoever in the Data. 48 | 49 | 4.2. THE DATA IS PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 50 | 51 | 4.3. NEITHER DATA PROVIDER NOR ANY UPSTREAM DATA PROVIDER SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE DATA OR RESULTS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 52 | 53 | 54 | 55 | 56 | 57 | 5. Definitions 58 | 59 | 5.1. “Computational Use” means activities necessary to enable the use of Data (alone or along with other material) for analysis by a computer. 60 | 61 | 5.2. “Data” means the material you receive under the R-UDA in modified or unmodified form, but not including Results. 62 | 63 | 5.3. “Data Provider” means the source from which you receive the Data and with whom you enter into the R-UDA. 64 | 65 | 5.4. “Downstream Recipient” means any person or persons who receives the Data directly or indirectly from you in accordance with the R-UDA. 66 | 67 | 5.5. “Result” means anything that you develop or improve from your use of Data that does not include more than a de minimis portion of the Data on which the use is based. Results may include de minimis portions of the Data necessary to report on or explain use that has been conducted with the Data, such as figures in scientific papers, but do not include more. Artificial intelligence models trained on Data (and which do not include more than a de minimis portion of Data) are Results. 68 | 69 | 5.6. “Upstream Data Providers” means the source or sources from which the Data Provider directly or indirectly received, under the terms of the R-UDA, material that is included in the Data. 
70 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/pretrain_stage1.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8 11 | } 12 | 13 | 14 | bert_config: "src/configs/bert_large_config.json" 15 | stage: 1 16 | type_vocab_size: 8 17 | num_local_layers: 8 18 | stage1_layers: 12 19 | bert_frozen_stage: -1 20 | 21 | log_tb: true 22 | 23 | 24 | WEIGHTS: 25 | model_weight: '' 26 | stage1_model_weight: '' 27 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 28 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 29 | pretrained_2d: True 30 | 31 | DATA: 32 | use_lmdb_train_data: True 33 | len_lmdb_train_data: 8523237 34 | BATCH_SIZE_per_gpu: 16 35 | NUM_WORKERS: 12 36 | PIN_MEMORY: True 37 | 38 | sample_frame: 32 39 | sample_clip: 4 40 | input_res: [192, 320] 41 | center_crop: 200 42 | 43 | DATASET_train: { 44 | 'name': 'PreTrainDataset-train', 45 | 'type': 'PreTrainDataset', 46 | 'metadata_dir': 'datasets/lfvila_data/pretrain/train_db', 47 | 'video_path': 'datasets/hdvila100m/video_clip_3fps' 48 | } 49 | 50 | DATASET_val: [{ 51 | 'name': 'RetrievalDataset-val', 52 | 'type': 'RetrievalDataset', 53 | 'metadata_dir': 'datasets/lfvila_data/task/actnet/val_s.jsonl', 54 | 'video_path': 'datasets/activitynet/actnet_video' 55 | }, 56 | { 57 | 'name': 'PreTrainDataset-val', 58 | 'type': 'PreTrainDataset', 59 | 'metadata_dir': 'datasets/lfvila_data/pretrain/val.jsonl', 60 | 'video_path': 'datasets/hdvila100m/video_clip_3fps' 61 | } 62 | ] 63 | 64 | 65 | TRAINING: 66 | BREAK_STEP: 10000000000 67 | EPOCHS: 10 68 | WARMUP_EPOCHS: 1 69 | WARMUP_LR: 0. 
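  # The warmup above presumably ramps the learning rate up from WARMUP_LR over WARMUP_EPOCHS;
  # the peak rate itself appears to come from deepspeed_config.optimizer.params.lr (5.0e-5) further below.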
70 | LR_SCHEDULER: { 71 | 'NAME': 'linear', 72 | 'DECAY_EPOCHS': 10, 73 | } 74 | 75 | use_mlm: false 76 | 77 | ct_global_loss_weight: 1 78 | 79 | 80 | use_time_match: true 81 | ct_time_loss_weight: 0.25 82 | num_key: 2 83 | num_value: 2 84 | num_other_neg: 3 85 | time_temp: 0.05 86 | use_mask_equal: false 87 | 88 | 89 | temp: 0.05 90 | weight_decay: 0.05 91 | 92 | save_dir: "project/lfvila/lfvila_save/pretrain_stage1" 93 | checkpoint_step: 4000 94 | save_step: 2000 95 | print_step: 100 96 | eval_step: 500 97 | 98 | deepspeed_config: { 99 | "train_micro_batch_size_per_gpu": 16, 100 | "gradient_accumulation_steps": 1, 101 | "steps_per_print": 500, 102 | 103 | 104 | "zero_optimization": { 105 | "stage": 2, 106 | "allgather_partitions": true, 107 | "allgather_bucket_size": 5.0e+8, 108 | "overlap_comm": false, 109 | "reduce_scatter": true, 110 | "reduce_bucket_size": 5.0e+8, 111 | "contiguous_gradients" : false, 112 | "stage3_gather_fp16_weights_on_model_save": true 113 | }, 114 | 115 | "fp16": { 116 | "enabled": true, 117 | "loss_scale": 0, 118 | "loss_scale_window": 1000, 119 | "initial_scale_power": 32, 120 | "hysteresis": 2, 121 | "min_loss_scale": 1 122 | }, 123 | 124 | "optimizer": { 125 | "type": "AdamW", 126 | "params": { 127 | "lr": 5.0e-5, 128 | "betas": [0.9, 0.98], 129 | "eps": 1.0e-8, 130 | "weight_decay": 5.0e-2 131 | } 132 | }, 133 | 134 | 135 | "sparse_attention": { 136 | "mode": "fixed", 137 | "block": 32, 138 | "different_layout_per_head": true, 139 | "num_local_blocks": 16, 140 | "num_global_blocks": 1, 141 | "attention": "bidirectional", 142 | "horizontal_global_attention": true, 143 | "num_different_global_patterns": 4 144 | } 145 | } 146 | 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # XPretrain 2 | 3 | This repo includes some recent research works in **multi-modality learning**, especially with **pre-training** method from [MSM group](https://www.microsoft.com/en-us/research/group/multimedia-search-and-mining/) of Microsoft Research. 
4 | 5 | ## Multi-modality Learning 6 | 7 | ### ***** Video & Language ***** 8 | 9 | #### Dataset 10 | 11 | > [**HD-VILA-100M dataset**](https://github.com/microsoft/XPretrain/tree/main/hd-vila-100m): high-resolution and diversified video-language dataset 12 | 13 | #### Pre-training model 14 | 15 | > [**HD-VILA (CVPR 2022)**](https://github.com/microsoft/XPretrain/tree/main/hd-vila): high-resolution and diversified video-language pre-training model 16 | 17 | > [**LF-VILA (NeurIPS 2022)**](https://github.com/microsoft/XPretrain/tree/main/LF-VILA): long-form video-language pre-training model 18 | 19 | > [**CLIP-ViP (ICLR 2023)**](https://github.com/microsoft/XPretrain/tree/main/CLIP-ViP): adapting an image-language pre-training model to video-language pre-training 20 | 21 | ### ***** Image & Language ***** 22 | 23 | #### Pre-training model 24 | 25 | > [**Pixel-BERT**](https://arxiv.org/pdf/2004.00849.pdf): end-to-end image and language pre-training model 26 | 27 | > [**SOHO (CVPR 2021 oral)**](https://github.com/researchmm/soho): improved end-to-end image and language pre-training model with quantized visual tokens 28 | 29 | > [**VisualParsing (NeurIPS 2021)**](https://github.com/microsoft/XPretrain/tree/main/visualparsing): Transformer-based end-to-end image and language pre-training model 30 | 31 | ## News 32 | - :smiley:**March, 2023: the code of [**CLIP-ViP**](https://github.com/microsoft/XPretrain/tree/main/CLIP-ViP) and [**LF-VILA**](https://github.com/microsoft/XPretrain/tree/main/LF-VILA) was released.** 33 | - January, 2023: our paper [**CLIP-ViP**](https://github.com/microsoft/XPretrain/tree/main/CLIP-ViP) on adapting an image-language pre-trained model to video-language pre-training was accepted by ICLR 2023. 34 | - September, 2022: our paper [**LF-VILA**](https://github.com/microsoft/XPretrain/tree/main/LF-VILA) on long-form video-language pre-training was accepted by NeurIPS 2022. 35 | - September, 2022: the code of [**HD-VILA**](https://github.com/microsoft/XPretrain/tree/main/hd-vila) was released. 36 | - March, 2022: [**HD-VILA-100M dataset**](https://github.com/microsoft/XPretrain/tree/main/hd-vila-100m) was released publicly. 37 | - March, 2022: [**HD-VILA**](https://github.com/microsoft/XPretrain/tree/main/hd-vila) was accepted by CVPR 2022. 38 | 39 | 40 | ## Contributing 41 | 42 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 43 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 44 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 45 | 46 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 47 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 48 | provided by the bot. You will only need to do this once across all repos using our CLA. 49 | 50 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 51 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 52 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 53 | 54 | ## Trademarks 55 | 56 | This project may contain trademarks or logos for projects, products, or services.
Authorized use of Microsoft 57 | trademarks or logos is subject to and must follow 58 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 59 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 60 | Any use of third-party trademarks or logos are subject to those third-party's policies. 61 | 62 | ## Contact Information 63 | 64 | For help or issues using the pre-trained models, please submit an issue. 65 | For other communications, please contact [Bei Liu]() (`bei.liu@microsoft.com`) and [Jianlong Fu]() (`jianf@microsoft.com`). 66 | -------------------------------------------------------------------------------- /LF-VILA/src/configs/pretrain_stage2.yaml: -------------------------------------------------------------------------------- 1 | VideoEncoder: { 2 | "patch_size": [1,8,8], 3 | "embed_dim": 128, 4 | "depths":[2, 2, 14, 2, 2, 2], 5 | "downsample_stages":[0, 1, 4], 6 | "stages":[0, 1, 2, 2, 2, 3], 7 | "num_heads":[4, 8, 16, 16, 16, 32], 8 | "window_size":[[2,3,5],[4,3,5],[8,3,5],[16,3,5],[16,3,5],[32,3,5]], #time, h, w 9 | "patch_norm": True, 10 | "local_window": 8, 11 | "frozen_stages": 6 12 | } 13 | 14 | 15 | bert_config: "src/configs/bert_large_config.json" 16 | stage: 2 17 | type_vocab_size: 8 18 | num_local_layers: 8 19 | stage1_layers: 12 20 | bert_frozen_stage: 12 21 | final_num_patches: 6 22 | 23 | log_tb: true 24 | 25 | 26 | WEIGHTS: 27 | model_weight: '' 28 | stage1_model_weight: 'project/lfvila/pretrained/lfvila_stage1.bin' 29 | bert_weight: 'project/lfvila/pretrained/bert-large-uncased/pytorch_model.bin' 30 | swin_weight: 'project/lfvila/pretrained/swin/swin_base_patch4_window12_384_22k.pth' 31 | pretrained_2d: True 32 | 33 | DATA: 34 | use_lmdb_train_data: true 35 | len_lmdb_train_data: 8523237 36 | BATCH_SIZE_per_gpu: 48 37 | NUM_WORKERS: 12 38 | PIN_MEMORY: True 39 | 40 | sample_frame: 32 41 | sample_clip: 4 42 | input_res: [192, 320] 43 | center_crop: 200 44 | 45 | 46 | DATASET_train: { 47 | 'name': 'PreTrainDataset-train', 48 | 'type': 'PreTrainDataset', 49 | 'metadata_dir': 'datasets/lfvila_data/pretrain/train_db', 50 | 'video_path': 'datasets/hdvila100m/video_clip_3fps' 51 | } 52 | 53 | DATASET_val: [{ 54 | 'name': 'RetrievalDataset-val', 55 | 'type': 'RetrievalDataset', 56 | 'metadata_dir': 'datasets/lfvila_data/task/actnet/val_s.jsonl', 57 | 'video_path': 'datasets/activitynet/actnet_video' 58 | }, 59 | { 60 | 'name': 'PreTrainDataset-val', 61 | 'type': 'PreTrainDataset', 62 | 'metadata_dir': 'datasets/lfvila_data/pretrain/val.jsonl', 63 | 'video_path': 'datasets/hdvila100m/video_clip_3fps' 64 | } 65 | ] 66 | 67 | 68 | TRAINING: 69 | BREAK_STEP: 10000000000 70 | EPOCHS: 10 71 | WARMUP_EPOCHS: 1 72 | WARMUP_LR: 0. 
73 | LR_SCHEDULER: { 74 | 'NAME': 'linear', 75 | 'DECAY_EPOCHS': 10, 76 | } 77 | 78 | use_mlm: true 79 | mlm_loss_weight: 1 80 | vtm_loss_weight: 10 81 | 82 | ct_global_loss_weight: 1 83 | 84 | 85 | use_time_match: true 86 | ct_time_loss_weight: 0.25 87 | num_key: 2 88 | num_value: 2 89 | num_other_neg: 3 90 | time_temp: 0.05 91 | use_mask_equal: false 92 | 93 | temp: 0.05 94 | weight_decay: 0.05 95 | 96 | save_dir: "project/lfvila/lfvila_save/pretrain_stage2" 97 | checkpoint_step: 2000 98 | save_step: 2000 99 | print_step: 50 100 | eval_step: 250 101 | 102 | deepspeed_config: { 103 | "train_micro_batch_size_per_gpu": 48, 104 | "gradient_accumulation_steps": 1, 105 | "steps_per_print": 500, 106 | 107 | 108 | "zero_optimization": { 109 | "stage": 2, 110 | "allgather_partitions": true, 111 | "allgather_bucket_size": 5.0e+8, 112 | "overlap_comm": false, 113 | "reduce_scatter": true, 114 | "reduce_bucket_size": 5.0e+8, 115 | "contiguous_gradients" : false, 116 | "stage3_gather_fp16_weights_on_model_save": true 117 | }, 118 | 119 | "fp16": { 120 | "enabled": true, 121 | "loss_scale": 0, 122 | "loss_scale_window": 1000, 123 | "initial_scale_power": 32, 124 | "hysteresis": 2, 125 | "min_loss_scale": 1 126 | }, 127 | 128 | "optimizer": { 129 | "type": "AdamW", 130 | "params": { 131 | "lr": 5.0e-5, 132 | "betas": [0.9, 0.98], 133 | "eps": 1.0e-8, 134 | "weight_decay": 5.0e-2 135 | } 136 | }, 137 | 138 | 139 | "sparse_attention": { 140 | "mode": "fixed", 141 | "block": 32, 142 | "different_layout_per_head": true, 143 | "num_local_blocks": 16, 144 | "num_global_blocks": 1, 145 | "attention": "bidirectional", 146 | "horizontal_global_attention": true, 147 | "num_different_global_patterns": 4 148 | } 149 | } 150 | 151 | 152 | 153 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /hd-vila-100m/src/cut_videos.py: -------------------------------------------------------------------------------- 1 | import jsonlines 2 | import os 3 | from tqdm import tqdm 4 | import logging 5 | import argparse 6 | import re 7 | import subprocess 8 | import multiprocessing 9 | from joblib import Parallel, delayed 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser(description='youtube video processing') 14 | parser.add_argument('--workdir', default='./hdvila_100m',type=str, help='Working Directory') 15 | parser.add_argument('--metafile', default='meta_part0.jsonl', type=str, help='youtube video meta') 16 | parser.add_argument('--resultfile', default='cut_part0.jsonl', type=str, help='processed videos') 17 | parser.add_argument('--log', default='log_part0.log', type=str, help='log') 18 | args = parser.parse_args() 19 | return args 20 | 21 | 22 | def check_dirs(dirs): 23 | if not os.path.exists(dirs): 24 | os.makedirs(dirs, exist_ok=True) 25 | 26 | 27 | class Cutvideos(): 28 | def __init__(self, metafile, workdir, resultfile): 29 | self.workdir = workdir 30 | self.metafile = metafile 31 | self.resultfile = resultfile 32 | self.metas = self.loadmetas() 33 | 34 | def loadmetas(self): 35 | metas = [] 36 | with open(self.metafile, 'r') as f: 37 | for l in jsonlines.Reader(f): 38 | metas.append(l) 39 | return metas 40 | 41 | def hhmmss(self, timestamp1, timestamp2): 42 | hh,mm,s = timestamp1.split(':') 43 | ss,ms = s.split('.') 44 | timems1 = 3600*1000*int((hh)) + 60*1000*int(mm) + 1000*int(ss) + int(ms) 45 | hh,mm,s = timestamp2.split(':') 46 | ss,ms = s.split('.') 47 | timems2 = 3600*1000*int((hh)) + 60*1000*int(mm) + 1000*int(ss) + int(ms) 48 | dur = (timems2 
- timems1)/1000 49 | return str(dur) 50 | 51 | def run(self, cmd): 52 | proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 53 | out, _ = proc.communicate() 54 | return out.decode('utf-8') 55 | 56 | def extract_single_clip(self,sb, in_filepath, out_filepath): 57 | cmd = ['ffmpeg', '-ss', sb[0], '-t', self.hhmmss(sb[0], sb[1]),'-accurate_seek', '-i', in_filepath, '-c', 'copy', 58 | '-avoid_negative_ts', '1', '-reset_timestamps', '1', 59 | '-y', '-hide_banner', '-loglevel', 'panic', '-map', '0',out_filepath] 60 | self.run(cmd) 61 | if not os.path.isfile(out_filepath): 62 | raise Exception(f"{out_filepath}: ffmpeg clip extraction failed") 63 | 64 | def extract_clips(self, meta): 65 | clips = meta['clip'] 66 | vid = meta['video_id'] 67 | outfolder = os.path.join(self.workdir,'video_clips', vid) 68 | check_dirs(outfolder) 69 | result = [] 70 | # try: 71 | for c in clips: 72 | self.extract_single_clip(c['span'], os.path.join(self.workdir,'download_videos', vid + '.mp4'), os.path.join(outfolder, c['clip_id'])) 73 | result.append(c['clip_id']) 74 | # except: 75 | # pass 76 | 77 | return result 78 | 79 | def extract_all_clip(self): 80 | results = [] 81 | for v in tqdm(self.metas): 82 | result = self.extract_clips(v) 83 | results.extend(result) 84 | 85 | logger.info(f"Number of clips processed: {len(results)}") 86 | with jsonlines.open(os.path.join(self.workdir, 'cut_video_results', self.resultfile), 'w') as f: 87 | for l in results: 88 | f.write(l) 89 | 90 | 91 | if __name__ == '__main__': 92 | args = parse_args() 93 | 94 | metafile = os.path.join(args.workdir, 'metafiles', args.metafile) 95 | logdir = os.path.join(args.workdir,'cut_video_log') 96 | 97 | check_dirs(os.path.join(args.workdir, 'video_clips')) 98 | check_dirs(os.path.join(args.workdir, 'cut_video_results')) 99 | check_dirs(logdir) 100 | 101 | logging.basicConfig(level=logging.INFO, 102 | filename=os.path.join(logdir, args.log), 103 | datefmt='%Y/%m/%d %H:%M:%S', 104 | format='%(asctime)s - %(name)s - %(levelname)s - %(lineno)d - %(module)s - %(message)s') 105 | 106 | logger = logging.getLogger(__name__) 107 | logger.info(args) 108 | 109 | cvd = Cutvideos(metafile, args.workdir, args.resultfile) 110 | cvd.extract_all_clip() -------------------------------------------------------------------------------- /LF-VILA/src/tasks/run_video_classification.py: -------------------------------------------------------------------------------- 1 | from poplib import LF 2 | import torch 3 | import torch.distributed as dist 4 | import deepspeed 5 | import argparse 6 | import os 7 | from mmcv import Config 8 | from src.models import LFVILA_Video_Classification 9 | 10 | from src.tools import Trainer_Video_Classification 11 | from src.datasets.dataloader import build_dataloader 12 | from src.optimization.lr_scheduler import build_scheduler 13 | from src.optimization.optimizer import build_optimizer_parameters 14 | 15 | from src.utils.logger import LOGGER, add_log_to_file 16 | from src.utils.dist import master_process 17 | from src.utils.misc import mkdirp, set_random_seed 18 | from src.utils.load import load_model_weights_with_mismatch 19 | 20 | def main(): 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--config', default='./src/configs/pretrain_test_stage2.yaml') 24 | parser.add_argument('--blob_mount_dir', default="/blob_mount") 25 | parser.add_argument('--deepspeed_sparse_attention',action='store_true') 26 | parser.add_argument('--local_rank', type=int, default=-1, help='local rank passed from 
distributed launcher') 27 | parser.add_argument('--fp16', action='store_true', help='enable fp16') 28 | parser.add_argument('--seed', type=int, default=42, help='random seed') 29 | parser.add_argument('--distributed',action='store_true') 30 | parser.add_argument('--resume', action='store_true') 31 | # Include DeepSpeed configuration arguments 32 | parser = deepspeed.add_config_arguments(parser) 33 | 34 | args = parser.parse_args() 35 | 36 | set_random_seed(args.seed) 37 | 38 | config = Config.fromfile(args.config) 39 | 40 | 41 | LOGGER.info(config) 42 | LOGGER.info(args) 43 | 44 | if not master_process(args): 45 | LOGGER.disabled = True 46 | if master_process(args): 47 | mkdirp(os.path.join(args.blob_mount_dir, config.TRAINING.save_dir,"log")) 48 | add_log_to_file(os.path.join(args.blob_mount_dir, config.TRAINING.save_dir,"log/log.txt")) 49 | 50 | model = LFVILA_Video_Classification(args, config) 51 | 52 | if config.WEIGHTS.model_weight != '': 53 | LOGGER.info(f"Loading model weights from {config.WEIGHTS.model_weight}") 54 | load_model_weights_with_mismatch(model, os.path.join(args.blob_mount_dir, config.WEIGHTS.model_weight)) 55 | 56 | else: 57 | if config.WEIGHTS.swin_weight != '': 58 | LOGGER.info(f"Loading video encoder weights from {config.WEIGHTS.swin_weight}") 59 | 60 | load_model_weights_with_mismatch(model.video_encoder, 61 | os.path.join(args.blob_mount_dir, config.WEIGHTS.swin_weight), 62 | load_swin=True, 63 | pretrained2d=config.WEIGHTS.pretrained_2d) 64 | 65 | parameter_group = build_optimizer_parameters(config, model) 66 | 67 | 68 | # init deepspeed 69 | 70 | if args.distributed: 71 | 72 | model_engine, optimizer, _, _ = deepspeed.initialize(args = args, 73 | model=model, 74 | model_parameters=parameter_group, 75 | config=config.deepspeed_config 76 | ) 77 | print(dist.get_rank()) 78 | 79 | 80 | LOGGER.info(f'Training with {dist.get_world_size()} gpus') 81 | 82 | 83 | dataset_trains, dataset_vals, dataloader_trains, dataloader_vals = build_dataloader(args, config) 84 | 85 | dataloader_train = dataloader_trains['VideoClassificationDataset-train'] 86 | steps_per_epoch = len(dataloader_train) 87 | scheduler = build_scheduler(config, optimizer, steps_per_epoch) 88 | 89 | args.fp16 = model_engine.fp16_enabled() 90 | if args.fp16: 91 | LOGGER.info('Enable fp16 Training') 92 | 93 | 94 | trainer = Trainer_Video_Classification(args, config, model_engine, optimizer, scheduler, dataloader_train, dataloader_vals['VideoClassificationDataset-val']) 95 | 96 | LOGGER.info('start first evaluate') 97 | 98 | trainer.evaluate(dataloader_vals['VideoClassificationDataset-val']) 99 | 100 | if not config.TRAINING.only_val: 101 | trainer.train(args.resume) 102 | 103 | if __name__ == '__main__': 104 | deepspeed.init_distributed() 105 | main() 106 | 107 | 108 | -------------------------------------------------------------------------------- /LF-VILA/src/datasets/actnet_qa_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import jsonlines 4 | import decord 5 | import lmdb 6 | from decord import VideoReader, cpu 7 | import numpy as np 8 | import torch 9 | from torch.utils.data import Dataset 10 | from torch.utils.data.dataloader import default_collate 11 | from src.utils.logger import LOGGER 12 | 13 | decord.bridge.set_bridge("torch") 14 | 15 | class ActnetQADataset(Dataset): 16 | def __init__(self, 17 | cfg, 18 | metadata_dir, 19 | video_path, 20 | sample_frame, 21 | sample_clip, 22 | tokenizer, 23 | transform=None, 24 | 
is_train=True, 25 | return_rawtext=False, 26 | return_index=False, 27 | **kwargs 28 | ): 29 | self.cfg = cfg 30 | self.metadata_dir = metadata_dir 31 | self.transform = transform 32 | self.video_path = video_path 33 | self.return_rawtext = return_rawtext 34 | self.return_index = return_index 35 | self.reliable_idx_list = [] 36 | self.sample_frame = sample_frame 37 | self.sample_clip = sample_clip 38 | 39 | self._load_metadata() 40 | self.tokenizer = tokenizer 41 | self.is_train = is_train 42 | 43 | def _load_metadata(self): 44 | data = [] 45 | with open(self.metadata_dir) as f: 46 | for l in jsonlines.Reader(f): 47 | data.append(l) 48 | self.metadata = data 49 | 50 | def _read_video(self, video_id, sample_frame_num): 51 | ''' 52 | read frames from long video 53 | args: 54 | video_id: str, 55 | sample_frame_num: frames used 56 | return: 57 | img_arrays: [num_frm, 3, H, W] 58 | chunk_mask: [num_frm, n_clip], , mask for indicating frames belong to each clip 59 | 60 | ''' 61 | video_path = os.path.join(self.video_path, video_id + '.mp4') 62 | vr = VideoReader(video_path, ctx=cpu(0)) 63 | num_frame = len(vr) 64 | 65 | if self.is_train: 66 | interval = int(num_frame / (sample_frame_num - 1)) 67 | start = np.random.randint(0, interval+1) 68 | end = np.random.randint(num_frame-1-interval, num_frame) 69 | frame_idx = np.linspace(start, end, num=sample_frame_num).astype(int) 70 | else: 71 | frame_idx = np.linspace(0, num_frame-1, num=sample_frame_num).astype(int) 72 | 73 | img_arrays = vr.get_batch(frame_idx) 74 | 75 | img_arrays = img_arrays.float() / 255 76 | 77 | img_arrays = img_arrays.permute(0, 3, 1, 2) # N,C,H,W 78 | 79 | return img_arrays 80 | 81 | def tokenize(self, text_q, max_length = 50): 82 | text_q = [text_q] 83 | 84 | encoded_qa = [self.tokenizer(x, padding='max_length', truncation=True, max_length=max_length) for x in text_q] 85 | 86 | text_ids = torch.tensor([x.input_ids for x in encoded_qa]) 87 | attention_mask = torch.tensor([x.attention_mask for x in encoded_qa]) 88 | return text_ids, attention_mask 89 | 90 | def __len__(self): 91 | return len(self.metadata) 92 | 93 | 94 | def __getitem__(self, index): 95 | num_retries = 10 96 | for j in range(num_retries): 97 | try: 98 | item = self.metadata[index] 99 | 100 | clip_id = item['video_name'] 101 | 102 | video = self._read_video(clip_id, self.sample_frame) 103 | 104 | rawtext_q = item['question'] 105 | label_a = item['answer'] 106 | 107 | text_ids, attention_mask = self.tokenize(rawtext_q) 108 | 109 | 110 | if self.transform is not None: 111 | video = self.transform(video) 112 | video = video.permute(1, 0, 2, 3) 113 | 114 | data = { 115 | 'video_frames': video, 116 | 'text_ids': text_ids, 117 | 'attention_mask': attention_mask, 118 | 'label': label_a 119 | } 120 | except: 121 | index = random.randint(0, len(self) - 1) 122 | continue 123 | else: 124 | break 125 | 126 | if self.return_rawtext: 127 | data['rawtext'] = rawtext_q 128 | 129 | if self.return_index: 130 | data['index'] = torch.tensor(index) 131 | 132 | return data 133 | 134 | 135 | -------------------------------------------------------------------------------- /LF-VILA/src/tasks/run_retrieval.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | import deepspeed 4 | import argparse 5 | import os 6 | from mmcv import Config 7 | from src.models import LFVILA_Retrieval 8 | 9 | from src.tools import Trainer_Retrieval 10 | from src.datasets.dataloader import build_dataloader 11 | from 
src.optimization.lr_scheduler import build_scheduler 12 | from src.optimization.optimizer import build_optimizer_parameters 13 | from src.utils.logger import LOGGER, add_log_to_file 14 | from src.utils.dist import master_process 15 | from src.utils.misc import mkdirp, set_random_seed 16 | from src.utils.load import load_model_weights_with_mismatch 17 | 18 | 19 | def main(): 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--config', default='./src/configs/pretrain_test_stage2.yaml') 23 | parser.add_argument('--blob_mount_dir', default="/blob_mount") 24 | parser.add_argument('--deepspeed_sparse_attention',action='store_true') 25 | parser.add_argument('--local_rank', type=int, default=-1, help='local rank passed from distributed launcher') 26 | parser.add_argument('--fp16', action='store_true', help='enable fp16') 27 | parser.add_argument('--seed', type=int, default=42, help='random seed') 28 | parser.add_argument('--distributed',action='store_true') 29 | parser.add_argument('--resume', action='store_true') 30 | parser.add_argument('--only_val', action='store_true') 31 | # Include DeepSpeed configuration arguments 32 | parser = deepspeed.add_config_arguments(parser) 33 | 34 | args = parser.parse_args() 35 | 36 | set_random_seed(args.seed) 37 | 38 | config = Config.fromfile(args.config) 39 | 40 | LOGGER.info(config) 41 | LOGGER.info(args) 42 | 43 | if not master_process(args): 44 | LOGGER.disabled = True 45 | if master_process(args): 46 | mkdirp(os.path.join(args.blob_mount_dir, config.TRAINING.save_dir,"log")) 47 | add_log_to_file(os.path.join(args.blob_mount_dir, config.TRAINING.save_dir,"log/log.txt")) 48 | 49 | model = LFVILA_Retrieval(args, config) 50 | 51 | if config.WEIGHTS.model_weight != '': 52 | LOGGER.info(f"Loading model weights from {config.WEIGHTS.model_weight}") 53 | load_model_weights_with_mismatch(model, os.path.join(args.blob_mount_dir, config.WEIGHTS.model_weight)) 54 | else: 55 | if config.WEIGHTS.swin_weight != '': 56 | LOGGER.info(f"Loading video encoder weights from {config.WEIGHTS.swin_weight}") 57 | 58 | load_model_weights_with_mismatch(model.video_encoder, 59 | os.path.join(args.blob_mount_dir, config.WEIGHTS.swin_weight), 60 | load_swin=True, 61 | pretrained2d=config.WEIGHTS.pretrained_2d) 62 | if config.WEIGHTS.bert_weight != '': 63 | LOGGER.info(f"Loading bert weights from {config.WEIGHTS.bert_weight}") 64 | load_model_weights_with_mismatch(model.text_encoder, os.path.join(args.blob_mount_dir, config.WEIGHTS.bert_weight),load_bert=True) 65 | model._init_sent_embedding() 66 | 67 | parameter_group = build_optimizer_parameters(config, model) 68 | 69 | # init deepspeed 70 | if args.distributed: 71 | 72 | model_engine, optimizer, _, _ = deepspeed.initialize(args = args, 73 | model=model, 74 | model_parameters=parameter_group, 75 | config=config.deepspeed_config 76 | ) 77 | print(dist.get_rank()) 78 | 79 | 80 | LOGGER.info(f'Training with {dist.get_world_size()} gpus') 81 | 82 | dataset_trains, dataset_vals, dataloader_trains, dataloader_vals = build_dataloader(args, config) 83 | 84 | dataloader_train = dataloader_trains['RetrievalDataset-train'] 85 | steps_per_epoch = len(dataloader_train) 86 | scheduler = build_scheduler(config, optimizer, steps_per_epoch) 87 | 88 | args.fp16 = model_engine.fp16_enabled() 89 | if args.fp16: 90 | LOGGER.info('Enable fp16 Training') 91 | 92 | trainer = Trainer_Retrieval(args, config, model_engine, optimizer, scheduler, dataloader_train, dataloader_vals['RetrievalDataset-val']) 93 | 94 | LOGGER.info('start first 
evaluate') 95 | 96 | trainer.evaluate(dataloader_vals['RetrievalDataset-val']) 97 | 98 | if not args.only_val: 99 | trainer.train(args.resume) 100 | 101 | if __name__ == '__main__': 102 | deepspeed.init_distributed() 103 | main() 104 | 105 | -------------------------------------------------------------------------------- /CLIP-ViP/src/optimization/adamw.py: -------------------------------------------------------------------------------- 1 | """ 2 | AdamW optimizer (weight decay fix) 3 | copied from hugginface 4 | """ 5 | import math 6 | 7 | import torch 8 | from torch.optim import Optimizer 9 | 10 | 11 | class AdamW(Optimizer): 12 | """ Implements Adam algorithm with weight decay fix. 13 | Parameters: 14 | lr (float): learning rate. Default 1e-3. 15 | betas (tuple of 2 floats): Adams beta parameters (b1, b2). 16 | Default: (0.9, 0.999) 17 | eps (float): Adams epsilon. Default: 1e-6 18 | weight_decay (float): Weight decay. Default: 0.0 19 | correct_bias (bool): can be set to False to avoid correcting bias 20 | in Adam (e.g. like in Bert TF repository). Default True. 21 | """ 22 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, 23 | weight_decay=0.0, correct_bias=True): 24 | if lr < 0.0: 25 | raise ValueError( 26 | "Invalid learning rate: {} - should be >= 0.0".format(lr)) 27 | if not 0.0 <= betas[0] < 1.0: 28 | raise ValueError("Invalid beta parameter: {} - " 29 | "should be in [0.0, 1.0[".format(betas[0])) 30 | if not 0.0 <= betas[1] < 1.0: 31 | raise ValueError("Invalid beta parameter: {} - " 32 | "should be in [0.0, 1.0[".format(betas[1])) 33 | if not 0.0 <= eps: 34 | raise ValueError("Invalid epsilon value: {} - " 35 | "should be >= 0.0".format(eps)) 36 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, 37 | correct_bias=correct_bias) 38 | super(AdamW, self).__init__(params, defaults) 39 | 40 | def step(self, closure=None): 41 | """Performs a single optimization step. 42 | Arguments: 43 | closure (callable, optional): A closure that reevaluates the model 44 | and returns the loss. 
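        Returns:
            The loss returned by `closure`, or None if no closure is given.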
45 | """ 46 | loss = None 47 | if closure is not None: 48 | loss = closure() 49 | 50 | for group in self.param_groups: 51 | for p in group['params']: 52 | if p.grad is None: 53 | continue 54 | grad = p.grad.data 55 | if grad.is_sparse: 56 | raise RuntimeError( 57 | 'Adam does not support sparse ' 58 | 'gradients, please consider SparseAdam instead') 59 | 60 | state = self.state[p] 61 | 62 | # State initialization 63 | if len(state) == 0: 64 | state['step'] = 0 65 | # Exponential moving average of gradient values 66 | state['exp_avg'] = torch.zeros_like(p.data) 67 | # Exponential moving average of squared gradient values 68 | state['exp_avg_sq'] = torch.zeros_like(p.data) 69 | 70 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 71 | beta1, beta2 = group['betas'] 72 | 73 | state['step'] += 1 74 | 75 | # Decay the first and second moment running average coefficient 76 | # In-place operations to update the averages at the same time 77 | exp_avg.mul_(beta1).add_(1.0 - beta1, grad) 78 | exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad) 79 | denom = exp_avg_sq.sqrt().add_(group['eps']) 80 | 81 | step_size = group['lr'] 82 | if group['correct_bias']: # No bias correction for Bert 83 | bias_correction1 = 1.0 - beta1 ** state['step'] 84 | bias_correction2 = 1.0 - beta2 ** state['step'] 85 | step_size = (step_size * math.sqrt(bias_correction2) 86 | / bias_correction1) 87 | 88 | p.data.addcdiv_(-step_size, exp_avg, denom) 89 | 90 | # Just adding the square of the weights to the loss function is 91 | # *not* the correct way of using L2 regularization/weight decay 92 | # with Adam, since that will interact with the m and v 93 | # parameters in strange ways. 94 | # 95 | # Instead we want to decay the weights in a manner that doesn't 96 | # interact with the m/v parameters. This is equivalent to 97 | # adding the square of the weights to the loss with plain 98 | # (non-momentum) SGD. 
99 | # Add weight decay at the end (fixed version) 100 | if group['weight_decay'] > 0.0: 101 | p.data.add_(-group['lr'] * group['weight_decay'], p.data) 102 | 103 | return loss 104 | -------------------------------------------------------------------------------- /CLIP-ViP/src/modeling/VidCLIP.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from functools import partial 4 | from transformers.models.clip.configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig 5 | from src.modeling.CLIP_ViP import CLIPModel, clip_loss 6 | from src.modeling.CLIP import CLIPModel as CLIP 7 | 8 | class VidCLIP(nn.Module): 9 | def __init__(self, args): 10 | super(VidCLIP, self).__init__() 11 | clipconfig = CLIPConfig.from_pretrained(args.clip_config) 12 | setattr(clipconfig, "vision_additional_config", args.clip_vision_additional_config) 13 | self.vision_additional_config = args.clip_vision_additional_config 14 | if args.clip_weights: 15 | if self.vision_additional_config.type == "ViP": 16 | self.clipmodel = CLIPModel.from_pretrained(args.clip_weights, config=clipconfig) 17 | else: 18 | self.clipmodel = CLIP.from_pretrained(args.clip_weights, config=clipconfig) 19 | else: 20 | if self.vision_additional_config.type == "ViP": 21 | self.clipmodel = CLIPModel(clipconfig) 22 | else: 23 | self.clipmodel = CLIP(clipconfig) 24 | 25 | # init logit scale from 26 | logit_scale_value = self.vision_additional_config.logit_scale_init_value 27 | self.clipmodel.logit_scale.data.fill_(logit_scale_value) 28 | 29 | def overload_logit_scale(self, overload_logit_scale): 30 | self.clipmodel.logit_scale.data.fill_(overload_logit_scale) 31 | 32 | def forward(self, video, text_input_ids, text_input_mask, \ 33 | image=None, caption_ids=None, caption_masks=None): 34 | """ 35 | video [B, n_clips*num_frms, C, H, W] 36 | text_input_ids [B, L] 37 | text_input_mask [B, L] 38 | image [B, img_num, C, H, W] 39 | caption_ids [B, img_num, L] 40 | caption_masks [B, img_num, L] 41 | """ 42 | B, N, C, H, W = video.shape 43 | 44 | if self.vision_additional_config.type == "ViP": 45 | inputs = {"input_ids": text_input_ids, 46 | "attention_mask": text_input_mask, 47 | "pixel_values": video, 48 | "return_loss": False} 49 | outputs = self.clipmodel(**inputs) 50 | results = {} 51 | results["text_features"] = outputs["text_embeds"] 52 | results["vis_features"] = outputs["image_embeds"] 53 | # results["loss"] = outputs["loss"] 54 | else: 55 | video = video.reshape(-1, C, H, W) 56 | inputs = {"input_ids": text_input_ids, 57 | "attention_mask": text_input_mask, 58 | "pixel_values": video} 59 | outputs = self.clipmodel(**inputs) 60 | vis_features = outputs["vision_model_output"][1] 61 | 62 | vis_features = self.clipmodel.visual_projection(vis_features) 63 | vis_features = vis_features / vis_features.norm(dim=-1, keepdim=True) 64 | vis_features = vis_features.reshape(B, N, -1).mean(1) 65 | vis_features = vis_features / vis_features.norm(dim=-1, keepdim=True) 66 | 67 | results = {} 68 | results["text_features"] = outputs["text_embeds"] 69 | results["vis_features"] = vis_features 70 | if image is not None: 71 | B, img_num, C, H, W = image.shape 72 | L = caption_ids.shape[-1] 73 | inputs = {"input_ids": caption_ids.reshape(-1, L), 74 | "attention_mask": caption_masks.reshape(-1, L), 75 | "pixel_values": image.reshape(-1, 1, C, H, W), 76 | "return_loss": False} 77 | outputs = self.clipmodel(**inputs) 78 | results["img_features"] = outputs["image_embeds"] 79 | 
results["cap_features"] = outputs["text_embeds"] 80 | 81 | return results 82 | 83 | def forward_video(self, video): 84 | inputs = {"pixel_values": video, 85 | "if_norm": True} 86 | video_features = self.clipmodel.get_image_features(**inputs) 87 | return video_features 88 | 89 | def forward_text(self, text_input_ids, text_input_mask): 90 | inputs = {"input_ids": text_input_ids, 91 | "attention_mask": text_input_mask, 92 | "if_norm": True} 93 | text_features = self.clipmodel.get_text_features(**inputs) 94 | return text_features 95 | 96 | def freeze_text_encoder(self, freeze_text_proj): 97 | freeze_list = [self.clipmodel.text_model] 98 | if freeze_text_proj: 99 | freeze_list.append(self.clipmodel.text_projection) 100 | for m in freeze_list: 101 | m.eval() 102 | for param in m.parameters(): 103 | param.requires_grad = False 104 | 105 | --------------------------------------------------------------------------------