├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── config ├── FT_only_configs │ ├── train-all-multitask-8gpu.json │ ├── train-caption-multitask-8gpu.json │ ├── train-how2qa-8gpu.json │ ├── train-how2qa_sub_only-8gpu.json │ ├── train-how2qa_video_only-8gpu.json │ ├── train-how2r-8gpu.json │ ├── train-how2r_sub_only-8gpu.json │ ├── train-how2r_video_only-8gpu.json │ ├── train-qa-multitask-8gpu.json │ ├── train-retrieval-multitask-8gpu.json │ ├── train-tv_domain-multitask-8gpu.json │ ├── train-tvc-8gpu.json │ ├── train-tvc_sub_only-8gpu.json │ ├── train-tvc_video_only_8gpu.json │ ├── train-tvqa-8gpu.json │ ├── train-tvqa_sub_only-8gpu.json │ ├── train-tvqa_video_only-8gpu.json │ ├── train-tvr-8gpu.json │ ├── train-tvr_sub_only-8gpu.json │ ├── train-tvr_video_only-8gpu.json │ ├── train-vatex_en_c-8gpu.json │ ├── train-vatex_en_c_sub_only-8gpu.json │ ├── train-vatex_en_c_video_only-8gpu.json │ ├── train-vatex_en_r-8gpu.json │ ├── train-vatex_en_r_sub_only-8gpu.json │ ├── train-vatex_en_r_video_only-8gpu.json │ ├── train-violin-8gpu.json │ ├── train-violin_sub_only-8gpu.json │ ├── train-violin_video_only-8gpu.json │ ├── train-vlep-8gpu.json │ ├── train-vlep_sub_only-8gpu.json │ ├── train-vlep_video_only-8gpu.json │ ├── train-yc2c-8gpu.json │ ├── train-yc2c_sub_only-8gpu.json │ ├── train-yc2c_video_only-8gpu.json │ ├── train-yc2r-4gpu.json │ ├── train-yc2r_sub_only-4gpu.json │ ├── train-yc2r_video_only-4gpu.json │ └── train-youtube_domain-multitask-8gpu.json ├── config.py ├── model_config │ ├── hero_finetune.json │ ├── hero_pretrain.json │ ├── hero_videoCap.json │ ├── video_sub_feature_add_finetune.json │ ├── video_sub_feature_concat_finetune.json │ └── video_sub_sequence_finetune.json ├── pretrain-tv-16gpu.json ├── train-all-multitask-8gpu.json ├── train-caption-multitask-8gpu.json ├── train-how2qa-8gpu.json ├── train-how2r-8gpu.json ├── train-qa-multitask-8gpu.json ├── train-retrieval-multitask-8gpu.json ├── train-tv_domain-multitask-8gpu.json ├── 
train-tvc-8gpu.json ├── train-tvqa-8gpu.json ├── train-tvr-8gpu.json ├── train-vatex_en_c-8gpu.json ├── train-vatex_en_r-8gpu.json ├── train-violin-8gpu.json ├── train-vlep-8gpu.json ├── train-yc2c-8gpu.json ├── train-yc2r-4gpu.json └── train-youtube_domain-multitask-8gpu.json ├── data ├── __init__.py ├── data.py ├── fom.py ├── loader.py ├── mfm.py ├── mlm.py ├── tvc.py ├── vcmr.py ├── videoCap.py ├── videoQA.py ├── violin.py ├── vlep.py ├── vr.py └── vsm.py ├── eval ├── pycocoevalcap │ ├── README.md │ ├── __init__.py │ ├── bleu │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── bleu.py │ │ └── bleu_scorer.py │ ├── cider │ │ ├── __init__.py │ │ ├── cider.py │ │ └── cider_scorer.py │ ├── license.txt │ ├── meteor │ │ ├── __init__.py │ │ ├── meteor.py │ │ └── tests │ │ │ └── test_meteor.py │ ├── rouge │ │ ├── __init__.py │ │ └── rouge.py │ └── tokenizer │ │ ├── __init__.py │ │ └── ptbtokenizer.py ├── tvc.py ├── vatex_en_c.py └── yc2c.py ├── eval_vcmr.py ├── eval_videoQA.py ├── eval_violin.py ├── eval_vr.py ├── inf_tvc.py ├── inf_vatex_en_c.py ├── inf_yc2c.py ├── launch_container.sh ├── load_data.py ├── model ├── __init__.py ├── embed.py ├── encoder.py ├── layers.py ├── model.py ├── modeling_utils.py ├── multitask.py ├── pretrain.py ├── vcmr.py ├── videoCap.py ├── videoQA.py └── vr.py ├── optim ├── __init__.py ├── adamw.py ├── misc.py └── sched.py ├── scripts ├── collect_video_feature_paths.py ├── convert_video_db_single_feature.py ├── convert_videodb.py ├── create_txtdb.sh ├── download_all.sh ├── download_how2.sh ├── download_pretrained.sh ├── download_tvc.sh ├── download_tvqa.sh ├── download_tvr.sh ├── download_vatex_en.sh ├── download_violin.sh ├── download_vlep.sh ├── download_yc2.sh ├── prepro_query.py ├── prepro_sub.py ├── prepro_tvc.py └── prepro_tvc.sh ├── train_all_multitask.py ├── train_captioning.py ├── train_qa.py ├── train_retrieval.py ├── two_stream_eval ├── eval_vcmr.py ├── eval_videoQA.py ├── eval_violin.py ├── eval_vr.py ├── inf_tvc.py ├── inf_vatex_en_c.py 
├── inf_yc2c.py └── videocap_generator.py └── utils ├── __init__.py ├── basic_utils.py ├── const.py ├── distributed.py ├── logger.py ├── misc.py ├── save.py ├── tvr_eval_utils.py └── tvr_standalone_eval.py /.gitignore: -------------------------------------------------------------------------------- 1 | # philly 2 | philly/ 3 | .pt* 4 | .amlt* 5 | # ctags 6 | tags 7 | 8 | # capeval tmp files 9 | eval/pycocoevalcap/tokenizer/tmp* 10 | demo/ 11 | 12 | # compiled files # 13 | __pycache__ 14 | *.pyc 15 | 16 | # Packages # 17 | ############ 18 | # it's better to unpack these files and commit the raw source 19 | # git has its own built in compression methods 20 | *.7z 21 | *.dmg 22 | *.gz 23 | *.iso 24 | *.jar 25 | *.rar 26 | *.tar 27 | *.zip 28 | 29 | # Logs and databases # 30 | ###################### 31 | *.log 32 | *.sql 33 | *.sqlite 34 | .ipynb_checkpoints/ 35 | *.swp 36 | *.vscode/ 37 | *.idea/ 38 | 39 | # OS generated files # 40 | ###################### 41 | .DS_Store 42 | .DS_Store? 43 | ._* 44 | .Spotlight-V100 45 | .Trashes 46 | ehthumbs.db 47 | Thumbs.db 48 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:19.10-py3 2 | 3 | # basic python packages 4 | RUN pip install transformers==2.0.0 \ 5 | tensorboardX==1.7 ipdb==0.12 lz4==2.1.9 lmdb==0.97 6 | 7 | ####### horovod for multi-GPU (distributed) training ####### 8 | # horovod 9 | RUN HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_NCCL_LINK=SHARED HOROVOD_WITH_PYTORCH=1 \ 10 | pip install --no-cache-dir horovod==0.18.2 &&\ 11 | ldconfig 12 | 13 | # ssh 14 | RUN apt-get update &&\ 15 | apt-get install -y --no-install-recommends openssh-client openssh-server &&\ 16 | mkdir -p /var/run/sshd 17 | 18 | # Allow OpenSSH to talk to containers without asking for confirmation 19 | RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ 20 | echo 
" StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ 21 | mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 22 | 23 | # captioning 24 | 25 | # captioning eval tool (java for PTBtokenizer and METEOR) 26 | RUN apt-get install -y --no-install-recommends openjdk-8-jdk && apt-get clean 27 | 28 | # binaries for cococap eval 29 | ARG PYCOCOEVALCAP=https://github.com/tylin/coco-caption/raw/master/pycocoevalcap 30 | RUN mkdir /workspace/cococap_bin/ && \ 31 | wget $PYCOCOEVALCAP/meteor/meteor-1.5.jar -P /workspace/cococap_bin/ && \ 32 | wget $PYCOCOEVALCAP/meteor/data/paraphrase-en.gz -P /workspace/cococap_bin/ && \ 33 | wget $PYCOCOEVALCAP/tokenizer/stanford-corenlp-3.4.1.jar -P /workspace/cococap_bin/ 34 | 35 | # add new command here 36 | 37 | WORKDIR /src 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 VALUE Benchmark Starter Code 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-caption-multitask-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoCap", 5 | "name": "tvc_video_sub_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | "cap_txt_db": ["/txt/tvc_train.db"], 9 | "batch_size": 4, 10 | "ratio": 2 11 | }, 12 | { 13 | "task": "videoCap", 14 | "name": "vatex_en_c_video_sub_train", 15 | "sub_txt_db": "/txt/vatex_subtitles.db", 16 | "vfeat_db": "/video/vatex", 17 | "cap_txt_db": ["/txt/vatex_en_r_train.db", "/txt/vatex_en_r_val.db"], 18 | "batch_size": 128, 19 | "ratio": 2 20 | }, 21 | { 22 | "task": "videoCap", 23 | "name": "yc2c_video_sub_train", 24 | "sub_txt_db": "/txt/yc2_subtitles.db", 25 | "vfeat_db": "/video/yc2", 26 | "cap_txt_db": ["/txt/yc2r_train.db"], 27 | "batch_size": 16, 28 | "ratio": 1 29 | } 30 | ], 31 | "val_datasets": [ 32 | { 33 | "task": "videoCap", 34 | "name": "tvc_video_sub_val", 35 | "sub_txt_db": "/txt/tv_subtitles.db", 36 | "vfeat_db": "/video/tv", 37 | "batch_size": 8, 38 | "gt_anno": "/txt/tvc_val_release.jsonl" 39 | }, 40 | { 41 | "task": "videoCap", 42 | "name": "vatex_en_c_video_sub_val", 43 | "sub_txt_db": "/txt/vatex_subtitles.db", 44 | "vfeat_db": "/video/vatex", 45 | "batch_size": 128, 46 | "gt_anno": "/txt/vatex_en_c_test_public_release.jsonl" 47 | }, 48 | { 49 | "task": "videoCap", 50 | "name": "yc2c_video_sub_val", 51 | "sub_txt_db": "/txt/yc2_subtitles.db", 52 | "vfeat_db": "/video/yc2", 53 | "batch_size": 16, 54 | "gt_anno": "/txt/yc2c_val_release.jsonl" 55 | } 56 | ], 57 | 
"compressed_db": false, 58 | "model_config": "config/model_config/hero_videoCap.json", 59 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 60 | "load_partial_pretrained": true, 61 | "skip_layer_loading": true, 62 | "output_dir": "/storage/MT_FT_only/caption_multi-task_default", 63 | "max_clip_len": 100, 64 | "max_txt_len": 60, 65 | "max_cap_per_vid": -1, 66 | "max_gen_step": 30, 67 | "vfeat_version": "resnet_slowfast", 68 | "vfeat_interval": 1.5, 69 | "train_batch_size": 4, 70 | "val_batch_size": 8, 71 | "gradient_accumulation_steps": 1, 72 | "learning_rate": 1e-4, 73 | "lr_mul": 10.0, 74 | "valid_steps": 500, 75 | "num_train_steps": 30000, 76 | "optim": "adamw", 77 | "betas": [0.9, 0.98], 78 | "lsr": 0.1, 79 | "dropout": 0.1, 80 | "weight_decay": 0.01, 81 | "grad_norm": 1.0, 82 | "warmup_steps": 3000, 83 | "sub_ctx_len": 1, 84 | "seed": 77, 85 | "no_fp16": false, 86 | "n_workers": 4, 87 | "pin_mem": true 88 | } -------------------------------------------------------------------------------- /config/FT_only_configs/train-how2qa-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoQA", 5 | "name": "how2qa_video_sub_train", 6 | "sub_txt_db": "/txt/how2_subtitles.db", 7 | "vfeat_db": "/video/how2", 8 | "query_txt_db": "/txt/how2qa_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoQA", 14 | "name": "how2qa_video_sub_val", 15 | "sub_txt_db": "/txt/how2_subtitles.db", 16 | "vfeat_db": "/video/how2", 17 | "query_txt_db": "/txt/how2qa_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/ST_FT_only/how2qa_default", 26 | "max_clip_len": 100, 27 | "max_txt_len": 120, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | 
"train_batch_size": 4, 31 | "val_batch_size": 10, 32 | "gradient_accumulation_steps": 2, 33 | "learning_rate": 5e-05, 34 | "valid_steps": 200, 35 | "save_steps": 200, 36 | "num_train_steps": 2000, 37 | "optim": "adamw", 38 | "betas": [ 39 | 0.9, 40 | 0.98 41 | ], 42 | "dropout": 0.1, 43 | "weight_decay": 0.01, 44 | "lr_mul": 10.0, 45 | "grad_norm": 1.0, 46 | "warmup_steps": 200, 47 | "lw_st_ed": 0.4, 48 | "sub_ctx_len": 0, 49 | "seed": 77, 50 | "no_fp16": false, 51 | "n_workers": 4, 52 | "no_pin_mem": false, 53 | "rank": 0 54 | } 55 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-how2qa_sub_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoQA", 5 | "name": "how2qa_sub_only_train", 6 | "sub_txt_db": "/txt/how2_subtitles.db", 7 | "vfeat_db": "/video/how2", 8 | "query_txt_db": "/txt/how2qa_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoQA", 14 | "name": "how2qa_sub_only_val", 15 | "sub_txt_db": "/txt/how2_subtitles.db", 16 | "vfeat_db": "/video/how2", 17 | "query_txt_db": "/txt/how2qa_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/multi_channel_ablation_sub_only/how2qa_default", 26 | "max_clip_len": 100, 27 | "max_txt_len": 120, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "train_batch_size": 4, 31 | "val_batch_size": 10, 32 | "gradient_accumulation_steps": 2, 33 | "learning_rate": 5e-05, 34 | "valid_steps": 200, 35 | "save_steps": 200, 36 | "num_train_steps": 2000, 37 | "optim": "adamw", 38 | "betas": [ 39 | 0.9, 40 | 0.98 41 | ], 42 | "dropout": 0.1, 43 | "weight_decay": 0.01, 44 | "lr_mul": 10.0, 45 | "grad_norm": 1.0, 46 | 
"warmup_steps": 200, 47 | "lw_st_ed": 0.4, 48 | "sub_ctx_len": 0, 49 | "seed": 77, 50 | "no_fp16": false, 51 | "n_workers": 4, 52 | "no_pin_mem": false, 53 | "rank": 0 54 | } 55 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-how2qa_video_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoQA", 5 | "name": "how2qa_video_only_train", 6 | "sub_txt_db": "/txt/how2_subtitles.db", 7 | "vfeat_db": "/video/how2", 8 | "query_txt_db": "/txt/how2qa_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoQA", 14 | "name": "how2qa_video_only_val", 15 | "sub_txt_db": "/txt/how2_subtitles.db", 16 | "vfeat_db": "/video/how2", 17 | "query_txt_db": "/txt/how2qa_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/multi_channel_ablation_video_only/how2qa_default", 26 | "max_clip_len": 100, 27 | "max_txt_len": 120, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "train_batch_size": 4, 31 | "val_batch_size": 10, 32 | "gradient_accumulation_steps": 2, 33 | "learning_rate": 5e-05, 34 | "valid_steps": 200, 35 | "save_steps": 200, 36 | "num_train_steps": 2000, 37 | "optim": "adamw", 38 | "betas": [ 39 | 0.9, 40 | 0.98 41 | ], 42 | "dropout": 0.1, 43 | "weight_decay": 0.01, 44 | "lr_mul": 10.0, 45 | "grad_norm": 1.0, 46 | "warmup_steps": 200, 47 | "lw_st_ed": 0.4, 48 | "sub_ctx_len": 0, 49 | "seed": 77, 50 | "no_fp16": false, 51 | "n_workers": 4, 52 | "no_pin_mem": false, 53 | "rank": 0 54 | } 55 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-how2r-8gpu.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vcmr", 5 | "name": "how2r_video_sub_train", 6 | "sub_txt_db": "/txt/how2_subtitles.db", 7 | "vfeat_db": "/video/how2", 8 | "query_txt_db": "/txt/how2r_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "vcmr", 14 | "name": "how2r_video_sub_val", 15 | "sub_txt_db": "/txt/how2_subtitles.db", 16 | "vfeat_db": "/video/how2", 17 | "query_txt_db": "/txt/how2r_val_1k.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/ST_FT_only/how2r_default", 26 | "eval_with_query_type": true, 27 | "max_before_nms": 200, 28 | "max_after_nms": 100, 29 | "distributed_eval": true, 30 | "nms_thd": -1, 31 | "q2c_alpha": 20, 32 | "max_vcmr_video": 100, 33 | "full_eval_tasks": [ 34 | "VCMR", 35 | "SVMR", 36 | "VR" 37 | ], 38 | "max_clip_len": 100, 39 | "max_txt_len": 60, 40 | "vfeat_version": "resnet_slowfast", 41 | "vfeat_interval": 1.5, 42 | "min_pred_l": 3, 43 | "max_pred_l": 20, 44 | "drop_svmr_prob": 0.9, 45 | "train_batch_size": 32, 46 | "val_batch_size": 20, 47 | "vcmr_eval_video_batch_size": 50, 48 | "vcmr_eval_batch_size": 80, 49 | "gradient_accumulation_steps":2, 50 | "learning_rate": 1e-04, 51 | "valid_steps": 200, 52 | "save_steps": 200, 53 | "num_train_steps": 6000, 54 | "optim": "adamw", 55 | "betas": [ 56 | 0.9, 57 | 0.98 58 | ], 59 | "dropout": 0.1, 60 | "weight_decay": 0.01, 61 | "grad_norm": 1.0, 62 | "warmup_steps": 600, 63 | "lw_neg_q": 8.0, 64 | "lw_neg_ctx": 8.0, 65 | "lw_st_ed": 0.01, 66 | "ranking_loss_type": "hinge", 67 | "margin": 0.1, 68 | "hard_pool_size": [ 69 | 20 70 | ], 71 | "hard_neg_weights": [ 72 | 10 73 | ], 74 | "hard_negative_start_step": [ 75 | 2000 76 | ], 77 | "train_span_start_step": 0, 78 | 
"sub_ctx_len": 0, 79 | "use_all_neg": true, 80 | "seed": 77, 81 | "no_fp16": false, 82 | "n_workers": 4, 83 | "no_pin_mem": false, 84 | "rank": 0 85 | } 86 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-how2r_sub_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vcmr", 5 | "name": "how2r_sub_only_train", 6 | "sub_txt_db": "/txt/how2_subtitles.db", 7 | "vfeat_db": "/video/how2", 8 | "query_txt_db": "/txt/how2r_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "vcmr", 14 | "name": "how2r_sub_only_val", 15 | "sub_txt_db": "/txt/how2_subtitles.db", 16 | "vfeat_db": "/video/how2", 17 | "query_txt_db": "/txt/how2r_val_1k.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/multi_channel_ablation_sub_only/how2r_default", 26 | "eval_with_query_type": true, 27 | "max_before_nms": 200, 28 | "max_after_nms": 100, 29 | "distributed_eval": true, 30 | "nms_thd": -1, 31 | "q2c_alpha": 20, 32 | "max_vcmr_video": 100, 33 | "full_eval_tasks": [ 34 | "VCMR", 35 | "SVMR", 36 | "VR" 37 | ], 38 | "max_clip_len": 100, 39 | "max_txt_len": 60, 40 | "vfeat_version": "resnet_slowfast", 41 | "vfeat_interval": 1.5, 42 | "min_pred_l": 3, 43 | "max_pred_l": 20, 44 | "drop_svmr_prob": 0.9, 45 | "train_batch_size": 32, 46 | "val_batch_size": 20, 47 | "vcmr_eval_video_batch_size": 50, 48 | "vcmr_eval_batch_size": 80, 49 | "gradient_accumulation_steps":2, 50 | "learning_rate": 1e-04, 51 | "valid_steps": 200, 52 | "save_steps": 200, 53 | "num_train_steps": 3000, 54 | "optim": "adamw", 55 | "betas": [ 56 | 0.9, 57 | 0.98 58 | ], 59 | "dropout": 0.1, 60 | "weight_decay": 0.01, 61 | "grad_norm": 1.0, 62 | "warmup_steps": 
300, 63 | "lw_neg_q": 8.0, 64 | "lw_neg_ctx": 8.0, 65 | "lw_st_ed": 0.01, 66 | "ranking_loss_type": "hinge", 67 | "margin": 0.1, 68 | "hard_pool_size": [ 69 | 20 70 | ], 71 | "hard_neg_weights": [ 72 | 10 73 | ], 74 | "hard_negative_start_step": [ 75 | 1000 76 | ], 77 | "train_span_start_step": 0, 78 | "sub_ctx_len": 0, 79 | "use_all_neg": true, 80 | "seed": 77, 81 | "no_fp16": false, 82 | "n_workers": 4, 83 | "no_pin_mem": false, 84 | "rank": 0 85 | } 86 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-how2r_video_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vcmr", 5 | "name": "how2r_video_only_train", 6 | "sub_txt_db": "/txt/how2_subtitles.db", 7 | "vfeat_db": "/video/how2", 8 | "query_txt_db": "/txt/how2r_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "vcmr", 14 | "name": "how2r_video_only_val", 15 | "sub_txt_db": "/txt/how2_subtitles.db", 16 | "vfeat_db": "/video/how2", 17 | "query_txt_db": "/txt/how2r_val_1k.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/how2r_default", 26 | "eval_with_query_type": true, 27 | "max_before_nms": 200, 28 | "max_after_nms": 100, 29 | "distributed_eval": true, 30 | "nms_thd": -1, 31 | "q2c_alpha": 20, 32 | "max_vcmr_video": 100, 33 | "full_eval_tasks": [ 34 | "VCMR", 35 | "SVMR", 36 | "VR" 37 | ], 38 | "max_clip_len": 100, 39 | "max_txt_len": 60, 40 | "vfeat_version": "resnet_slowfast", 41 | "vfeat_interval": 1.5, 42 | "min_pred_l": 3, 43 | "max_pred_l": 20, 44 | "drop_svmr_prob": 0.9, 45 | "train_batch_size": 32, 46 | "val_batch_size": 20, 47 | "vcmr_eval_video_batch_size": 50, 48 | "vcmr_eval_batch_size": 80, 49 | 
"gradient_accumulation_steps":2, 50 | "learning_rate": 1e-04, 51 | "valid_steps": 200, 52 | "save_steps": 200, 53 | "num_train_steps": 6000, 54 | "optim": "adamw", 55 | "betas": [ 56 | 0.9, 57 | 0.98 58 | ], 59 | "dropout": 0.1, 60 | "weight_decay": 0.01, 61 | "grad_norm": 1.0, 62 | "warmup_steps": 600, 63 | "lw_neg_q": 8.0, 64 | "lw_neg_ctx": 8.0, 65 | "lw_st_ed": 0.01, 66 | "ranking_loss_type": "hinge", 67 | "margin": 0.1, 68 | "hard_pool_size": [ 69 | 20 70 | ], 71 | "hard_neg_weights": [ 72 | 10 73 | ], 74 | "hard_negative_start_step": [ 75 | 2000 76 | ], 77 | "train_span_start_step": 0, 78 | "sub_ctx_len": 0, 79 | "use_all_neg": true, 80 | "seed": 77, 81 | "no_fp16": false, 82 | "n_workers": 4, 83 | "no_pin_mem": false, 84 | "rank": 0 85 | } 86 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-qa-multitask-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoQA", 5 | "name": "tvqa_video_sub_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | "query_txt_db": "/txt/tvqa_train.db", 9 | "ratio": 5 10 | }, 11 | { 12 | "task": "videoQA", 13 | "name": "how2qa_video_sub_train", 14 | "sub_txt_db": "/txt/how2_subtitles.db", 15 | "vfeat_db": "/video/how2", 16 | "query_txt_db": "/txt/how2qa_train.db", 17 | "ratio": 1 18 | }, 19 | { 20 | "task": "violin", 21 | "name": "violin_video_sub_train", 22 | "sub_txt_db": "/txt/violin_subtitles.db", 23 | "vfeat_db": "/video/violin", 24 | "query_txt_db": "/txt/violin_train.db", 25 | "ratio": 3 26 | }, 27 | { 28 | "task": "videoQA", 29 | "name": "vlep_video_sub_train", 30 | "sub_txt_db": "/txt/vlep_subtitles.db/", 31 | "vfeat_db": "/video/vlep", 32 | "query_txt_db": "/txt/vlep_train.db", 33 | "ratio": 1 34 | } 35 | ], 36 | "val_datasets": [ 37 | { 38 | "task": "videoQA", 39 | "name": "tvqa_video_sub_val", 40 | "sub_txt_db": "/txt/tv_subtitles.db", 41 
| "vfeat_db": "/video/tv", 42 | "query_txt_db": "/txt/tvqa_val.db" 43 | }, 44 | { 45 | "task": "videoQA", 46 | "name": "how2qa_video_sub_val", 47 | "sub_txt_db": "/txt/how2_subtitles.db", 48 | "vfeat_db": "/video/how2", 49 | "query_txt_db": "/txt/how2qa_val.db" 50 | }, 51 | { 52 | "task": "violin", 53 | "name": "violin_video_sub_val", 54 | "sub_txt_db": "/txt/violin_subtitles.db", 55 | "vfeat_db": "/video/violin", 56 | "query_txt_db": "/txt/violin_val.db" 57 | }, 58 | { 59 | "task": "videoQA", 60 | "name": "vlep_video_sub_dev", 61 | "sub_txt_db": "/txt/vlep_subtitles.db/", 62 | "vfeat_db": "/video/vlep", 63 | "query_txt_db": "/txt/vlep_dev.db" 64 | } 65 | ], 66 | "compressed_db": false, 67 | "model_config": "config/model_config/hero_finetune.json", 68 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 69 | "load_partial_pretrained": true, 70 | "skip_layer_loading": true, 71 | "output_dir": "/storage/MT_FT_only/qa_multi-task_default", 72 | "max_clip_len": 100, 73 | "max_txt_len": 120, 74 | "vfeat_version": "resnet_slowfast", 75 | "vfeat_interval": 1.5, 76 | "train_batch_size": 4, 77 | "val_batch_size": 10, 78 | "gradient_accumulation_steps": 2, 79 | "learning_rate": 5e-05, 80 | "valid_steps": 200, 81 | "save_steps": 200, 82 | "num_train_steps": 20000, 83 | "optim": "adamw", 84 | "betas": [ 85 | 0.9, 86 | 0.98 87 | ], 88 | "dropout": 0.1, 89 | "weight_decay": 0.01, 90 | "lr_mul": 10.0, 91 | "grad_norm": 1.0, 92 | "warmup_steps": 2000, 93 | "lw_st_ed": 0.4, 94 | "sub_ctx_len": 0, 95 | "seed": 77, 96 | "no_fp16": false, 97 | "n_workers": 4, 98 | "no_pin_mem": false, 99 | "rank": 0 100 | } -------------------------------------------------------------------------------- /config/FT_only_configs/train-retrieval-multitask-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vcmr", 5 | "name": "tvr_video_sub_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | 
"query_txt_db": "/txt/tvr_train.db", 9 | "batch_size": 32, 10 | "ratio": 2 11 | }, 12 | { 13 | "task": "vcmr", 14 | "name": "how2r_video_sub_train", 15 | "sub_txt_db": "/txt/how2_subtitles.db", 16 | "vfeat_db": "/video/how2", 17 | "query_txt_db": "/txt/how2r_train.db", 18 | "batch_size": 32, 19 | "ratio": 1 20 | }, 21 | { 22 | "task": "vr", 23 | "name": "vatex_en_r_video_sub_train", 24 | "sub_txt_db": "/txt/vatex_subtitles.db/", 25 | "vfeat_db": "/video/vatex", 26 | "query_txt_db": "/txt/vatex_en_r_train.db", 27 | "batch_size": 64, 28 | "ratio": 3 29 | }, 30 | { 31 | "task": "vr", 32 | "name": "yc2r_video_sub_train", 33 | "sub_txt_db": "/txt/yc2_subtitles.db/", 34 | "vfeat_db": "/video/yc2", 35 | "query_txt_db": "/txt/yc2r_train.db", 36 | "batch_size": 48, 37 | "ratio": 1 38 | } 39 | ], 40 | "val_datasets": [ 41 | { 42 | "task": "vcmr", 43 | "name": "tvr_video_sub_val", 44 | "sub_txt_db": "/txt/tv_subtitles.db", 45 | "vfeat_db": "/video/tv", 46 | "query_txt_db": "/txt/tvr_val.db" 47 | }, 48 | { 49 | "task": "vcmr", 50 | "name": "how2r_video_sub_val", 51 | "sub_txt_db": "/txt/how2_subtitles.db", 52 | "vfeat_db": "/video/how2", 53 | "query_txt_db": "/txt/how2r_val_1k.db" 54 | }, 55 | { 56 | "task": "vr", 57 | "name": "vatex_en_r_video_sub_val", 58 | "sub_txt_db": "/txt/vatex_subtitles.db/", 59 | "vfeat_db": "/video/vatex", 60 | "query_txt_db": "/txt/vatex_en_r_val.db" 61 | }, 62 | { 63 | "task": "vr", 64 | "name": "yc2r_video_sub_val", 65 | "sub_txt_db": "/txt/yc2_subtitles.db/", 66 | "vfeat_db": "/video/yc2", 67 | "query_txt_db": "/txt/yc2r_val.db" 68 | } 69 | ], 70 | "compressed_db": false, 71 | "model_config": "config/model_config/hero_finetune.json", 72 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 73 | "load_partial_pretrained": true, 74 | "skip_layer_loading": true, 75 | "output_dir": "/storage/MT_FT_only/retrieval_multi-task_default", 76 | "eval_with_query_type": true, 77 | "max_before_nms": 200, 78 | "max_after_nms": 100, 79 | "distributed_eval": true, 80 
| "nms_thd": -1, 81 | "q2c_alpha": 20, 82 | "max_vcmr_video": 100, 83 | "full_eval_tasks": [ 84 | "VCMR", 85 | "SVMR", 86 | "VR" 87 | ], 88 | "max_clip_len": 100, 89 | "max_txt_len": 60, 90 | "vfeat_version": "resnet_slowfast", 91 | "vfeat_interval": 1.5, 92 | "min_pred_l": 2, 93 | "max_pred_l": 16, 94 | "drop_svmr_prob": 0.8, 95 | "train_batch_size": 32, 96 | "val_batch_size": 20, 97 | "vcmr_eval_video_batch_size": 50, 98 | "vcmr_eval_batch_size": 80, 99 | "vr_eval_video_batch_size": 50, 100 | "vr_eval_batch_size": 80, 101 | "gradient_accumulation_steps":2, 102 | "learning_rate": 1e-04, 103 | "valid_steps": 400, 104 | "save_steps": 400, 105 | "num_train_steps": 20000, 106 | "optim": "adamw", 107 | "betas": [ 108 | 0.9, 109 | 0.98 110 | ], 111 | "dropout": 0.1, 112 | "weight_decay": 0.01, 113 | "grad_norm": 1.0, 114 | "warmup_steps": 2000, 115 | "lw_neg_q": 8.0, 116 | "lw_neg_ctx": 8.0, 117 | "lw_st_ed": 0.01, 118 | "ranking_loss_type": "hinge", 119 | "margin": 0.1, 120 | "hard_pool_size": [ 121 | 20 122 | ], 123 | "hard_neg_weights": [ 124 | 10 125 | ], 126 | "hard_negative_start_step": [ 127 | 8000 128 | ], 129 | "train_span_start_step": 0, 130 | "sub_ctx_len": 0, 131 | "use_all_neg": true, 132 | "seed": 77, 133 | "no_fp16": false, 134 | "n_workers": 4, 135 | "no_pin_mem": false, 136 | "rank": 0 137 | } 138 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-tv_domain-multitask-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vcmr", 5 | "name": "tvr_video_sub_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | "query_txt_db": "/txt/tvr_train.db", 9 | "batch_size": 32, 10 | "ratio": 5 11 | }, 12 | { 13 | "task": "videoCap", 14 | "name": "tvc_video_sub_train", 15 | "sub_txt_db": "/txt/tv_subtitles.db", 16 | "vfeat_db": "/video/tv", 17 | "cap_txt_db": ["/txt/tvc_train.db"], 18 | 
"batch_size": 4, 19 | "ratio": 5 20 | }, 21 | { 22 | "task": "videoQA", 23 | "name": "tvqa_video_sub_train", 24 | "sub_txt_db": "/txt/tv_subtitles.db", 25 | "vfeat_db": "/video/tv", 26 | "query_txt_db": "/txt/tvqa_train.db", 27 | "batch_size": 4, 28 | "ratio": 5 29 | }, 30 | { 31 | "task": "violin", 32 | "name": "violin_video_sub_train", 33 | "sub_txt_db": "/txt/violin_subtitles.db", 34 | "vfeat_db": "/video/violin", 35 | "query_txt_db": "/txt/violin_train.db", 36 | "batch_size": 4, 37 | "ratio": 3 38 | }, 39 | { 40 | "task": "videoQA", 41 | "name": "vlep_video_sub_train", 42 | "sub_txt_db": "/txt/vlep_subtitles.db/", 43 | "vfeat_db": "/video/vlep", 44 | "query_txt_db": "/txt/vlep_train.db", 45 | "batch_size": 4, 46 | "ratio": 1 47 | } 48 | ], 49 | "val_datasets": [ 50 | { 51 | "task": "vcmr", 52 | "name": "tvr_video_sub_val", 53 | "sub_txt_db": "/txt/tv_subtitles.db", 54 | "vfeat_db": "/video/tv", 55 | "batch_size": 20, 56 | "query_txt_db": "/txt/tvr_val.db" 57 | }, 58 | { 59 | "task": "videoCap", 60 | "name": "tvc_video_sub_val", 61 | "sub_txt_db": "/txt/tv_subtitles.db", 62 | "vfeat_db": "/video/tv", 63 | "batch_size": 8, 64 | "gt_anno": "/txt/tvc_val_release.jsonl" 65 | }, 66 | { 67 | "task": "videoQA", 68 | "name": "tvqa_video_sub_val", 69 | "sub_txt_db": "/txt/tv_subtitles.db", 70 | "vfeat_db": "/video/tv", 71 | "batch_size": 10, 72 | "query_txt_db": "/txt/tvqa_val.db" 73 | }, 74 | { 75 | "task": "violin", 76 | "name": "violin_video_sub_val", 77 | "sub_txt_db": "/txt/violin_subtitles.db", 78 | "vfeat_db": "/video/violin", 79 | "batch_size": 10, 80 | "query_txt_db": "/txt/violin_val.db" 81 | }, 82 | { 83 | "task": "videoQA", 84 | "name": "vlep_video_sub_dev", 85 | "sub_txt_db": "/txt/vlep_subtitles.db/", 86 | "vfeat_db": "/video/vlep", 87 | "batch_size": 10, 88 | "query_txt_db": "/txt/vlep_dev.db" 89 | } 90 | ], 91 | "compressed_db": false, 92 | "model_config": "config/model_config/hero_finetune.json", 93 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 94 | 
"load_partial_pretrained": true, 95 | "skip_layer_loading": true, 96 | "output_dir": "/storage/MT_FT_only/tv-domain_multi-task_default", 97 | "eval_with_query_type": true, 98 | "max_before_nms": 200, 99 | "max_after_nms": 100, 100 | "distributed_eval": true, 101 | "nms_thd": -1, 102 | "q2c_alpha": 20, 103 | "max_vcmr_video": 100, 104 | "full_eval_tasks": [ 105 | "VCMR", 106 | "SVMR", 107 | "VR" 108 | ], 109 | "max_clip_len": 100, 110 | "max_txt_len": 60, 111 | "vfeat_version": "resnet_slowfast", 112 | "vfeat_interval": 1.5, 113 | "min_pred_l": 2, 114 | "max_pred_l": 16, 115 | "drop_svmr_prob": 0.8, 116 | "train_batch_size": 32, 117 | "val_batch_size": 20, 118 | "vcmr_eval_video_batch_size": 50, 119 | "vcmr_eval_batch_size": 80, 120 | "vr_eval_video_batch_size": 50, 121 | "vr_eval_batch_size": 80, 122 | "gradient_accumulation_steps":2, 123 | "learning_rate": 1e-04, 124 | "valid_steps": 400, 125 | "save_steps": 400, 126 | "num_train_steps": 30000, 127 | "optim": "adamw", 128 | "betas": [ 129 | 0.9, 130 | 0.98 131 | ], 132 | "dropout": 0.1, 133 | "weight_decay": 0.01, 134 | "grad_norm": 1.0, 135 | "warmup_steps": 3000, 136 | "lw_neg_q": 8.0, 137 | "lw_neg_ctx": 8.0, 138 | "lw_st_ed": 0.01, 139 | "ranking_loss_type": "hinge", 140 | "margin": 0.1, 141 | "hard_pool_size": [ 142 | 20 143 | ], 144 | "hard_neg_weights": [ 145 | 10 146 | ], 147 | "hard_negative_start_step": [ 148 | 8000 149 | ], 150 | "train_span_start_step": 0, 151 | "sub_ctx_len": 0, 152 | "use_all_neg": true, 153 | "seed": 77, 154 | "no_fp16": false, 155 | "n_workers": 1, 156 | "no_pin_mem": false, 157 | "rank": 0, 158 | "max_cap_per_vid": -1, 159 | "max_gen_step": 30, 160 | "lr_mul": 10.0, 161 | "lsr": 0.1, 162 | "qa_lw_st_ed": 0.4 163 | } 164 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-tvc-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": 
"videoCap", 5 | "name": "tvc_video_sub_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | "cap_txt_db": ["/txt/tvc_train.db"] 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoCap", 14 | "name": "tvc_video_sub_val", 15 | "sub_txt_db": "/txt/tv_subtitles.db", 16 | "vfeat_db": "/video/tv", 17 | "gt_anno": "/txt/tvc_val_release.jsonl" 18 | } 19 | ], 20 | "model_config": "/src/config/model_config/hero_videoCap.json", 21 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 22 | "load_partial_pretrained": true, 23 | "skip_layer_loading": true, 24 | "output_dir": "/storage/ST_FT_only/tvc_default", 25 | "max_clip_len": 100, 26 | "max_txt_len": 60, 27 | "max_cap_per_vid": -1, 28 | "max_gen_step": 30, 29 | "vfeat_version": "resnet_slowfast", 30 | "vfeat_interval": 1.5, 31 | "compressed_db": false, 32 | "train_batch_size": 4, 33 | "val_batch_size": 8, 34 | "gradient_accumulation_steps": 1, 35 | "learning_rate": 1e-4, 36 | "lr_mul": 10.0, 37 | "valid_steps": 500, 38 | "num_train_steps": 7000, 39 | "optim": "adamw", 40 | "betas": [0.9, 0.98], 41 | "lsr": 0.1, 42 | "dropout": 0.1, 43 | "weight_decay": 0.01, 44 | "grad_norm": 1.0, 45 | "warmup_steps": 700, 46 | "sub_ctx_len": 1, 47 | "seed": 77, 48 | "no_fp16": false, 49 | "n_workers": 4, 50 | "pin_mem": true 51 | } 52 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-tvc_sub_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoCap", 5 | "name": "tvc_sub_only_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | "cap_txt_db": ["/txt/tvc_train.db"] 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoCap", 14 | "name": "tvc_sub_only_val", 15 | "sub_txt_db": "/txt/tv_subtitles.db", 16 | "vfeat_db": "/video/tv", 17 | "gt_anno": "/txt/tvc_val_release.jsonl" 18 | } 19 | ], 20 | "model_config": 
"/src/config/model_config/hero_videoCap.json", 21 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 22 | "load_partial_pretrained": true, 23 | "skip_layer_loading": true, 24 | "output_dir": "/storage/multi_channel_ablation_sub_only/tvc_default", 25 | "max_clip_len": 100, 26 | "max_txt_len": 60, 27 | "max_cap_per_vid": -1, 28 | "max_gen_step": 30, 29 | "vfeat_version": "resnet_slowfast", 30 | "vfeat_interval": 1.5, 31 | "compressed_db": false, 32 | "train_batch_size": 4, 33 | "val_batch_size": 8, 34 | "gradient_accumulation_steps": 1, 35 | "learning_rate": 1e-4, 36 | "lr_mul": 10.0, 37 | "valid_steps": 500, 38 | "num_train_steps": 7000, 39 | "optim": "adamw", 40 | "betas": [0.9, 0.98], 41 | "lsr": 0.1, 42 | "dropout": 0.1, 43 | "weight_decay": 0.01, 44 | "grad_norm": 1.0, 45 | "warmup_steps": 700, 46 | "sub_ctx_len": 1, 47 | "seed": 77, 48 | "no_fp16": false, 49 | "n_workers": 4, 50 | "pin_mem": true 51 | } 52 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-tvc_video_only_8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoCap", 5 | "name": "tvc_video_only_train", 6 | "sub_txt_db": null, 7 | "vfeat_db": "/video/tv", 8 | "cap_txt_db": ["/txt/tvc_train.db"] 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoCap", 14 | "name": "tvc_video_only_val", 15 | "sub_txt_db": null, 16 | "vfeat_db": "/video/tv", 17 | "gt_anno": "/txt/tvc_val_release.jsonl" 18 | } 19 | ], 20 | "model_config": "/src/config/model_config/hero_videoCap.json", 21 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 22 | "load_partial_pretrained": true, 23 | "skip_layer_loading": true, 24 | "output_dir": "/storage/multi_channel_ablation_video_only/tvc_default", 25 | "max_clip_len": 100, 26 | "max_txt_len": 60, 27 | "max_cap_per_vid": -1, 28 | "max_gen_step": 30, 29 | "vfeat_version": "resnet_slowfast", 30 | "vfeat_interval": 1.5, 31 | 
"compressed_db": false, 32 | "train_batch_size": 4, 33 | "val_batch_size": 8, 34 | "gradient_accumulation_steps": 1, 35 | "learning_rate": 1e-4, 36 | "lr_mul": 10.0, 37 | "valid_steps": 500, 38 | "num_train_steps": 7000, 39 | "optim": "adamw", 40 | "betas": [0.9, 0.98], 41 | "lsr": 0.1, 42 | "dropout": 0.1, 43 | "weight_decay": 0.01, 44 | "grad_norm": 1.0, 45 | "warmup_steps": 700, 46 | "sub_ctx_len": 1, 47 | "seed": 77, 48 | "no_fp16": false, 49 | "n_workers": 4, 50 | "pin_mem": true 51 | } 52 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-tvqa-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoQA", 5 | "name": "tvqa_video_sub_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | "query_txt_db": "/txt/tvqa_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoQA", 14 | "name": "tvqa_video_sub_val", 15 | "sub_txt_db": "/txt/tv_subtitles.db", 16 | "vfeat_db": "/video/tv", 17 | "query_txt_db": "/txt/tvqa_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/ST_FT_only/tvqa_default", 26 | "max_clip_len": 100, 27 | "max_txt_len": 120, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "train_batch_size": 4, 31 | "val_batch_size": 10, 32 | "gradient_accumulation_steps": 2, 33 | "learning_rate": 5e-05, 34 | "valid_steps": 200, 35 | "save_steps": 200, 36 | "num_train_steps": 10000, 37 | "optim": "adamw", 38 | "betas": [ 39 | 0.9, 40 | 0.98 41 | ], 42 | "dropout": 0.1, 43 | "weight_decay": 0.01, 44 | "lr_mul": 10.0, 45 | "grad_norm": 1.0, 46 | "warmup_steps": 1000, 47 | "lw_st_ed": 0.4, 48 | "sub_ctx_len": 0, 49 | "seed": 77, 50 | 
"no_fp16": false, 51 | "n_workers": 4, 52 | "no_pin_mem": false, 53 | "rank": 0 54 | } 55 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-tvqa_sub_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoQA", 5 | "name": "tvqa_sub_only_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | "query_txt_db": "/txt/tvqa_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoQA", 14 | "name": "tvqa_sub_only_val", 15 | "sub_txt_db": "/txt/tv_subtitles.db", 16 | "vfeat_db": "/video/tv", 17 | "query_txt_db": "/txt/tvqa_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/multi_channel_ablation_sub_only/tvqa_default", 26 | "max_clip_len": 100, 27 | "max_txt_len": 120, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "train_batch_size": 4, 31 | "val_batch_size": 10, 32 | "gradient_accumulation_steps": 2, 33 | "learning_rate": 5e-05, 34 | "valid_steps": 200, 35 | "save_steps": 200, 36 | "num_train_steps": 10000, 37 | "optim": "adamw", 38 | "betas": [ 39 | 0.9, 40 | 0.98 41 | ], 42 | "dropout": 0.1, 43 | "weight_decay": 0.01, 44 | "lr_mul": 10.0, 45 | "grad_norm": 1.0, 46 | "warmup_steps": 1000, 47 | "lw_st_ed": 0.4, 48 | "sub_ctx_len": 0, 49 | "seed": 77, 50 | "no_fp16": false, 51 | "n_workers": 4, 52 | "no_pin_mem": false, 53 | "rank": 0 54 | } 55 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-tvqa_video_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoQA", 5 | "name": 
"tvqa_video_only_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | "query_txt_db": "/txt/tvqa_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoQA", 14 | "name": "tvqa_video_only_val", 15 | "sub_txt_db": "/txt/tv_subtitles.db", 16 | "vfeat_db": "/video/tv", 17 | "query_txt_db": "/txt/tvqa_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/multi_channel_ablation_video_only/tvqa_default", 26 | "max_clip_len": 100, 27 | "max_txt_len": 120, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "train_batch_size": 4, 31 | "val_batch_size": 10, 32 | "gradient_accumulation_steps": 2, 33 | "learning_rate": 5e-05, 34 | "valid_steps": 200, 35 | "save_steps": 200, 36 | "num_train_steps": 10000, 37 | "optim": "adamw", 38 | "betas": [ 39 | 0.9, 40 | 0.98 41 | ], 42 | "dropout": 0.1, 43 | "weight_decay": 0.01, 44 | "lr_mul": 10.0, 45 | "grad_norm": 1.0, 46 | "warmup_steps": 1000, 47 | "lw_st_ed": 0.4, 48 | "sub_ctx_len": 0, 49 | "seed": 77, 50 | "no_fp16": false, 51 | "n_workers": 4, 52 | "no_pin_mem": false, 53 | "rank": 0 54 | } 55 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-tvr-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vcmr", 5 | "name": "tvr_video_sub_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | "query_txt_db": "/txt/tvr_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "vcmr", 14 | "name": "tvr_video_sub_val", 15 | "sub_txt_db": "/txt/tv_subtitles.db", 16 | "vfeat_db": "/video/tv", 17 | "query_txt_db": "/txt/tvr_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | 
"model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/ST_FT_only/tvr_default", 26 | "eval_with_query_type": true, 27 | "max_before_nms": 200, 28 | "max_after_nms": 100, 29 | "distributed_eval": true, 30 | "nms_thd": -1, 31 | "q2c_alpha": 20, 32 | "max_vcmr_video": 100, 33 | "full_eval_tasks": [ 34 | "VCMR", 35 | "SVMR", 36 | "VR" 37 | ], 38 | "max_clip_len": 100, 39 | "max_txt_len": 60, 40 | "vfeat_version": "resnet_slowfast", 41 | "vfeat_interval": 1.5, 42 | "min_pred_l": 2, 43 | "max_pred_l": 16, 44 | "drop_svmr_prob": 0.8, 45 | "train_batch_size": 32, 46 | "val_batch_size": 20, 47 | "vcmr_eval_video_batch_size": 50, 48 | "vcmr_eval_batch_size": 80, 49 | "gradient_accumulation_steps":2, 50 | "learning_rate": 1e-04, 51 | "valid_steps": 400, 52 | "save_steps": 400, 53 | "num_train_steps": 10000, 54 | "optim": "adamw", 55 | "betas": [ 56 | 0.9, 57 | 0.98 58 | ], 59 | "dropout": 0.1, 60 | "weight_decay": 0.01, 61 | "grad_norm": 1.0, 62 | "warmup_steps": 1000, 63 | "lw_neg_q": 8.0, 64 | "lw_neg_ctx": 8.0, 65 | "lw_st_ed": 0.01, 66 | "ranking_loss_type": "hinge", 67 | "margin": 0.1, 68 | "hard_pool_size": [ 69 | 20 70 | ], 71 | "hard_neg_weights": [ 72 | 10 73 | ], 74 | "hard_negative_start_step": [ 75 | 4000 76 | ], 77 | "train_span_start_step": 0, 78 | "sub_ctx_len": 0, 79 | "use_all_neg": true, 80 | "seed": 77, 81 | "no_fp16": false, 82 | "n_workers": 4, 83 | "no_pin_mem": false, 84 | "rank": 0 85 | } 86 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-tvr_sub_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vcmr", 5 | "name": "tvr_sub_only_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | "query_txt_db": 
"/txt/tvr_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "vcmr", 14 | "name": "tvr_sub_only_val", 15 | "sub_txt_db": "/txt/tv_subtitles.db", 16 | "vfeat_db": "/video/tv", 17 | "query_txt_db": "/txt/tvr_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/multi_channel_ablation_sub_only/tvr_default", 26 | "eval_with_query_type": true, 27 | "max_before_nms": 200, 28 | "max_after_nms": 100, 29 | "distributed_eval": true, 30 | "nms_thd": -1, 31 | "q2c_alpha": 20, 32 | "max_vcmr_video": 100, 33 | "full_eval_tasks": [ 34 | "VCMR", 35 | "SVMR", 36 | "VR" 37 | ], 38 | "max_clip_len": 100, 39 | "max_txt_len": 60, 40 | "vfeat_version": "resnet_slowfast", 41 | "vfeat_interval": 1.5, 42 | "min_pred_l": 2, 43 | "max_pred_l": 16, 44 | "drop_svmr_prob": 0.8, 45 | "train_batch_size": 32, 46 | "val_batch_size": 20, 47 | "vcmr_eval_video_batch_size": 50, 48 | "vcmr_eval_batch_size": 80, 49 | "gradient_accumulation_steps":2, 50 | "learning_rate": 1e-04, 51 | "valid_steps": 400, 52 | "save_steps": 400, 53 | "num_train_steps": 10000, 54 | "optim": "adamw", 55 | "betas": [ 56 | 0.9, 57 | 0.98 58 | ], 59 | "dropout": 0.1, 60 | "weight_decay": 0.01, 61 | "grad_norm": 1.0, 62 | "warmup_steps": 1000, 63 | "lw_neg_q": 8.0, 64 | "lw_neg_ctx": 8.0, 65 | "lw_st_ed": 0.01, 66 | "ranking_loss_type": "hinge", 67 | "margin": 0.1, 68 | "hard_pool_size": [ 69 | 20 70 | ], 71 | "hard_neg_weights": [ 72 | 10 73 | ], 74 | "hard_negative_start_step": [ 75 | 4000 76 | ], 77 | "train_span_start_step": 0, 78 | "sub_ctx_len": 0, 79 | "use_all_neg": true, 80 | "seed": 77, 81 | "no_fp16": false, 82 | "n_workers": 4, 83 | "no_pin_mem": false, 84 | "rank": 0 85 | } 86 | -------------------------------------------------------------------------------- 
/config/FT_only_configs/train-tvr_video_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vcmr", 5 | "name": "tvr_video_only_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | "query_txt_db": "/txt/tvr_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "vcmr", 14 | "name": "tvr_video_only_val", 15 | "sub_txt_db": "/txt/tv_subtitles.db", 16 | "vfeat_db": "/video/tv", 17 | "query_txt_db": "/txt/tvr_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/multi_channel_ablation_video_only/tvr_default", 26 | "eval_with_query_type": true, 27 | "max_before_nms": 200, 28 | "max_after_nms": 100, 29 | "distributed_eval": true, 30 | "nms_thd": -1, 31 | "q2c_alpha": 20, 32 | "max_vcmr_video": 100, 33 | "full_eval_tasks": [ 34 | "VCMR", 35 | "SVMR", 36 | "VR" 37 | ], 38 | "max_clip_len": 100, 39 | "max_txt_len": 60, 40 | "vfeat_version": "resnet_slowfast", 41 | "vfeat_interval": 1.5, 42 | "min_pred_l": 2, 43 | "max_pred_l": 16, 44 | "drop_svmr_prob": 0.8, 45 | "train_batch_size": 32, 46 | "val_batch_size": 20, 47 | "vcmr_eval_video_batch_size": 50, 48 | "vcmr_eval_batch_size": 80, 49 | "gradient_accumulation_steps":2, 50 | "learning_rate": 1e-04, 51 | "valid_steps": 400, 52 | "save_steps": 400, 53 | "num_train_steps": 10000, 54 | "optim": "adamw", 55 | "betas": [ 56 | 0.9, 57 | 0.98 58 | ], 59 | "dropout": 0.1, 60 | "weight_decay": 0.01, 61 | "grad_norm": 1.0, 62 | "warmup_steps": 1000, 63 | "lw_neg_q": 8.0, 64 | "lw_neg_ctx": 8.0, 65 | "lw_st_ed": 0.01, 66 | "ranking_loss_type": "hinge", 67 | "margin": 0.1, 68 | "hard_pool_size": [ 69 | 20 70 | ], 71 | "hard_neg_weights": [ 72 | 10 73 | ], 74 | "hard_negative_start_step": [ 75 
| 4000 76 | ], 77 | "train_span_start_step": 0, 78 | "sub_ctx_len": 0, 79 | "use_all_neg": true, 80 | "seed": 77, 81 | "no_fp16": false, 82 | "n_workers": 4, 83 | "no_pin_mem": false, 84 | "rank": 0 85 | } 86 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-vatex_en_c-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoCap", 5 | "name": "vatex_en_c_video_sub_train", 6 | "sub_txt_db": "/txt/vatex_subtitles.db", 7 | "vfeat_db": "/video/vatex", 8 | "cap_txt_db": ["/txt/vatex_en_r_train.db", "/txt/vatex_en_r_val.db"] 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoCap", 14 | "name": "vatex_en_c_video_sub_val", 15 | "sub_txt_db": "/txt/vatex_subtitles.db", 16 | "vfeat_db": "/video/vatex", 17 | "gt_anno": "/txt/vatex_en_c_test_public_release.jsonl" 18 | } 19 | ], 20 | "model_config": "config/model_config/hero_videoCap.json", 21 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 22 | "load_partial_pretrained": true, 23 | "skip_layer_loading": true, 24 | "output_dir": "/storage/ST_FT_only/vatex_en_c_default", 25 | "max_clip_len": 100, 26 | "max_txt_len": 60, 27 | "max_gen_step": 30, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "compressed_db": false, 31 | "train_batch_size": 128, 32 | "val_batch_size": 128, 33 | "gradient_accumulation_steps": 1, 34 | "learning_rate": 1e-4, 35 | "lr_mul": 10.0, 36 | "valid_steps": 500, 37 | "num_train_steps": 7000, 38 | "optim": "adamw", 39 | "betas": [0.9, 0.98], 40 | "lsr": 0.1, 41 | "dropout": 0.1, 42 | "weight_decay": 0.01, 43 | "grad_norm": 1.0, 44 | "warmup_steps": 700, 45 | "sub_ctx_len": 1, 46 | "seed": 77, 47 | "no_fp16": false, 48 | "n_workers": 4, 49 | "pin_mem": true 50 | } 51 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-vatex_en_c_sub_only-8gpu.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoCap", 5 | "name": "vatex_en_c_sub_only_train", 6 | "sub_txt_db": "/txt/vatex_subtitles.db", 7 | "vfeat_db": "/video/vatex", 8 | "cap_txt_db": ["/txt/vatex_en_r_train.db", "/txt/vatex_en_r_val.db"] 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoCap", 14 | "name": "vatex_en_c_sub_only_val", 15 | "sub_txt_db": "/txt/vatex_subtitles.db", 16 | "vfeat_db": "/video/vatex", 17 | "gt_anno": "/txt/vatex_en_c_test_public_release.jsonl" 18 | } 19 | ], 20 | "model_config": "config/model_config/hero_videoCap.json", 21 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 22 | "load_partial_pretrained": true, 23 | "skip_layer_loading": true, 24 | "output_dir": "/storage/multi_channel_ablation_sub_only/vatex_en_c_default", 25 | "max_clip_len": 100, 26 | "max_txt_len": 60, 27 | "max_gen_step": 30, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "compressed_db": false, 31 | "train_batch_size": 128, 32 | "val_batch_size": 128, 33 | "gradient_accumulation_steps": 1, 34 | "learning_rate": 1e-4, 35 | "lr_mul": 10.0, 36 | "valid_steps": 500, 37 | "num_train_steps": 7000, 38 | "optim": "adamw", 39 | "betas": [0.9, 0.98], 40 | "lsr": 0.1, 41 | "dropout": 0.1, 42 | "weight_decay": 0.01, 43 | "grad_norm": 1.0, 44 | "warmup_steps": 700, 45 | "sub_ctx_len": 1, 46 | "seed": 77, 47 | "no_fp16": false, 48 | "n_workers": 4, 49 | "pin_mem": true 50 | } 51 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-vatex_en_c_video_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoCap", 5 | "name": "vatex_en_c_video_only_train", 6 | "sub_txt_db": null, 7 | "vfeat_db": "/video/vatex", 8 | "cap_txt_db": ["/txt/vatex_en_r_train.db", "/txt/vatex_en_r_val.db"] 9 | } 10 | ], 11 | 
"val_datasets": [ 12 | { 13 | "task": "videoCap", 14 | "name": "vatex_en_c_video_only_val", 15 | "sub_txt_db": null, 16 | "vfeat_db": "/video/vatex", 17 | "gt_anno": "/txt/vatex_en_c_test_public_release.jsonl" 18 | } 19 | ], 20 | "model_config": "config/model_config/hero_videoCap.json", 21 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 22 | "load_partial_pretrained": true, 23 | "skip_layer_loading": true, 24 | "output_dir": "/storage/multi_channel_ablation_video_only/vatex_en_c_default", 25 | "max_clip_len": 100, 26 | "max_txt_len": 60, 27 | "max_gen_step": 30, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "compressed_db": false, 31 | "train_batch_size": 128, 32 | "val_batch_size": 128, 33 | "gradient_accumulation_steps": 1, 34 | "learning_rate": 1e-4, 35 | "lr_mul": 10.0, 36 | "valid_steps": 500, 37 | "num_train_steps": 7000, 38 | "optim": "adamw", 39 | "betas": [0.9, 0.98], 40 | "lsr": 0.1, 41 | "dropout": 0.1, 42 | "weight_decay": 0.01, 43 | "grad_norm": 1.0, 44 | "warmup_steps": 700, 45 | "sub_ctx_len": 1, 46 | "seed": 77, 47 | "no_fp16": false, 48 | "n_workers": 4, 49 | "pin_mem": true 50 | } 51 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-vatex_en_r-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vr", 5 | "name": "vatex_en_r_video_sub_train", 6 | "sub_txt_db": "/txt/vatex_subtitles.db/", 7 | "vfeat_db": "/video/vatex", 8 | "query_txt_db": "/txt/vatex_en_r_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "vr", 14 | "name": "vatex_en_r_video_sub_val", 15 | "sub_txt_db": "/txt/vatex_subtitles.db/", 16 | "vfeat_db": "/video/vatex", 17 | "query_txt_db": "/txt/vatex_en_r_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | 
"load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/ST_FT_only/vatex_en_default", 26 | "distributed_eval": true, 27 | "max_vr_video": 100, 28 | "max_clip_len": 100, 29 | "max_txt_len": 60, 30 | "vfeat_version": "resnet_slowfast", 31 | "vfeat_interval": 1.5, 32 | "train_batch_size": 64, 33 | "val_batch_size": 20, 34 | "vr_eval_video_batch_size": 50, 35 | "vr_eval_q_batch_size": 80, 36 | "gradient_accumulation_steps": 2, 37 | "learning_rate": 7e-05, 38 | "valid_steps": 200, 39 | "save_steps": 200, 40 | "num_train_steps": 4000, 41 | "optim": "adamw", 42 | "betas": [ 43 | 0.9, 44 | 0.98 45 | ], 46 | "dropout": 0.1, 47 | "weight_decay": 0.01, 48 | "grad_norm": 1.0, 49 | "warmup_steps": 400, 50 | "lw_neg_q": 10.0, 51 | "lw_neg_ctx": 10.0, 52 | "ranking_loss_type": "hinge", 53 | "margin": 0.1, 54 | "hard_pool_size": [ 55 | 80 56 | ], 57 | "hard_neg_weights": [ 58 | 10 59 | ], 60 | "hard_negative_start_step": [ 61 | 2000 62 | ], 63 | "use_all_neg": true, 64 | "sub_ctx_len": 1, 65 | "seed": 77, 66 | "no_fp16": false, 67 | "n_workers": 4, 68 | "no_pin_mem": false, 69 | "rank": 0 70 | } 71 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-vatex_en_r_sub_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vr", 5 | "name": "vatex_en_r_sub_only_train", 6 | "sub_txt_db": "/txt/vatex_subtitles.db/", 7 | "vfeat_db": "/video/vatex", 8 | "query_txt_db": "/txt/vatex_en_r_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "vr", 14 | "name": "vatex_en_r_sub_only_val", 15 | "sub_txt_db": "/txt/vatex_subtitles.db/", 16 | "vfeat_db": "/video/vatex", 17 | "query_txt_db": "/txt/vatex_en_r_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | 
"load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/multi_channel_ablation_sub_only/vatex_en_r_default", 26 | "distributed_eval": true, 27 | "max_vr_video": 100, 28 | "max_clip_len": 100, 29 | "max_txt_len": 60, 30 | "vfeat_version": "resnet_slowfast", 31 | "vfeat_interval": 1.5, 32 | "train_batch_size": 64, 33 | "val_batch_size": 20, 34 | "vr_eval_video_batch_size": 50, 35 | "vr_eval_q_batch_size": 80, 36 | "gradient_accumulation_steps": 2, 37 | "learning_rate": 7e-05, 38 | "valid_steps": 200, 39 | "save_steps": 200, 40 | "num_train_steps": 4000, 41 | "optim": "adamw", 42 | "betas": [ 43 | 0.9, 44 | 0.98 45 | ], 46 | "dropout": 0.1, 47 | "weight_decay": 0.01, 48 | "grad_norm": 1.0, 49 | "warmup_steps": 400, 50 | "lw_neg_q": 10.0, 51 | "lw_neg_ctx": 10.0, 52 | "ranking_loss_type": "hinge", 53 | "margin": 0.1, 54 | "hard_pool_size": [ 55 | 80 56 | ], 57 | "hard_neg_weights": [ 58 | 10 59 | ], 60 | "hard_negative_start_step": [ 61 | 2000 62 | ], 63 | "use_all_neg": true, 64 | "sub_ctx_len": 1, 65 | "seed": 77, 66 | "no_fp16": false, 67 | "n_workers": 4, 68 | "no_pin_mem": false, 69 | "rank": 0 70 | } 71 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-vatex_en_r_video_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vr", 5 | "name": "vatex_en_r_video_only_train", 6 | "sub_txt_db": "/txt/vatex_subtitles.db/", 7 | "vfeat_db": "/video/vatex", 8 | "query_txt_db": "/txt/vatex_en_r_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "vr", 14 | "name": "vatex_en_r_video_only_val", 15 | "sub_txt_db": "/txt/vatex_subtitles.db/", 16 | "vfeat_db": "/video/vatex", 17 | "query_txt_db": "/txt/vatex_en_r_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": 
"/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/multi_channel_ablation_video_only/vatex_en_r_default", 26 | "distributed_eval": true, 27 | "max_vr_video": 100, 28 | "max_clip_len": 100, 29 | "max_txt_len": 60, 30 | "vfeat_version": "resnet_slowfast", 31 | "vfeat_interval": 1.5, 32 | "train_batch_size": 64, 33 | "val_batch_size": 20, 34 | "vr_eval_video_batch_size": 50, 35 | "vr_eval_q_batch_size": 80, 36 | "gradient_accumulation_steps": 2, 37 | "learning_rate": 7e-05, 38 | "valid_steps": 200, 39 | "save_steps": 200, 40 | "num_train_steps": 4000, 41 | "optim": "adamw", 42 | "betas": [ 43 | 0.9, 44 | 0.98 45 | ], 46 | "dropout": 0.1, 47 | "weight_decay": 0.01, 48 | "grad_norm": 1.0, 49 | "warmup_steps": 400, 50 | "lw_neg_q": 10.0, 51 | "lw_neg_ctx": 10.0, 52 | "ranking_loss_type": "hinge", 53 | "margin": 0.1, 54 | "hard_pool_size": [ 55 | 80 56 | ], 57 | "hard_neg_weights": [ 58 | 10 59 | ], 60 | "hard_negative_start_step": [ 61 | 2000 62 | ], 63 | "use_all_neg": true, 64 | "sub_ctx_len": 1, 65 | "seed": 77, 66 | "no_fp16": false, 67 | "n_workers": 4, 68 | "no_pin_mem": false, 69 | "rank": 0 70 | } 71 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-violin-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "violin", 5 | "name": "violin_video_sub_train", 6 | "sub_txt_db": "/txt/violin_subtitles.db", 7 | "vfeat_db": "/video/violin", 8 | "query_txt_db": "/txt/violin_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "violin", 14 | "name": "violin_video_sub_val", 15 | "sub_txt_db": "/txt/violin_subtitles.db", 16 | "vfeat_db": "/video/violin", 17 | "query_txt_db": "/txt/violin_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": 
"/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/ST_FT_only/violin_default", 26 | "max_clip_len": 100, 27 | "max_txt_len": 120, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "train_batch_size": 4, 31 | "val_batch_size": 10, 32 | "gradient_accumulation_steps": 2, 33 | "learning_rate": 3e-05, 34 | "valid_steps": 200, 35 | "save_steps": 200, 36 | "num_train_steps": 6000, 37 | "optim": "adamw", 38 | "betas": [ 39 | 0.9, 40 | 0.98 41 | ], 42 | "dropout": 0.1, 43 | "weight_decay": 0.01, 44 | "lr_mul": 8.0, 45 | "grad_norm": 1.0, 46 | "warmup_steps": 600, 47 | "sub_ctx_len": 2, 48 | "seed": 77, 49 | "no_fp16": false, 50 | "n_workers": 4, 51 | "no_pin_mem": false, 52 | "rank": 0 53 | } 54 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-violin_sub_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "violin", 5 | "name": "violin_sub_only_train", 6 | "sub_txt_db": "/txt/violin_subtitles.db", 7 | "vfeat_db": "/video/violin", 8 | "query_txt_db": "/txt/violin_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "violin", 14 | "name": "violin_sub_only__val", 15 | "sub_txt_db": "/txt/violin_subtitles.db", 16 | "vfeat_db": "/video/violin", 17 | "query_txt_db": "/txt/violin_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/multi_channel_ablation_sub_only/violin_default", 26 | "max_clip_len": 100, 27 | "max_txt_len": 120, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "train_batch_size": 4, 31 | "val_batch_size": 10, 32 | "gradient_accumulation_steps": 2, 33 | 
"learning_rate": 3e-05, 34 | "valid_steps": 200, 35 | "save_steps": 200, 36 | "num_train_steps": 6000, 37 | "optim": "adamw", 38 | "betas": [ 39 | 0.9, 40 | 0.98 41 | ], 42 | "dropout": 0.1, 43 | "weight_decay": 0.01, 44 | "lr_mul": 8.0, 45 | "grad_norm": 1.0, 46 | "warmup_steps": 600, 47 | "sub_ctx_len": 2, 48 | "seed": 77, 49 | "no_fp16": false, 50 | "n_workers": 4, 51 | "no_pin_mem": false, 52 | "rank": 0 53 | } 54 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-violin_video_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "violin", 5 | "name": "violin_video_only_train", 6 | "sub_txt_db": "/txt/violin_subtitles.db", 7 | "vfeat_db": "/video/violin", 8 | "query_txt_db": "/txt/violin_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "violin", 14 | "name": "violin_video_only_val", 15 | "sub_txt_db": "/txt/violin_subtitles.db", 16 | "vfeat_db": "/video/violin", 17 | "query_txt_db": "/txt/violin_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/multi_channel_ablation_video_only/violin_default", 26 | "max_clip_len": 100, 27 | "max_txt_len": 120, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "train_batch_size": 4, 31 | "val_batch_size": 10, 32 | "gradient_accumulation_steps": 2, 33 | "learning_rate": 3e-05, 34 | "valid_steps": 200, 35 | "save_steps": 200, 36 | "num_train_steps": 6000, 37 | "optim": "adamw", 38 | "betas": [ 39 | 0.9, 40 | 0.98 41 | ], 42 | "dropout": 0.1, 43 | "weight_decay": 0.01, 44 | "lr_mul": 8.0, 45 | "grad_norm": 1.0, 46 | "warmup_steps": 600, 47 | "sub_ctx_len": 2, 48 | "seed": 77, 49 | "no_fp16": false, 50 | "n_workers": 4, 51 | 
"no_pin_mem": false, 52 | "rank": 0 53 | } 54 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-vlep-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoQA", 5 | "name": "vlep_video_sub_train", 6 | "sub_txt_db": "/txt/vlep_subtitles.db/", 7 | "vfeat_db": "/video/vlep", 8 | "query_txt_db": "/txt/vlep_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoQA", 14 | "name": "vlep_video_sub_dev", 15 | "sub_txt_db": "/txt/vlep_subtitles.db/", 16 | "vfeat_db": "/video/vlep", 17 | "query_txt_db": "/txt/vlep_dev.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/ST_FT_only/vlep_default", 26 | "max_clip_len": 100, 27 | "max_txt_len": 120, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "train_batch_size": 4, 31 | "val_batch_size": 10, 32 | "gradient_accumulation_steps": 2, 33 | "learning_rate": 5e-05, 34 | "valid_steps": 100, 35 | "save_steps": 200, 36 | "num_train_steps": 2000, 37 | "optim": "adamw", 38 | "betas": [ 39 | 0.9, 40 | 0.98 41 | ], 42 | "dropout": 0.1, 43 | "weight_decay": 0.01, 44 | "lr_mul": 10.0, 45 | "grad_norm": 1.0, 46 | "warmup_steps": 200, 47 | "lw_st_ed": 0.4, 48 | "sub_ctx_len": 0, 49 | "seed": 77, 50 | "no_fp16": false, 51 | "n_workers": 4, 52 | "no_pin_mem": false, 53 | "rank": 0 54 | } 55 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-vlep_sub_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoQA", 5 | "name": "vlep_sub_only_train", 6 | "sub_txt_db": "/txt/vlep_subtitles.db/", 7 
| "vfeat_db": "/video/vlep", 8 | "query_txt_db": "/txt/vlep_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoQA", 14 | "name": "vlep_sub_only_dev", 15 | "sub_txt_db": "/txt/vlep_subtitles.db/", 16 | "vfeat_db": "/video/vlep", 17 | "query_txt_db": "/txt/vlep_dev.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/multi_channel_ablation_sub_only/vlep_default", 26 | "max_clip_len": 100, 27 | "max_txt_len": 120, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "train_batch_size": 4, 31 | "val_batch_size": 10, 32 | "gradient_accumulation_steps": 2, 33 | "learning_rate": 5e-05, 34 | "valid_steps": 100, 35 | "save_steps": 200, 36 | "num_train_steps": 2000, 37 | "optim": "adamw", 38 | "betas": [ 39 | 0.9, 40 | 0.98 41 | ], 42 | "dropout": 0.1, 43 | "weight_decay": 0.01, 44 | "lr_mul": 10.0, 45 | "grad_norm": 1.0, 46 | "warmup_steps": 200, 47 | "lw_st_ed": 0.4, 48 | "sub_ctx_len": 0, 49 | "seed": 77, 50 | "no_fp16": false, 51 | "n_workers": 4, 52 | "no_pin_mem": false, 53 | "rank": 0 54 | } 55 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-vlep_video_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoQA", 5 | "name": "vlep_video_only_train", 6 | "sub_txt_db": "/txt/vlep_subtitles.db/", 7 | "vfeat_db": "/video/vlep", 8 | "query_txt_db": "/txt/vlep_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoQA", 14 | "name": "vlep_video_only_dev", 15 | "sub_txt_db": "/txt/vlep_subtitles.db/", 16 | "vfeat_db": "/video/vlep", 17 | "query_txt_db": "/txt/vlep_dev.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": 
"config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/multi_channel_ablation_video_only/vlep_default", 26 | "max_clip_len": 100, 27 | "max_txt_len": 120, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "train_batch_size": 4, 31 | "val_batch_size": 10, 32 | "gradient_accumulation_steps": 2, 33 | "learning_rate": 5e-05, 34 | "valid_steps": 100, 35 | "save_steps": 200, 36 | "num_train_steps": 2000, 37 | "optim": "adamw", 38 | "betas": [ 39 | 0.9, 40 | 0.98 41 | ], 42 | "dropout": 0.1, 43 | "weight_decay": 0.01, 44 | "lr_mul": 10.0, 45 | "grad_norm": 1.0, 46 | "warmup_steps": 200, 47 | "lw_st_ed": 0.4, 48 | "sub_ctx_len": 0, 49 | "seed": 77, 50 | "no_fp16": false, 51 | "n_workers": 4, 52 | "no_pin_mem": false, 53 | "rank": 0 54 | } 55 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-yc2c-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoCap", 5 | "name": "yc2c_video_sub_train", 6 | "sub_txt_db": "/txt/yc2_subtitles.db", 7 | "vfeat_db": "/video/yc2", 8 | "cap_txt_db": ["/txt/yc2r_train.db"] 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoCap", 14 | "name": "yc2c_video_sub_val", 15 | "sub_txt_db": "/txt/yc2_subtitles.db", 16 | "vfeat_db": "/video/yc2", 17 | "gt_anno": "/txt/yc2c_val_release.jsonl" 18 | } 19 | ], 20 | "model_config": "config/model_config/hero_videoCap.json", 21 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 22 | "load_partial_pretrained": true, 23 | "skip_layer_loading": true, 24 | "output_dir": "/storage/ST_FT_only/yc2c_default", 25 | "max_clip_len": 100, 26 | "max_txt_len": 60, 27 | "max_gen_step": 30, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "compressed_db": false, 31 | 
"train_batch_size": 16, 32 | "val_batch_size": 16, 33 | "gradient_accumulation_steps": 1, 34 | "learning_rate": 1e-4, 35 | "lr_mul": 10.0, 36 | "valid_steps": 500, 37 | "num_train_steps": 7000, 38 | "optim": "adamw", 39 | "betas": [0.9, 0.98], 40 | "lsr": 0.1, 41 | "dropout": 0.1, 42 | "weight_decay": 0.01, 43 | "grad_norm": 1.0, 44 | "warmup_steps": 700, 45 | "sub_ctx_len": 1, 46 | "seed": 77, 47 | "no_fp16": false, 48 | "n_workers": 4, 49 | "pin_mem": true 50 | } 51 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-yc2c_sub_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoCap", 5 | "name": "yc2c_sub_only_train", 6 | "sub_txt_db": "/txt/yc2_subtitles.db", 7 | "vfeat_db": "/video/yc2", 8 | "cap_txt_db": ["/txt/yc2r_train.db"] 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoCap", 14 | "name": "yc2c_sub_only_val", 15 | "sub_txt_db": "/txt/yc2_subtitles.db", 16 | "vfeat_db": "/video/yc2", 17 | "gt_anno": "/txt/yc2c_val_release.jsonl" 18 | } 19 | ], 20 | "model_config": "config/model_config/hero_videoCap.json", 21 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 22 | "load_partial_pretrained": true, 23 | "skip_layer_loading": true, 24 | "output_dir": "/storage/multi_channel_ablation_sub_only/yc2c_default", 25 | "max_clip_len": 100, 26 | "max_txt_len": 60, 27 | "max_gen_step": 30, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "compressed_db": false, 31 | "train_batch_size": 16, 32 | "val_batch_size": 16, 33 | "gradient_accumulation_steps": 1, 34 | "learning_rate": 1e-4, 35 | "lr_mul": 10.0, 36 | "valid_steps": 500, 37 | "num_train_steps": 7000, 38 | "optim": "adamw", 39 | "betas": [0.9, 0.98], 40 | "lsr": 0.1, 41 | "dropout": 0.1, 42 | "weight_decay": 0.01, 43 | "grad_norm": 1.0, 44 | "warmup_steps": 700, 45 | "sub_ctx_len": 1, 46 | "seed": 77, 47 | "no_fp16": 
false, 48 | "n_workers": 4, 49 | "pin_mem": true 50 | } 51 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-yc2c_video_only-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoCap", 5 | "name": "yc2c_video_only_train", 6 | "sub_txt_db": null, 7 | "vfeat_db": "/video/yc2", 8 | "cap_txt_db": ["/txt/yc2r_train.db"] 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoCap", 14 | "name": "yc2c_video_only_val", 15 | "sub_txt_db": null, 16 | "vfeat_db": "/video/yc2", 17 | "gt_anno": "/txt/yc2c_val_release.jsonl" 18 | } 19 | ], 20 | "model_config": "config/model_config/hero_videoCap.json", 21 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 22 | "load_partial_pretrained": true, 23 | "skip_layer_loading": true, 24 | "output_dir": "/storage/multi_channel_ablation_video_only/yc2c_default", 25 | "max_clip_len": 100, 26 | "max_txt_len": 60, 27 | "max_gen_step": 30, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "compressed_db": false, 31 | "train_batch_size": 16, 32 | "val_batch_size": 16, 33 | "gradient_accumulation_steps": 1, 34 | "learning_rate": 1e-4, 35 | "lr_mul": 10.0, 36 | "valid_steps": 500, 37 | "num_train_steps": 7000, 38 | "optim": "adamw", 39 | "betas": [0.9, 0.98], 40 | "lsr": 0.1, 41 | "dropout": 0.1, 42 | "weight_decay": 0.01, 43 | "grad_norm": 1.0, 44 | "warmup_steps": 700, 45 | "sub_ctx_len": 1, 46 | "seed": 77, 47 | "no_fp16": false, 48 | "n_workers": 4, 49 | "pin_mem": true 50 | } 51 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-yc2r-4gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vr", 5 | "name": "yc2r_video_sub_train", 6 | "sub_txt_db": "/txt/yc2_subtitles.db/", 7 | "vfeat_db": "/video/yc2", 8 | 
"query_txt_db": "/txt/yc2r_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "vr", 14 | "name": "yc2r_video_sub_val", 15 | "sub_txt_db": "/txt/yc2_subtitles.db/", 16 | "vfeat_db": "/video/yc2", 17 | "query_txt_db": "/txt/yc2r_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/ST_FT_only/yc2r_default", 26 | "distributed_eval": true, 27 | "max_vr_video": 100, 28 | "max_clip_len": 100, 29 | "max_txt_len": 60, 30 | "vfeat_version": "resnet_slowfast", 31 | "vfeat_interval": 1.5, 32 | "train_batch_size": 48, 33 | "val_batch_size": 20, 34 | "vr_eval_video_batch_size": 50, 35 | "vr_eval_q_batch_size": 80, 36 | "gradient_accumulation_steps": 2, 37 | "learning_rate": 7e-05, 38 | "valid_steps": 200, 39 | "save_steps": 200, 40 | "num_train_steps": 4000, 41 | "optim": "adamw", 42 | "betas": [ 43 | 0.9, 44 | 0.98 45 | ], 46 | "dropout": 0.1, 47 | "weight_decay": 0.01, 48 | "grad_norm": 1.0, 49 | "warmup_steps": 400, 50 | "lw_neg_q": 10.0, 51 | "lw_neg_ctx": 10.0, 52 | "ranking_loss_type": "hinge", 53 | "margin": 0.1, 54 | "hard_pool_size": [ 55 | 80 56 | ], 57 | "hard_neg_weights": [ 58 | 10 59 | ], 60 | "hard_negative_start_step": [ 61 | 2000 62 | ], 63 | "use_all_neg": true, 64 | "sub_ctx_len": 1, 65 | "seed": 77, 66 | "no_fp16": false, 67 | "n_workers": 4, 68 | "no_pin_mem": false, 69 | "rank": 0 70 | } 71 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-yc2r_sub_only-4gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vr", 5 | "name": "yc2r_sub_only_train", 6 | "sub_txt_db": "/txt/yc2_subtitles.db/", 7 | "vfeat_db": "/video/yc2", 8 | "query_txt_db": "/txt/yc2r_train.db" 9 | } 10 | ], 11 | 
"val_datasets": [ 12 | { 13 | "task": "vr", 14 | "name": "yc2r_sub_only_val", 15 | "sub_txt_db": "/txt/yc2_subtitles.db/", 16 | "vfeat_db": "/video/yc2", 17 | "query_txt_db": "/txt/yc2r_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/multi_channel_ablation_sub_only/yc2r_default", 26 | "distributed_eval": true, 27 | "max_vr_video": 100, 28 | "max_clip_len": 100, 29 | "max_txt_len": 60, 30 | "vfeat_version": "resnet_slowfast", 31 | "vfeat_interval": 1.5, 32 | "train_batch_size": 40, 33 | "val_batch_size": 20, 34 | "vr_eval_video_batch_size": 50, 35 | "vr_eval_q_batch_size": 80, 36 | "gradient_accumulation_steps": 2, 37 | "learning_rate": 7e-05, 38 | "valid_steps": 200, 39 | "save_steps": 200, 40 | "num_train_steps": 4000, 41 | "optim": "adamw", 42 | "betas": [ 43 | 0.9, 44 | 0.98 45 | ], 46 | "dropout": 0.1, 47 | "weight_decay": 0.01, 48 | "grad_norm": 1.0, 49 | "warmup_steps": 400, 50 | "lw_neg_q": 10.0, 51 | "lw_neg_ctx": 10.0, 52 | "ranking_loss_type": "hinge", 53 | "margin": 0.1, 54 | "hard_pool_size": [ 55 | 80 56 | ], 57 | "hard_neg_weights": [ 58 | 10 59 | ], 60 | "hard_negative_start_step": [ 61 | 2000 62 | ], 63 | "use_all_neg": true, 64 | "sub_ctx_len": 1, 65 | "seed": 77, 66 | "no_fp16": false, 67 | "n_workers": 4, 68 | "no_pin_mem": false, 69 | "rank": 0 70 | } 71 | -------------------------------------------------------------------------------- /config/FT_only_configs/train-yc2r_video_only-4gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vr", 5 | "name": "yc2r_video_only_train", 6 | "sub_txt_db": "/txt/yc2_subtitles.db/", 7 | "vfeat_db": "/video/yc2", 8 | "query_txt_db": "/txt/yc2r_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": 
"vr", 14 | "name": "yc2r_video_only_val", 15 | "sub_txt_db": "/txt/yc2_subtitles.db/", 16 | "vfeat_db": "/video/yc2", 17 | "query_txt_db": "/txt/yc2r_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 23 | "load_partial_pretrained": true, 24 | "skip_layer_loading": true, 25 | "output_dir": "/storage/multi_channel_ablation_video_only/yc2r_default", 26 | "distributed_eval": true, 27 | "max_vr_video": 100, 28 | "max_clip_len": 100, 29 | "max_txt_len": 60, 30 | "vfeat_version": "resnet_slowfast", 31 | "vfeat_interval": 1.5, 32 | "train_batch_size": 64, 33 | "val_batch_size": 20, 34 | "vr_eval_video_batch_size": 50, 35 | "vr_eval_q_batch_size": 80, 36 | "gradient_accumulation_steps": 2, 37 | "learning_rate": 7e-05, 38 | "valid_steps": 200, 39 | "save_steps": 200, 40 | "num_train_steps": 4000, 41 | "optim": "adamw", 42 | "betas": [ 43 | 0.9, 44 | 0.98 45 | ], 46 | "dropout": 0.1, 47 | "weight_decay": 0.01, 48 | "grad_norm": 1.0, 49 | "warmup_steps": 400, 50 | "lw_neg_q": 10.0, 51 | "lw_neg_ctx": 10.0, 52 | "ranking_loss_type": "hinge", 53 | "margin": 0.1, 54 | "hard_pool_size": [ 55 | 80 56 | ], 57 | "hard_neg_weights": [ 58 | 10 59 | ], 60 | "hard_negative_start_step": [ 61 | 2000 62 | ], 63 | "use_all_neg": true, 64 | "sub_ctx_len": 1, 65 | "seed": 77, 66 | "no_fp16": false, 67 | "n_workers": 4, 68 | "no_pin_mem": false, 69 | "rank": 0 70 | } 71 | -------------------------------------------------------------------------------- /config/model_config/hero_finetune.json: -------------------------------------------------------------------------------- 1 | {"f_config":{ 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 768, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 3072, 8 | "max_position_embeddings": 514, 9 | "num_attention_heads": 12, 10 | "num_hidden_layers": 6, 11 | 
"type_vocab_size": 2, 12 | "vocab_size": 50272 13 | }, 14 | "c_config": { 15 | "attention_probs_dropout_prob": 0.1, 16 | "hidden_act": "gelu", 17 | "hidden_dropout_prob": 0.1, 18 | "hidden_size": 768, 19 | "initializer_range": 0.02, 20 | "intermediate_size": 3072, 21 | "max_position_embeddings": 514, 22 | "num_attention_heads": 12, 23 | "num_hidden_layers": 3, 24 | "type_vocab_size": 2 25 | }, 26 | "q_config": { 27 | "attention_probs_dropout_prob": 0.1, 28 | "hidden_act": "gelu", 29 | "hidden_dropout_prob": 0.1, 30 | "hidden_size": 768, 31 | "initializer_range": 0.02, 32 | "intermediate_size": 3072, 33 | "num_attention_heads": 12, 34 | "max_position_embeddings": 514, 35 | "num_hidden_layers": 0, 36 | "type_vocab_size": 1, 37 | "vocab_size": 50272 38 | }, 39 | "d_config": { 40 | "attention_probs_dropout_prob": 0.1, 41 | "hidden_act": "gelu", 42 | "hidden_dropout_prob": 0.1, 43 | "hidden_size": 768, 44 | "initializer_range": 0.02, 45 | "intermediate_size": 3072, 46 | "max_position_embeddings": 1024, 47 | "num_attention_heads": 12, 48 | "num_hidden_layers": 2, 49 | "type_vocab_size": 1, 50 | "vocab_size": 50272 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /config/model_config/hero_pretrain.json: -------------------------------------------------------------------------------- 1 | {"f_config":{ 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 768, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 3072, 8 | "max_position_embeddings": 514, 9 | "num_attention_heads": 12, 10 | "num_hidden_layers": 6, 11 | "type_vocab_size": 1, 12 | "vocab_size": 50265 13 | }, 14 | "c_config": { 15 | "attention_probs_dropout_prob": 0.1, 16 | "hidden_act": "gelu", 17 | "hidden_dropout_prob": 0.1, 18 | "hidden_size": 768, 19 | "initializer_range": 0.02, 20 | "intermediate_size": 3072, 21 | "max_position_embeddings": 514, 22 | "num_attention_heads": 12, 23 | 
"num_hidden_layers": 3, 24 | "type_vocab_size": 2 25 | }, 26 | "q_config": { 27 | "attention_probs_dropout_prob": 0.1, 28 | "hidden_act": "gelu", 29 | "hidden_dropout_prob": 0.1, 30 | "hidden_size": 768, 31 | "initializer_range": 0.02, 32 | "intermediate_size": 3072, 33 | "num_attention_heads": 12, 34 | "max_position_embeddings": 514, 35 | "num_hidden_layers": 0, 36 | "type_vocab_size": 1, 37 | "vocab_size": 50265 38 | } 39 | } -------------------------------------------------------------------------------- /config/model_config/hero_videoCap.json: -------------------------------------------------------------------------------- 1 | { "model": "hero", 2 | "f_config":{ 3 | "attention_probs_dropout_prob": 0.1, 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.1, 6 | "hidden_size": 768, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 3072, 9 | "max_position_embeddings": 514, 10 | "num_attention_heads": 12, 11 | "num_hidden_layers": 6, 12 | "type_vocab_size": 2, 13 | "vocab_size": 50272 14 | }, 15 | "c_config": { 16 | "attention_probs_dropout_prob": 0.1, 17 | "hidden_act": "gelu", 18 | "hidden_dropout_prob": 0.1, 19 | "hidden_size": 768, 20 | "initializer_range": 0.02, 21 | "intermediate_size": 3072, 22 | "max_position_embeddings": 514, 23 | "num_attention_heads": 12, 24 | "num_hidden_layers": 3, 25 | "type_vocab_size": 2 26 | }, 27 | "d_config": { 28 | "attention_probs_dropout_prob": 0.1, 29 | "hidden_act": "gelu", 30 | "hidden_dropout_prob": 0.1, 31 | "hidden_size": 768, 32 | "initializer_range": 0.02, 33 | "intermediate_size": 3072, 34 | "max_position_embeddings": 1024, 35 | "num_attention_heads": 12, 36 | "num_hidden_layers": 2, 37 | "type_vocab_size": 1, 38 | "vocab_size": 50272 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /config/model_config/video_sub_feature_add_finetune.json: -------------------------------------------------------------------------------- 1 | { "model": 
"video_sub_feature_fusion", 2 | "video_sub_fusion_method": "add", 3 | "f_config":{ 4 | "attention_probs_dropout_prob": 0.1, 5 | "hidden_act": "gelu", 6 | "hidden_dropout_prob": 0.1, 7 | "hidden_size": 768, 8 | "initializer_range": 0.02, 9 | "intermediate_size": 3072, 10 | "max_position_embeddings": 514, 11 | "num_attention_heads": 12, 12 | "num_hidden_layers": 6, 13 | "type_vocab_size": 2, 14 | "vocab_size": 50272 15 | }, 16 | "c_config": { 17 | "attention_probs_dropout_prob": 0.1, 18 | "hidden_act": "gelu", 19 | "hidden_dropout_prob": 0.1, 20 | "hidden_size": 768, 21 | "initializer_range": 0.02, 22 | "intermediate_size": 3072, 23 | "max_position_embeddings": 514, 24 | "num_attention_heads": 12, 25 | "num_hidden_layers": 3, 26 | "type_vocab_size": 2 27 | }, 28 | "q_config": { 29 | "attention_probs_dropout_prob": 0.1, 30 | "hidden_act": "gelu", 31 | "hidden_dropout_prob": 0.1, 32 | "hidden_size": 768, 33 | "initializer_range": 0.02, 34 | "intermediate_size": 3072, 35 | "num_attention_heads": 12, 36 | "max_position_embeddings": 514, 37 | "num_hidden_layers": 0, 38 | "type_vocab_size": 1, 39 | "vocab_size": 50272 40 | }, 41 | "d_config": { 42 | "attention_probs_dropout_prob": 0.1, 43 | "hidden_act": "gelu", 44 | "hidden_dropout_prob": 0.1, 45 | "hidden_size": 768, 46 | "initializer_range": 0.02, 47 | "intermediate_size": 3072, 48 | "max_position_embeddings": 1024, 49 | "num_attention_heads": 12, 50 | "num_hidden_layers": 2, 51 | "type_vocab_size": 1, 52 | "vocab_size": 50272 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /config/model_config/video_sub_feature_concat_finetune.json: -------------------------------------------------------------------------------- 1 | { "model": "video_sub_feature_fusion", 2 | "video_sub_fusion_method": "concat", 3 | "f_config":{ 4 | "attention_probs_dropout_prob": 0.1, 5 | "hidden_act": "gelu", 6 | "hidden_dropout_prob": 0.1, 7 | "hidden_size": 768, 8 | "initializer_range": 0.02, 9 | 
"intermediate_size": 3072, 10 | "max_position_embeddings": 514, 11 | "num_attention_heads": 12, 12 | "num_hidden_layers": 6, 13 | "type_vocab_size": 2, 14 | "vocab_size": 50272 15 | }, 16 | "c_config": { 17 | "attention_probs_dropout_prob": 0.1, 18 | "hidden_act": "gelu", 19 | "hidden_dropout_prob": 0.1, 20 | "hidden_size": 768, 21 | "initializer_range": 0.02, 22 | "intermediate_size": 3072, 23 | "max_position_embeddings": 514, 24 | "num_attention_heads": 12, 25 | "num_hidden_layers": 3, 26 | "type_vocab_size": 2 27 | }, 28 | "q_config": { 29 | "attention_probs_dropout_prob": 0.1, 30 | "hidden_act": "gelu", 31 | "hidden_dropout_prob": 0.1, 32 | "hidden_size": 768, 33 | "initializer_range": 0.02, 34 | "intermediate_size": 3072, 35 | "num_attention_heads": 12, 36 | "max_position_embeddings": 514, 37 | "num_hidden_layers": 0, 38 | "type_vocab_size": 1, 39 | "vocab_size": 50272 40 | }, 41 | "d_config": { 42 | "attention_probs_dropout_prob": 0.1, 43 | "hidden_act": "gelu", 44 | "hidden_dropout_prob": 0.1, 45 | "hidden_size": 768, 46 | "initializer_range": 0.02, 47 | "intermediate_size": 3072, 48 | "max_position_embeddings": 1024, 49 | "num_attention_heads": 12, 50 | "num_hidden_layers": 2, 51 | "type_vocab_size": 1, 52 | "vocab_size": 50272 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /config/model_config/video_sub_sequence_finetune.json: -------------------------------------------------------------------------------- 1 | { "model": "video_sub_sequence_model", 2 | "f_config":{ 3 | "attention_probs_dropout_prob": 0.1, 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.1, 6 | "hidden_size": 768, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 3072, 9 | "max_position_embeddings": 514, 10 | "num_attention_heads": 12, 11 | "num_hidden_layers": 6, 12 | "type_vocab_size": 2, 13 | "vocab_size": 50272 14 | }, 15 | "c_config": { 16 | "attention_probs_dropout_prob": 0.1, 17 | "hidden_act": "gelu", 18 | 
"hidden_dropout_prob": 0.1, 19 | "hidden_size": 768, 20 | "initializer_range": 0.02, 21 | "intermediate_size": 3072, 22 | "max_position_embeddings": 514, 23 | "num_attention_heads": 12, 24 | "num_hidden_layers": 3, 25 | "type_vocab_size": 2 26 | }, 27 | "q_config": { 28 | "attention_probs_dropout_prob": 0.1, 29 | "hidden_act": "gelu", 30 | "hidden_dropout_prob": 0.1, 31 | "hidden_size": 768, 32 | "initializer_range": 0.02, 33 | "intermediate_size": 3072, 34 | "num_attention_heads": 12, 35 | "max_position_embeddings": 514, 36 | "num_hidden_layers": 0, 37 | "type_vocab_size": 1, 38 | "vocab_size": 50272 39 | }, 40 | "d_config": { 41 | "attention_probs_dropout_prob": 0.1, 42 | "hidden_act": "gelu", 43 | "hidden_dropout_prob": 0.1, 44 | "hidden_size": 768, 45 | "initializer_range": 0.02, 46 | "intermediate_size": 3072, 47 | "max_position_embeddings": 1024, 48 | "num_attention_heads": 12, 49 | "num_hidden_layers": 2, 50 | "type_vocab_size": 1, 51 | "vocab_size": 50272 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /config/pretrain-tv-16gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "txt_db": "/txt", 3 | "img_db": "/video", 4 | "targets": [ 5 | {"name": "tv", 6 | "sub_txt_db": "tv_subtitles.db", 7 | "vfeat_db": "tv", 8 | "vfeat_interval": 1.5, 9 | "splits": [ 10 | {"name": "all", 11 | "tasks": ["mlm", "mfm-nce", "fom", "vsm"], 12 | "train_idx": "pretrain_splits/tv_train.json", 13 | "val_idx": "pretrain_splits/tv_val.json", 14 | "ratio": [2, 2, 1, 2] 15 | } 16 | ] 17 | } 18 | ], 19 | "targets_ratio": [1], 20 | "mask_prob": 0.15, 21 | "compressed_db": false, 22 | "model_config": "config/model_config/hero_pretrain.json", 23 | "checkpoint": "/pretrain/pretrain-tv-init.bin", 24 | "load_partial_pretrained" : true, 25 | "skip_layer_loading" : true, 26 | "output_dir": "/storage/default_pretrain_tv", 27 | "max_clip_len": 100, 28 | "max_txt_len": 60, 29 | "vfeat_version": 
"resnet_slowfast", 30 | "drop_svmr_prob": 0.8, 31 | "train_batch_size": 32, 32 | "val_batch_size": 32, 33 | "gradient_accumulation_steps": 2, 34 | "learning_rate": 3e-05, 35 | "valid_steps": 500, 36 | "save_steps": 500, 37 | "num_train_steps": 100000, 38 | "optim": "adamw", 39 | "betas": [ 40 | 0.9, 41 | 0.98 42 | ], 43 | "dropout": 0.1, 44 | "weight_decay": 0.01, 45 | "grad_norm": 1.0, 46 | "warmup_steps": 10000, 47 | "lw_neg_q": 8.0, 48 | "lw_neg_ctx": 8.0, 49 | "lw_st_ed": 0.01, 50 | "ranking_loss_type": "hinge", 51 | "margin": 0.1, 52 | "hard_pool_size": [ 53 | 20 54 | ], 55 | "hard_neg_weights": [ 56 | 10 57 | ], 58 | "hard_negative_start_step": [ 59 | 20000 60 | ], 61 | "train_span_start_step": 0, 62 | "sub_ctx_len": 0, 63 | "use_all_neg": true, 64 | "seed": 77, 65 | "no_fp16": false, 66 | "n_workers": 4, 67 | "no_pin_mem": false, 68 | "rank": 0 69 | } 70 | -------------------------------------------------------------------------------- /config/train-caption-multitask-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoCap", 5 | "name": "tvc_video_sub_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | "cap_txt_db": ["/txt/tvc_train.db"], 9 | "batch_size": 4, 10 | "ratio": 2 11 | }, 12 | { 13 | "task": "videoCap", 14 | "name": "vatex_en_c_video_sub_train", 15 | "sub_txt_db": "/txt/vatex_subtitles.db", 16 | "vfeat_db": "/video/vatex", 17 | "cap_txt_db": ["/txt/vatex_en_r_train.db", "/txt/vatex_en_r_val.db"], 18 | "batch_size": 128, 19 | "ratio": 2 20 | }, 21 | { 22 | "task": "videoCap", 23 | "name": "yc2c_video_sub_train", 24 | "sub_txt_db": "/txt/yc2_subtitles.db", 25 | "vfeat_db": "/video/yc2", 26 | "cap_txt_db": ["/txt/yc2r_train.db"], 27 | "batch_size": 16, 28 | "ratio": 1 29 | } 30 | ], 31 | "val_datasets": [ 32 | { 33 | "task": "videoCap", 34 | "name": "tvc_video_sub_val", 35 | "sub_txt_db": "/txt/tv_subtitles.db", 36 | "vfeat_db": 
"/video/tv", 37 | "batch_size": 8, 38 | "gt_anno": "/txt/tvc_val_release.jsonl" 39 | }, 40 | { 41 | "task": "videoCap", 42 | "name": "vatex_en_c_video_sub_val", 43 | "sub_txt_db": "/txt/vatex_subtitles.db", 44 | "vfeat_db": "/video/vatex", 45 | "batch_size": 128, 46 | "gt_anno": "/txt/vatex_en_c_test_public_release.jsonl" 47 | }, 48 | { 49 | "task": "videoCap", 50 | "name": "yc2c_video_sub_val", 51 | "sub_txt_db": "/txt/yc2_subtitles.db", 52 | "vfeat_db": "/video/yc2", 53 | "batch_size": 16, 54 | "gt_anno": "/txt/yc2c_val_release.jsonl" 55 | } 56 | ], 57 | "compressed_db": false, 58 | "model_config": "/src/config/model_config/hero_videoCap.json", 59 | "checkpoint": "/pretrain/hero-tv-ht100.pt", 60 | "output_dir": "/storage/MT_PT_FT/captioning_multi-task_default", 61 | "max_clip_len": 100, 62 | "max_txt_len": 60, 63 | "max_cap_per_vid": -1, 64 | "max_gen_step": 30, 65 | "vfeat_version": "resnet_slowfast", 66 | "vfeat_interval": 1.5, 67 | "train_batch_size": 4, 68 | "val_batch_size": 8, 69 | "gradient_accumulation_steps": 1, 70 | "learning_rate": 1e-4, 71 | "lr_mul": 10.0, 72 | "valid_steps": 500, 73 | "num_train_steps": 30000, 74 | "optim": "adamw", 75 | "betas": [0.9, 0.98], 76 | "lsr": 0.1, 77 | "dropout": 0.1, 78 | "weight_decay": 0.01, 79 | "grad_norm": 1.0, 80 | "warmup_steps": 3000, 81 | "sub_ctx_len": 1, 82 | "seed": 77, 83 | "no_fp16": false, 84 | "n_workers": 4, 85 | "pin_mem": true 86 | } 87 | -------------------------------------------------------------------------------- /config/train-how2qa-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoQA", 5 | "name": "how2qa_video_sub_train", 6 | "sub_txt_db": "/txt/how2_subtitles.db", 7 | "vfeat_db": "/video/how2", 8 | "query_txt_db": "/txt/how2qa_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoQA", 14 | "name": "how2qa_video_sub_val", 15 | "sub_txt_db": "/txt/how2_subtitles.db", 16 | 
"vfeat_db": "/video/how2", 17 | "query_txt_db": "/txt/how2qa_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/hero-tv-ht100.pt", 23 | "output_dir": "/storage/ST_PT_FT/how2qa_default", 24 | "max_clip_len": 100, 25 | "max_txt_len": 120, 26 | "vfeat_version": "resnet_slowfast", 27 | "vfeat_interval": 1.5, 28 | "train_batch_size": 4, 29 | "val_batch_size": 10, 30 | "gradient_accumulation_steps": 2, 31 | "learning_rate": 5e-05, 32 | "valid_steps": 200, 33 | "save_steps": 200, 34 | "num_train_steps": 2000, 35 | "optim": "adamw", 36 | "betas": [ 37 | 0.9, 38 | 0.98 39 | ], 40 | "dropout": 0.1, 41 | "weight_decay": 0.01, 42 | "lr_mul": 10.0, 43 | "grad_norm": 1.0, 44 | "warmup_steps": 200, 45 | "lw_st_ed": 0.4, 46 | "sub_ctx_len": 0, 47 | "seed": 77, 48 | "no_fp16": false, 49 | "n_workers": 4, 50 | "no_pin_mem": false, 51 | "rank": 0 52 | } 53 | -------------------------------------------------------------------------------- /config/train-how2r-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vcmr", 5 | "name": "how2r_video_sub_train", 6 | "sub_txt_db": "/txt/how2_subtitles.db", 7 | "vfeat_db": "/video/how2", 8 | "query_txt_db": "/txt/how2r_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "vcmr", 14 | "name": "how2r_video_sub_val", 15 | "sub_txt_db": "/txt/how2_subtitles.db", 16 | "vfeat_db": "/video/how2", 17 | "query_txt_db": "/txt/how2r_val_1k.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/hero-tv-ht100.pt", 23 | "output_dir": "/storage/ST_PT_FT/how2r_default", 24 | "eval_with_query_type": true, 25 | "max_before_nms": 200, 26 | "max_after_nms": 100, 27 | "distributed_eval": true, 28 | "nms_thd": -1, 29 | "q2c_alpha": 20, 30 | "max_vcmr_video": 100, 31 | "full_eval_tasks": [ 
32 | "VCMR", 33 | "SVMR", 34 | "VR" 35 | ], 36 | "max_clip_len": 100, 37 | "max_txt_len": 60, 38 | "vfeat_version": "resnet_slowfast", 39 | "vfeat_interval": 1.5, 40 | "min_pred_l": 3, 41 | "max_pred_l": 20, 42 | "drop_svmr_prob": 0.9, 43 | "train_batch_size": 32, 44 | "val_batch_size": 20, 45 | "vcmr_eval_video_batch_size": 50, 46 | "vcmr_eval_batch_size": 80, 47 | "gradient_accumulation_steps":2, 48 | "learning_rate": 1e-04, 49 | "valid_steps": 200, 50 | "save_steps": 200, 51 | "num_train_steps": 3000, 52 | "optim": "adamw", 53 | "betas": [ 54 | 0.9, 55 | 0.98 56 | ], 57 | "dropout": 0.1, 58 | "weight_decay": 0.01, 59 | "grad_norm": 1.0, 60 | "warmup_steps": 300, 61 | "lw_neg_q": 8.0, 62 | "lw_neg_ctx": 8.0, 63 | "lw_st_ed": 0.01, 64 | "ranking_loss_type": "hinge", 65 | "margin": 0.1, 66 | "hard_pool_size": [ 67 | 20 68 | ], 69 | "hard_neg_weights": [ 70 | 10 71 | ], 72 | "hard_negative_start_step": [ 73 | 1000 74 | ], 75 | "train_span_start_step": 0, 76 | "sub_ctx_len": 0, 77 | "use_all_neg": true, 78 | "seed": 77, 79 | "no_fp16": false, 80 | "n_workers": 4, 81 | "no_pin_mem": false, 82 | "rank": 0 83 | } 84 | -------------------------------------------------------------------------------- /config/train-qa-multitask-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoQA", 5 | "name": "tvqa_video_sub_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | "query_txt_db": "/txt/tvqa_train.db", 9 | "ratio": 5 10 | }, 11 | { 12 | "task": "videoQA", 13 | "name": "how2qa_video_sub_train", 14 | "sub_txt_db": "/txt/how2_subtitles.db", 15 | "vfeat_db": "/video/how2", 16 | "query_txt_db": "/txt/how2qa_train.db", 17 | "ratio": 1 18 | }, 19 | { 20 | "task": "violin", 21 | "name": "violin_video_sub_train", 22 | "sub_txt_db": "/txt/violin_subtitles.db", 23 | "vfeat_db": "/video/violin", 24 | "query_txt_db": "/txt/violin_train.db", 25 | "ratio": 3 26 | }, 27
| { 28 | "task": "videoQA", 29 | "name": "vlep_video_sub_train", 30 | "sub_txt_db": "/txt/vlep_subtitles.db/", 31 | "vfeat_db": "/video/vlep", 32 | "query_txt_db": "/txt/vlep_train.db", 33 | "ratio": 1 34 | } 35 | ], 36 | "val_datasets": [ 37 | { 38 | "task": "videoQA", 39 | "name": "tvqa_video_sub_val", 40 | "sub_txt_db": "/txt/tv_subtitles.db", 41 | "vfeat_db": "/video/tv", 42 | "query_txt_db": "/txt/tvqa_val.db" 43 | }, 44 | { 45 | "task": "videoQA", 46 | "name": "how2qa_video_sub_val", 47 | "sub_txt_db": "/txt/how2_subtitles.db", 48 | "vfeat_db": "/video/how2", 49 | "query_txt_db": "/txt/how2qa_val.db" 50 | }, 51 | { 52 | "task": "violin", 53 | "name": "violin_video_sub_val", 54 | "sub_txt_db": "/txt/violin_subtitles.db", 55 | "vfeat_db": "/video/violin", 56 | "query_txt_db": "/txt/violin_val.db" 57 | }, 58 | { 59 | "task": "videoQA", 60 | "name": "vlep_video_sub_dev", 61 | "sub_txt_db": "/txt/vlep_subtitles.db/", 62 | "vfeat_db": "/video/vlep", 63 | "query_txt_db": "/txt/vlep_dev.db" 64 | } 65 | ], 66 | "compressed_db": false, 67 | "model_config": "config/model_config/hero_finetune.json", 68 | "checkpoint": "/pretrain/hero-tv-ht100.pt", 69 | "output_dir": "/storage/MT_PT_FT/qa_multi-task_default", 70 | "max_clip_len": 100, 71 | "max_txt_len": 120, 72 | "vfeat_version": "resnet_slowfast", 73 | "vfeat_interval": 1.5, 74 | "train_batch_size": 4, 75 | "val_batch_size": 10, 76 | "gradient_accumulation_steps": 2, 77 | "learning_rate": 5e-05, 78 | "valid_steps": 200, 79 | "save_steps": 200, 80 | "num_train_steps": 20000, 81 | "optim": "adamw", 82 | "betas": [ 83 | 0.9, 84 | 0.98 85 | ], 86 | "dropout": 0.1, 87 | "weight_decay": 0.01, 88 | "lr_mul": 10.0, 89 | "grad_norm": 1.0, 90 | "warmup_steps": 2000, 91 | "lw_st_ed": 0.4, 92 | "sub_ctx_len": 0, 93 | "seed": 77, 94 | "no_fp16": false, 95 | "n_workers": 4, 96 | "no_pin_mem": false, 97 | "rank": 0 98 | } -------------------------------------------------------------------------------- 
/config/train-retrieval-multitask-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vcmr", 5 | "name": "tvr_video_sub_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | "query_txt_db": "/txt/tvr_train.db", 9 | "batch_size": 32, 10 | "ratio": 2 11 | }, 12 | { 13 | "task": "vcmr", 14 | "name": "how2r_video_sub_train", 15 | "sub_txt_db": "/txt/how2_subtitles.db", 16 | "vfeat_db": "/video/how2", 17 | "query_txt_db": "/txt/how2r_train.db", 18 | "batch_size": 32, 19 | "ratio": 1 20 | }, 21 | { 22 | "task": "vr", 23 | "name": "vatex_en_r_video_sub_train", 24 | "sub_txt_db": "/txt/vatex_subtitles.db/", 25 | "vfeat_db": "/video/vatex", 26 | "query_txt_db": "/txt/vatex_en_r_train.db", 27 | "batch_size": 64, 28 | "ratio": 3 29 | }, 30 | { 31 | "task": "vr", 32 | "name": "yc2r_video_sub_train", 33 | "sub_txt_db": "/txt/yc2_subtitles.db/", 34 | "vfeat_db": "/video/yc2", 35 | "query_txt_db": "/txt/yc2r_train.db", 36 | "batch_size": 48, 37 | "ratio": 1 38 | } 39 | ], 40 | "val_datasets": [ 41 | { 42 | "task": "vcmr", 43 | "name": "tvr_video_sub_val", 44 | "sub_txt_db": "/txt/tv_subtitles.db", 45 | "vfeat_db": "/video/tv", 46 | "query_txt_db": "/txt/tvr_val.db" 47 | }, 48 | { 49 | "task": "vcmr", 50 | "name": "how2r_video_sub_val", 51 | "sub_txt_db": "/txt/how2_subtitles.db", 52 | "vfeat_db": "/video/how2", 53 | "query_txt_db": "/txt/how2r_val_1k.db" 54 | }, 55 | { 56 | "task": "vr", 57 | "name": "vatex_en_r_video_sub_val", 58 | "sub_txt_db": "/txt/vatex_subtitles.db/", 59 | "vfeat_db": "/video/vatex", 60 | "query_txt_db": "/txt/vatex_en_r_val.db" 61 | }, 62 | { 63 | "task": "vr", 64 | "name": "yc2r_video_sub_val", 65 | "sub_txt_db": "/txt/yc2_subtitles.db/", 66 | "vfeat_db": "/video/yc2", 67 | "query_txt_db": "/txt/yc2r_val.db" 68 | } 69 | ], 70 | "compressed_db": false, 71 | "model_config": "config/model_config/hero_finetune.json", 72 | "checkpoint": 
"/pretrain/hero-tv-ht100.pt", 73 | "output_dir": "/storage/MT_PT_FT/retrieval_multi-task_default", 74 | "eval_with_query_type": true, 75 | "max_before_nms": 200, 76 | "max_after_nms": 100, 77 | "distributed_eval": true, 78 | "nms_thd": -1, 79 | "q2c_alpha": 20, 80 | "max_vcmr_video": 100, 81 | "full_eval_tasks": [ 82 | "VCMR", 83 | "SVMR", 84 | "VR" 85 | ], 86 | "max_clip_len": 100, 87 | "max_txt_len": 60, 88 | "vfeat_version": "resnet_slowfast", 89 | "vfeat_interval": 1.5, 90 | "min_pred_l": 2, 91 | "max_pred_l": 16, 92 | "drop_svmr_prob": 0.8, 93 | "train_batch_size": 32, 94 | "val_batch_size": 20, 95 | "vcmr_eval_video_batch_size": 50, 96 | "vcmr_eval_batch_size": 80, 97 | "vr_eval_video_batch_size": 50, 98 | "vr_eval_batch_size": 80, 99 | "gradient_accumulation_steps":2, 100 | "learning_rate": 1e-04, 101 | "valid_steps": 400, 102 | "save_steps": 400, 103 | "num_train_steps": 10000, 104 | "optim": "adamw", 105 | "betas": [ 106 | 0.9, 107 | 0.98 108 | ], 109 | "dropout": 0.1, 110 | "weight_decay": 0.01, 111 | "grad_norm": 1.0, 112 | "warmup_steps": 1000, 113 | "lw_neg_q": 8.0, 114 | "lw_neg_ctx": 8.0, 115 | "lw_st_ed": 0.01, 116 | "ranking_loss_type": "hinge", 117 | "margin": 0.1, 118 | "hard_pool_size": [ 119 | 20 120 | ], 121 | "hard_neg_weights": [ 122 | 10 123 | ], 124 | "hard_negative_start_step": [ 125 | 4000 126 | ], 127 | "train_span_start_step": 0, 128 | "sub_ctx_len": 0, 129 | "use_all_neg": true, 130 | "seed": 77, 131 | "no_fp16": false, 132 | "n_workers": 4, 133 | "no_pin_mem": false, 134 | "rank": 0 135 | } 136 | -------------------------------------------------------------------------------- /config/train-tvc-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoCap", 5 | "name": "tvc_video_sub_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | "cap_txt_db": ["/txt/tvc_train.db"] 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | 
"task": "videoCap", 14 | "name": "tvc_video_sub_val", 15 | "sub_txt_db": "/txt/tv_subtitles.db", 16 | "vfeat_db": "/video/tv", 17 | "gt_anno": "/txt/tvc_val_release.jsonl" 18 | } 19 | ], 20 | "model_config": "/src/config/model_config/hero_videoCap.json", 21 | "checkpoint": "/pretrain/hero-tv-ht100.pt", 22 | "load_partial_pretrained": false, 23 | "skip_layer_loading": false, 24 | "output_dir": "/storage/ST_PT_FT/tvc_default", 25 | "max_clip_len": 100, 26 | "max_txt_len": 60, 27 | "max_cap_per_vid": -1, 28 | "max_gen_step": 30, 29 | "vfeat_version": "resnet_slowfast", 30 | "vfeat_interval": 1.5, 31 | "compressed_db": false, 32 | "train_batch_size": 4, 33 | "val_batch_size": 8, 34 | "gradient_accumulation_steps": 1, 35 | "learning_rate": 1e-4, 36 | "lr_mul": 10.0, 37 | "valid_steps": 500, 38 | "num_train_steps": 7000, 39 | "optim": "adamw", 40 | "betas": [0.9, 0.98], 41 | "lsr": 0.1, 42 | "dropout": 0.1, 43 | "weight_decay": 0.01, 44 | "grad_norm": 1.0, 45 | "warmup_steps": 700, 46 | "sub_ctx_len": 1, 47 | "seed": 77, 48 | "no_fp16": false, 49 | "n_workers": 4, 50 | "pin_mem": true 51 | } 52 | -------------------------------------------------------------------------------- /config/train-tvqa-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoQA", 5 | "name": "tvqa_video_sub_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | "query_txt_db": "/txt/tvqa_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoQA", 14 | "name": "tvqa_video_sub_val", 15 | "sub_txt_db": "/txt/tv_subtitles.db", 16 | "vfeat_db": "/video/tv", 17 | "query_txt_db": "/txt/tvqa_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/hero-tv-ht100.pt", 23 | "output_dir": "/storage/ST_PT_FT/tvqa_default", 24 | "max_clip_len": 100, 25 | "max_txt_len": 120, 26 | 
"vfeat_version": "resnet_slowfast", 27 | "vfeat_interval": 1.5, 28 | "train_batch_size": 4, 29 | "val_batch_size": 10, 30 | "gradient_accumulation_steps": 2, 31 | "learning_rate": 5e-05, 32 | "valid_steps": 200, 33 | "save_steps": 200, 34 | "num_train_steps": 10000, 35 | "optim": "adamw", 36 | "betas": [ 37 | 0.9, 38 | 0.98 39 | ], 40 | "dropout": 0.1, 41 | "weight_decay": 0.01, 42 | "lr_mul": 10.0, 43 | "grad_norm": 1.0, 44 | "warmup_steps": 1000, 45 | "lw_st_ed": 0.4, 46 | "sub_ctx_len": 0, 47 | "seed": 77, 48 | "no_fp16": false, 49 | "n_workers": 4, 50 | "no_pin_mem": false, 51 | "rank": 0 52 | } 53 | -------------------------------------------------------------------------------- /config/train-tvr-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vcmr", 5 | "name": "tvr_video_sub_train", 6 | "sub_txt_db": "/txt/tv_subtitles.db", 7 | "vfeat_db": "/video/tv", 8 | "query_txt_db": "/txt/tvr_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "vcmr", 14 | "name": "tvr_video_sub_val", 15 | "sub_txt_db": "/txt/tv_subtitles.db", 16 | "vfeat_db": "/video/tv", 17 | "query_txt_db": "/txt/tvr_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/hero-tv-ht100.pt", 23 | "output_dir": "/storage/ST_PT_FT/tvr_default", 24 | "eval_with_query_type": true, 25 | "max_before_nms": 200, 26 | "max_after_nms": 100, 27 | "distributed_eval": true, 28 | "nms_thd": -1, 29 | "q2c_alpha": 20, 30 | "max_vcmr_video": 100, 31 | "full_eval_tasks": [ 32 | "VCMR", 33 | "SVMR", 34 | "VR" 35 | ], 36 | "max_clip_len": 100, 37 | "max_txt_len": 60, 38 | "vfeat_version": "resnet_slowfast", 39 | "vfeat_interval": 1.5, 40 | "min_pred_l": 2, 41 | "max_pred_l": 16, 42 | "drop_svmr_prob": 0.8, 43 | "train_batch_size": 32, 44 | "val_batch_size": 20, 45 | "vcmr_eval_video_batch_size": 50, 46 | 
"vcmr_eval_batch_size": 80, 47 | "gradient_accumulation_steps":2, 48 | "learning_rate": 1e-04, 49 | "valid_steps": 200, 50 | "save_steps": 200, 51 | "num_train_steps": 5000, 52 | "optim": "adamw", 53 | "betas": [ 54 | 0.9, 55 | 0.98 56 | ], 57 | "dropout": 0.1, 58 | "weight_decay": 0.01, 59 | "grad_norm": 1.0, 60 | "warmup_steps": 500, 61 | "lw_neg_q": 8.0, 62 | "lw_neg_ctx": 8.0, 63 | "lw_st_ed": 0.01, 64 | "ranking_loss_type": "hinge", 65 | "margin": 0.1, 66 | "hard_pool_size": [ 67 | 20 68 | ], 69 | "hard_neg_weights": [ 70 | 10 71 | ], 72 | "hard_negative_start_step": [ 73 | 2000 74 | ], 75 | "train_span_start_step": 0, 76 | "sub_ctx_len": 0, 77 | "use_all_neg": true, 78 | "seed": 77, 79 | "no_fp16": false, 80 | "n_workers": 4, 81 | "no_pin_mem": false, 82 | "rank": 0 83 | } 84 | -------------------------------------------------------------------------------- /config/train-vatex_en_c-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoCap", 5 | "name": "vatex_en_c_video_sub_train", 6 | "sub_txt_db": "/txt/vatex_subtitles.db", 7 | "vfeat_db": "/video/vatex", 8 | "cap_txt_db": ["/txt/vatex_en_r_train.db", "/txt/vatex_en_r_val.db"] 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoCap", 14 | "name": "vatex_en_c_video_sub_val", 15 | "sub_txt_db": "/txt/vatex_subtitles.db", 16 | "vfeat_db": "/video/vatex", 17 | "gt_anno": "/txt/vatex_en_c_test_public_release.jsonl" 18 | } 19 | ], 20 | "model_config": "config/model_config/hero_videoCap.json", 21 | "checkpoint": "/pretrain/hero-tv-ht100.pt", 22 | "output_dir": "/storage/ST_PT_FT/vatex_en_c_default", 23 | "max_clip_len": 100, 24 | "max_txt_len": 60, 25 | "max_gen_step": 30, 26 | "vfeat_version": "resnet_slowfast", 27 | "vfeat_interval": 1.5, 28 | "compressed_db": false, 29 | "train_batch_size": 128, 30 | "val_batch_size": 128, 31 | "gradient_accumulation_steps": 1, 32 | "learning_rate": 1e-4, 33 | "lr_mul": 10.0, 
34 | "valid_steps": 500, 35 | "num_train_steps": 7000, 36 | "optim": "adamw", 37 | "betas": [0.9, 0.98], 38 | "lsr": 0.1, 39 | "dropout": 0.1, 40 | "weight_decay": 0.01, 41 | "grad_norm": 1.0, 42 | "warmup_steps": 700, 43 | "sub_ctx_len": 1, 44 | "seed": 77, 45 | "no_fp16": false, 46 | "n_workers": 4, 47 | "pin_mem": true 48 | } 49 | -------------------------------------------------------------------------------- /config/train-vatex_en_r-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vr", 5 | "name": "vatex_en_r_video_sub_train", 6 | "sub_txt_db": "/txt/vatex_subtitles.db/", 7 | "vfeat_db": "/video/vatex", 8 | "query_txt_db": "/txt/vatex_en_r_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "vr", 14 | "name": "vatex_en_r_video_sub_val", 15 | "sub_txt_db": "/txt/vatex_subtitles.db/", 16 | "vfeat_db": "/video/vatex", 17 | "query_txt_db": "/txt/vatex_en_r_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/hero-tv-ht100.pt", 23 | "output_dir": "/storage/ST_PT_FT/vatex_en_r_default", 24 | "distributed_eval": true, 25 | "max_vr_video": 100, 26 | "max_clip_len": 100, 27 | "max_txt_len": 60, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "train_batch_size": 64, 31 | "val_batch_size": 20, 32 | "vr_eval_video_batch_size": 50, 33 | "vr_eval_q_batch_size": 80, 34 | "gradient_accumulation_steps": 2, 35 | "learning_rate": 7e-05, 36 | "valid_steps": 200, 37 | "save_steps": 200, 38 | "num_train_steps": 4000, 39 | "optim": "adamw", 40 | "betas": [ 41 | 0.9, 42 | 0.98 43 | ], 44 | "dropout": 0.1, 45 | "weight_decay": 0.01, 46 | "grad_norm": 1.0, 47 | "warmup_steps": 400, 48 | "lw_neg_q": 10.0, 49 | "lw_neg_ctx": 10.0, 50 | "ranking_loss_type": "hinge", 51 | "margin": 0.1, 52 | "hard_pool_size": [ 53 | 80 54 | ], 55 | "hard_neg_weights": [ 56 | 10 57 | ], 
58 | "hard_negative_start_step": [ 59 | 2000 60 | ], 61 | "use_all_neg": true, 62 | "sub_ctx_len": 1, 63 | "seed": 77, 64 | "no_fp16": false, 65 | "n_workers": 4, 66 | "no_pin_mem": false, 67 | "rank": 0 68 | } 69 | -------------------------------------------------------------------------------- /config/train-violin-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "violin", 5 | "name": "violin_video_sub_train", 6 | "sub_txt_db": "/txt/violin_subtitles.db", 7 | "vfeat_db": "/video/violin", 8 | "query_txt_db": "/txt/violin_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "violin", 14 | "name": "violin_video_sub_val", 15 | "sub_txt_db": "/txt/violin_subtitles.db", 16 | "vfeat_db": "/video/violin", 17 | "query_txt_db": "/txt/violin_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/hero-tv-ht100.pt", 23 | "output_dir": "/storage/ST_PT_FT/violin_default", 24 | "max_clip_len": 100, 25 | "max_txt_len": 120, 26 | "vfeat_version": "resnet_slowfast", 27 | "vfeat_interval": 1.5, 28 | "train_batch_size": 4, 29 | "val_batch_size": 10, 30 | "gradient_accumulation_steps": 2, 31 | "learning_rate": 3e-05, 32 | "valid_steps": 200, 33 | "save_steps": 200, 34 | "num_train_steps": 6000, 35 | "optim": "adamw", 36 | "betas": [ 37 | 0.9, 38 | 0.98 39 | ], 40 | "dropout": 0.1, 41 | "weight_decay": 0.01, 42 | "lr_mul": 8.0, 43 | "grad_norm": 1.0, 44 | "warmup_steps": 600, 45 | "sub_ctx_len": 2, 46 | "seed": 77, 47 | "no_fp16": false, 48 | "n_workers": 4, 49 | "no_pin_mem": false, 50 | "rank": 0 51 | } 52 | -------------------------------------------------------------------------------- /config/train-vlep-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoQA", 5 | "name": "vlep_video_sub_train", 6 | 
"sub_txt_db": "/txt/vlep_subtitles.db/", 7 | "vfeat_db": "/video/vlep", 8 | "query_txt_db": "/txt/vlep_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoQA", 14 | "name": "vlep_video_sub_dev", 15 | "sub_txt_db": "/txt/vlep_subtitles.db/", 16 | "vfeat_db": "/video/vlep", 17 | "query_txt_db": "/txt/vlep_dev.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/hero-tv-ht100.pt", 23 | "output_dir": "/storage/ST_PT_FT/vlep_default", 24 | "max_clip_len": 100, 25 | "max_txt_len": 120, 26 | "vfeat_version": "resnet_slowfast", 27 | "vfeat_interval": 1.5, 28 | "train_batch_size": 4, 29 | "val_batch_size": 10, 30 | "gradient_accumulation_steps": 2, 31 | "learning_rate": 5e-05, 32 | "valid_steps": 100, 33 | "save_steps": 200, 34 | "num_train_steps": 1000, 35 | "optim": "adamw", 36 | "betas": [ 37 | 0.9, 38 | 0.98 39 | ], 40 | "dropout": 0.1, 41 | "weight_decay": 0.01, 42 | "lr_mul": 10.0, 43 | "grad_norm": 1.0, 44 | "warmup_steps": 100, 45 | "lw_st_ed": 0.4, 46 | "sub_ctx_len": 0, 47 | "seed": 77, 48 | "no_fp16": false, 49 | "n_workers": 4, 50 | "no_pin_mem": false, 51 | "rank": 0 52 | } 53 | -------------------------------------------------------------------------------- /config/train-yc2c-8gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "videoCap", 5 | "name": "yc2c_video_sub_train", 6 | "sub_txt_db": "/txt/yc2_subtitles.db", 7 | "vfeat_db": "/video/yc2", 8 | "cap_txt_db": ["/txt/yc2r_train.db"] 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "videoCap", 14 | "name": "yc2c_video_sub_val", 15 | "sub_txt_db": "/txt/yc2_subtitles.db", 16 | "vfeat_db": "/video/yc2", 17 | "gt_anno": "/txt/yc2c_val_release.jsonl" 18 | } 19 | ], 20 | "model_config": "config/model_config/hero_videoCap.json", 21 | "checkpoint": "/pretrain/hero-tv-ht100.pt", 22 | "output_dir": 
"/storage/ST_PT_FT/yc2c_default", 23 | "max_clip_len": 100, 24 | "max_txt_len": 60, 25 | "max_gen_step": 30, 26 | "vfeat_version": "resnet_slowfast", 27 | "vfeat_interval": 1.5, 28 | "compressed_db": false, 29 | "train_batch_size": 16, 30 | "val_batch_size": 16, 31 | "gradient_accumulation_steps": 1, 32 | "learning_rate": 1e-4, 33 | "lr_mul": 10.0, 34 | "valid_steps": 500, 35 | "num_train_steps": 7000, 36 | "optim": "adamw", 37 | "betas": [0.9, 0.98], 38 | "lsr": 0.1, 39 | "dropout": 0.1, 40 | "weight_decay": 0.01, 41 | "grad_norm": 1.0, 42 | "warmup_steps": 700, 43 | "sub_ctx_len": 1, 44 | "seed": 77, 45 | "no_fp16": false, 46 | "n_workers": 4, 47 | "pin_mem": true 48 | } 49 | -------------------------------------------------------------------------------- /config/train-yc2r-4gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_datasets": [ 3 | { 4 | "task": "vr", 5 | "name": "yc2r_video_sub_train", 6 | "sub_txt_db": "/txt/yc2_subtitles.db/", 7 | "vfeat_db": "/video/yc2", 8 | "query_txt_db": "/txt/yc2r_train.db" 9 | } 10 | ], 11 | "val_datasets": [ 12 | { 13 | "task": "vr", 14 | "name": "yc2r_video_sub_val", 15 | "sub_txt_db": "/txt/yc2_subtitles.db/", 16 | "vfeat_db": "/video/yc2", 17 | "query_txt_db": "/txt/yc2r_val.db" 18 | } 19 | ], 20 | "compressed_db": false, 21 | "model_config": "config/model_config/hero_finetune.json", 22 | "checkpoint": "/pretrain/hero-tv-ht100.pt", 23 | "output_dir": "/storage/ST_PT_FT/yc2r_video_sub_default", 24 | "distributed_eval": true, 25 | "max_vr_video": 100, 26 | "max_clip_len": 100, 27 | "max_txt_len": 60, 28 | "vfeat_version": "resnet_slowfast", 29 | "vfeat_interval": 1.5, 30 | "train_batch_size": 48, 31 | "val_batch_size": 20, 32 | "vr_eval_video_batch_size": 50, 33 | "vr_eval_q_batch_size": 80, 34 | "gradient_accumulation_steps": 2, 35 | "learning_rate": 7e-05, 36 | "valid_steps": 200, 37 | "save_steps": 200, 38 | "num_train_steps": 4000, 39 | "optim": "adamw", 40 | 
"betas": [ 41 | 0.9, 42 | 0.98 43 | ], 44 | "dropout": 0.1, 45 | "weight_decay": 0.01, 46 | "grad_norm": 1.0, 47 | "warmup_steps": 400, 48 | "lw_neg_q": 10.0, 49 | "lw_neg_ctx": 10.0, 50 | "ranking_loss_type": "hinge", 51 | "margin": 0.1, 52 | "hard_pool_size": [ 53 | 80 54 | ], 55 | "hard_neg_weights": [ 56 | 10 57 | ], 58 | "hard_negative_start_step": [ 59 | 2000 60 | ], 61 | "use_all_neg": true, 62 | "sub_ctx_len": 1, 63 | "seed": 77, 64 | "no_fp16": false, 65 | "n_workers": 4, 66 | "no_pin_mem": false, 67 | "rank": 0 68 | } 69 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Microsoft Corporation. 3 | Licensed under the MIT license. 4 | 5 | """ 6 | from .data import ( 7 | TxtTokLmdb, VideoFeatLmdb, SubTokLmdb, 8 | QueryTokLmdb, VideoFeatSubTokDataset, video_collate, 9 | VideoFeatDataset, QaQueryTokLmdb, SubOnlyDataset) 10 | from .loader import PrefetchLoader, MetaLoader 11 | from .vcmr import ( 12 | VcmrDataset, vcmr_collate, VcmrEvalDataset, vcmr_eval_collate, 13 | VcmrFullEvalDataset, vcmr_full_eval_collate, 14 | VcmrVideoOnlyDataset, VcmrVideoOnlyEvalDataset, 15 | VcmrVideoOnlyFullEvalDataset, 16 | VcmrSubOnlyDataset, VcmrSubOnlyEvalDataset, 17 | VcmrSubOnlyFullEvalDataset) 18 | from .vr import ( 19 | VrDataset, VrEvalDataset, VrSubTokLmdb, VrQueryTokLmdb, 20 | MsrvttQueryTokLmdb, 21 | VrFullEvalDataset, vr_collate, vr_eval_collate, 22 | vr_full_eval_collate, 23 | VrVideoOnlyDataset, VrVideoOnlyEvalDataset, 24 | VrVideoOnlyFullEvalDataset, 25 | VrSubOnlyDataset, VrSubOnlyEvalDataset, 26 | VrSubOnlyFullEvalDataset) 27 | from .videoQA import ( 28 | VideoQaDataset, video_qa_collate, 29 | VideoQaEvalDataset, video_qa_eval_collate, 30 | VideoQaVideoOnlyDataset, VideoQaVideoOnlyEvalDataset, 31 | VideoQaSubOnlyDataset, VideoQaSubOnlyEvalDataset) 32 | from .vlep import ( 33 | VlepDataset, vlep_collate, 34 | 
VlepEvalDataset, vlep_eval_collate, 35 | VlepVideoOnlyDataset, VlepVideoOnlyEvalDataset, 36 | VlepSubOnlyDataset, VlepSubOnlyEvalDataset) 37 | from .violin import ( 38 | ViolinDataset, violin_collate, 39 | ViolinEvalDataset, violin_eval_collate, 40 | ViolinVideoOnlyDataset, ViolinVideoOnlyEvalDataset, 41 | ViolinSubOnlyDataset, ViolinSubOnlyEvalDataset) 42 | from .fom import ( 43 | FomDataset, fom_collate, 44 | FomEvalDataset, fom_eval_collate) 45 | from .vsm import VsmDataset, vsm_collate 46 | from .mlm import ( 47 | VideoMlmDataset, mlm_collate) 48 | from .mfm import MfmDataset, mfm_collate 49 | from .videoCap import (VideoCapTrainDataset, VideoCapValDataset, 50 | CaptionTokLmdb, 51 | VideoCapEvalDataset, 52 | VideoCapVideoOnlyTrainDataset, 53 | VideoCapVideoOnlyValDataset, 54 | VideoCapVideoOnlyEvalDataset, 55 | VideoCapSubOnlyTrainDataset, 56 | VideoCapSubOnlyValDataset, 57 | VideoCapSubOnlyEvalDataset) 58 | from .tvc import ( 59 | TvcTrainDataset, TvcValDataset, TvcTokLmdb, 60 | TvcEvalDataset, 61 | TvcVideoOnlyValDataset, TvcVideoOnlyTrainDataset, 62 | TvcVideoOnlyEvalDataset, 63 | TvcSubOnlyValDataset, TvcSubOnlyTrainDataset, 64 | TvcSubOnlyEvalDataset) 65 | -------------------------------------------------------------------------------- /data/fom.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Microsoft Corporation. 3 | Licensed under the MIT license. 
class FomDataset(Dataset):
    """Frame Order Modeling (FOM) pretraining dataset.

    Each item pairs a video's (subtitle + visual-feature) inputs with a
    randomly shuffled frame-position order and the targets needed to
    recover the original order.

    Args:
        video_ids: list of video ids to draw samples from.
        vid_sub_db: a ``VideoFeatSubTokDataset`` holding per-video inputs.
        random_reorder_p: probability that each frame position is selected
            for shuffling (default 0.15).
    """

    def __init__(self, video_ids, vid_sub_db, random_reorder_p=0.15):
        assert isinstance(vid_sub_db, VideoFeatSubTokDataset)
        self.vid_sub_db = vid_sub_db
        # shard the video ids across workers when running multi-GPU
        if _check_ngpu() > 1:
            self.ids = video_ids[hvd.rank()::hvd.size()]
        else:
            self.ids = video_ids
        self.random_reorder_p = random_reorder_p

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, i):
        vid_ = self.ids[i]
        (f_sub_input_ids, f_v_feats, f_attn_masks,
         c_v_feats, c_attn_masks,
         num_subs, sub2frames) = self.vid_sub_db[vid_]
        # Shuffle ~random_reorder_p of the clip-level frame positions.
        # (The original code built an intermediate ``c_pos_ids`` list that
        # shadowed the index parameter ``i`` and was only used for its
        # length; the length is taken directly here.)
        n_frames = len(c_v_feats)
        orders, targets = random_reorder(
            list(range(n_frames)), self.random_reorder_p)
        orders = torch.tensor(orders, dtype=torch.long)
        targets = torch.tensor(targets, dtype=torch.long)
        video_inputs = (
            f_sub_input_ids, f_v_feats, f_attn_masks,
            c_v_feats, c_attn_masks,
            num_subs, sub2frames)
        return (video_inputs, orders, targets)
def random_reorder(pos_ids, random_reorder_p=0.15):
    """Randomly shuffle a subset of frame positions.

    Each position is selected with probability ``random_reorder_p``; the
    values at the selected positions are then permuted among themselves.

    Returns:
        (output_order, output_target): the reordered position ids, and a
        target list where ``output_target[new_pos] = original_index`` for
        every shuffled position and -1 everywhere else.
    """
    # Draw one random number per position, in order, so the RNG stream is
    # consumed identically regardless of how many positions get picked.
    chosen = [(idx, pid) for idx, pid in enumerate(pos_ids)
              if random.random() < random_reorder_p]
    shuffled_vals = [pid for _, pid in chosen]
    random.shuffle(shuffled_vals)

    output_order = list(pos_ids)
    output_target = [-1] * len(pos_ids)
    for (idx, _), new_pid in zip(chosen, shuffled_vals):
        output_order[idx] = new_pid
        output_target[new_pid] = idx
    return output_order, output_target
__getitem__(self, i): 123 | vid = self.ids[i] 124 | tensors = super().__getitem__(i) 125 | return (vid, *tensors) 126 | 127 | 128 | def fom_eval_collate(inputs): 129 | vids, batch = [], [] 130 | for id_, *tensors in inputs: 131 | vids.append(id_) 132 | batch.append(tensors) 133 | batch = fom_collate(batch) 134 | batch['vids'] = vids 135 | return batch 136 | -------------------------------------------------------------------------------- /data/mfm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Microsoft Corporation. 3 | Licensed under the MIT license. 4 | 5 | Pretrain MFM dataset 6 | 7 | copied/modified from HERO 8 | (https://github.com/linjieli222/HERO) 9 | """ 10 | import random 11 | 12 | import torch 13 | from torch.nn.utils.rnn import pad_sequence 14 | from torch.utils.data import Dataset 15 | from toolz.sandbox import unzip 16 | from cytoolz import concat 17 | import horovod.torch as hvd 18 | 19 | from .data import VideoFeatSubTokDataset, video_collate, _check_ngpu 20 | 21 | 22 | def _get_img_mask(mask_prob, num_frame): 23 | img_mask = [random.random() < mask_prob for _ in range(num_frame)] 24 | if not any(img_mask): 25 | # at least mask 1 26 | img_mask[random.choice(range(num_frame))] = True 27 | img_mask = torch.tensor(img_mask) 28 | return img_mask 29 | 30 | 31 | def _get_feat_target(img_feat, img_masks): 32 | img_masks_ext = img_masks.unsqueeze(-1).expand_as(img_feat) # (n, m, d) 33 | feat_dim = img_feat.size(-1) 34 | feat_targets = img_feat[img_masks_ext].contiguous().view( 35 | -1, feat_dim) # (s, d) 36 | return feat_targets 37 | 38 | 39 | def _mask_img_feat(img_feat, img_masks): 40 | img_masks_ext = img_masks.unsqueeze(-1).expand_as(img_feat) 41 | img_feat_masked = img_feat.data.masked_fill(img_masks_ext, 0) 42 | return img_feat_masked 43 | 44 | 45 | class MfmDataset(Dataset): 46 | def __init__(self, video_ids, vid_sub_db, mask_prob=0.15): 47 | assert isinstance(vid_sub_db, 
def mfm_collate(inputs):
    """Collate MFM samples into a batch.

    Runs the standard video collate, then pads the per-sample frame masks
    and applies them: masked positions are zeroed in both frame-level and
    clip-level features, while regression targets are gathered from the
    clip-level features *before* masking.
    """
    vid_inputs, per_sub_masks, clip_masks = map(list, unzip(inputs))
    batch = video_collate(vid_inputs)

    # pad the (flattened) frame-level and clip-level masks to batch shape
    padded_f_masks = pad_sequence(list(concat(per_sub_masks)),
                                  batch_first=True, padding_value=0)
    padded_c_masks = pad_sequence(clip_masks,
                                  batch_first=True, padding_value=0)

    # targets must come from the unmasked clip-level features
    clip_feats = batch['c_v_feats']
    targets = _get_feat_target(clip_feats, padded_c_masks)

    batch.update({
        'f_v_feats': _mask_img_feat(batch['f_v_feats'], padded_f_masks),
        'f_v_masks': padded_f_masks,
        'c_v_feats': _mask_img_feat(clip_feats, padded_c_masks),
        'c_v_masks': padded_c_masks,
        'feat_targets': targets,
    })
    return batch
class VlepDataset(VideoQaDataset):
    # VLEP (Video-and-Language Event Prediction) dataset.  Reuses the
    # VideoQA machinery: each query holds multiple answer candidates, and
    # every candidate is paired with the full video inputs.

    def __getitem__(self, i):
        """Return all QA-augmented video inputs for the i-th video.

        Builds, for every query on this video and every answer candidate,
        a copy of the frame-level inputs with the [SEP]+candidate tokens
        appended to each frame's subtitle tokens.

        Returns a 6-tuple of parallel lists (one entry per candidate for
        the first three, one per query for the last three):
        (all_video_qa_inputs, all_qa_input_ids, all_qa_attn_masks,
         all_vids, all_targets, all_ts_targets).
        """
        vid, qids = self.getids(i)
        video_inputs = self.video_db.__getitem__(vid)
        (frame_level_input_ids, frame_level_v_feats,
         frame_level_attn_masks, frame_level_sub_attn_masks,
         clip_level_v_feats, clip_level_attn_masks, num_subs,
         sub_idx2frame_idx) = video_inputs
        nframes = len(clip_level_v_feats)

        all_vids = []
        all_targets = []
        all_ts_targets = []
        all_qa_input_ids = []
        all_qa_attn_masks = []
        all_video_qa_inputs = []
        for qid in qids:
            example = self.query_db[qid]
            # -1 marks a missing label (e.g. hidden test-split answers)
            if example['target'] is not None:
                target = torch.LongTensor([example['target']])
            else:
                target = torch.LongTensor([-1])
            # temporal grounding: map the (start, end) timestamp to frame
            # indices; (-1, -1) when no timestamp is annotated
            if example['ts'] is not None:
                st_idx, ed_idx = self.get_st_ed_label(
                    example['ts'], max_idx=nframes-1)
                ts_target = torch.LongTensor(
                    [st_idx, ed_idx])
            else:
                ts_target = torch.LongTensor([-1, -1])

            # one token sequence per answer candidate
            # (presumably each is an event-continuation option — the
            # target above indexes into this list; verify against
            # the query db builder)
            input_ids = example["input_ids"]
            for a_input_ids in input_ids:
                f_sub_qa_input_ids = []
                f_sub_qa_attn_masks = []
                sub_qa_attn_masks = []
                # prepend [SEP] so the candidate is delimited from the
                # subtitle tokens it gets concatenated onto below
                curr_qa_input_id = torch.tensor(
                    [self.query_db.sep] + a_input_ids)
                curr_qa_attn_masks = torch.tensor([1]*len(curr_qa_input_id))
                all_qa_input_ids.append(curr_qa_input_id)
                all_qa_attn_masks.append(curr_qa_attn_masks)
                # append the candidate tokens/masks to every frame-level
                # subtitle sequence of this video
                for f_sub_input_ids, f_attn_masks, sub_attn_masks in zip(
                        frame_level_input_ids, frame_level_attn_masks,
                        frame_level_sub_attn_masks):
                    curr_f_sub_qa_input_ids = torch.cat((
                        f_sub_input_ids, curr_qa_input_id))
                    curr_f_sub_qa_attn_masks = torch.cat((
                        f_attn_masks, curr_qa_attn_masks))
                    curr_sub_qa_attn_masks = torch.cat(
                        (sub_attn_masks, curr_qa_attn_masks))
                    f_sub_qa_input_ids.append(curr_f_sub_qa_input_ids)
                    f_sub_qa_attn_masks.append(curr_f_sub_qa_attn_masks)
                    sub_qa_attn_masks.append(curr_sub_qa_attn_masks)
                # NOTE: visual features and clip-level inputs are shared
                # (not copied) across candidates
                curr_video_qa_inputs = (
                    f_sub_qa_input_ids, frame_level_v_feats,
                    f_sub_qa_attn_masks, sub_qa_attn_masks,
                    clip_level_v_feats, clip_level_attn_masks, num_subs,
                    sub_idx2frame_idx)
                all_video_qa_inputs.append(curr_video_qa_inputs)
            all_vids.append(vid)
            all_targets.append(target)
            all_ts_targets.append(ts_target)
        out = (all_video_qa_inputs, all_qa_input_ids, all_qa_attn_masks,
               all_vids, all_targets, all_ts_targets)
        return out
qids, outs 120 | -------------------------------------------------------------------------------- /eval/pycocoevalcap/README.md: -------------------------------------------------------------------------------- 1 | # coco-caption 2 | 3 | Original README can be found at [tylin/coco-caption](https://github.com/tylin/coco-caption/blob/3f0fe9b819c0ea881a56441e4de1146924a394eb/README.md). 4 | 5 | ## License 6 | 7 | All files in the pycocoevalcap directory are under 8 | [BSD 2-clause "Simplified" License](https://github.com/tylin/coco-caption/blob/3f0fe9b819c0ea881a56441e4de1146924a394eb/license.txt) 9 | -------------------------------------------------------------------------------- /eval/pycocoevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /eval/pycocoevalcap/bleu/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
class Bleu:
    """Wrapper around BleuScorer computing corpus/sentence BLEU-1..n."""

    def __init__(self, n=4):
        # by default, compute BLEU up to 4-grams
        self._n = n
        self._hypo_for_image = {}
        self.ref_for_image = {}

    def compute_score(self, gts, res):
        """Score hypotheses in `res` against references in `gts`.

        Both are dicts keyed by image id; each hypothesis list must hold
        exactly one sentence, each reference list at least one.
        """
        assert(gts.keys() == res.keys())

        bleu_scorer = BleuScorer(n=self._n)
        for img_id in gts.keys():
            hypo = res[img_id]
            ref = gts[img_id]

            # sanity checks on input shapes
            assert(type(hypo) is list)
            assert(len(hypo) == 1)
            assert(type(ref) is list)
            assert(len(ref) >= 1)

            bleu_scorer += (hypo[0], ref)

        # 'closest' reference-length policy (alternatives: shortest/average)
        score, scores = bleu_scorer.compute_score(option='closest', verbose=0)

        # (bleu, bleu_info)
        return score, scores

    def method(self):
        return "Bleu"
cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) 36 | 37 | for id in imgIds: 38 | hypo = res[id] 39 | ref = gts[id] 40 | 41 | # Sanity check. 42 | assert(type(hypo) is list) 43 | assert(len(hypo) == 1) 44 | assert(type(ref) is list) 45 | assert(len(ref) > 0) 46 | 47 | cider_scorer += (hypo[0], ref) 48 | 49 | (score, scores) = cider_scorer.compute_score() 50 | 51 | return score, scores 52 | 53 | def method(self): 54 | return "CIDEr" 55 | -------------------------------------------------------------------------------- /eval/pycocoevalcap/license.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. 27 | -------------------------------------------------------------------------------- /eval/pycocoevalcap/meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /eval/pycocoevalcap/meteor/tests/test_meteor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | import unittest 5 | 6 | from nlgeval.pycocoevalcap.meteor.meteor import Meteor 7 | 8 | 9 | class TestMeteor(unittest.TestCase): 10 | def test_compute_score(self): 11 | m = Meteor() 12 | 13 | s = m.compute_score({0: ["test"]}, {0: ["test"]}) 14 | self.assertEqual(s, (1.0, [1.0])) 15 | 16 | s = m.compute_score({0: ["テスト"]}, {0: ["テスト"]}) 17 | self.assertEqual(s, (1.0, [1.0])) 18 | -------------------------------------------------------------------------------- /eval/pycocoevalcap/rouge/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- 
def my_lcs(string, sub):
    """
    Calculates the length of the longest common subsequence for a pair of
    tokenized strings
    :param string : list of str : tokens from a string split using whitespace
    :param sub : list of str : shorter string, also split using whitespace
    :returns: int : length of the longest common subsequence between the
        two strings

    Note: my_lcs only gives the length of the LCS, not the actual LCS
    """
    # ensure `sub` is the shorter sequence (inner dimension of the DP table)
    if len(string) < len(sub):
        sub, string = string, sub

    lengths = [[0] * (len(sub) + 1) for _ in range(len(string) + 1)]

    # standard O(len(string) * len(sub)) LCS dynamic program
    for j in range(1, len(sub) + 1):
        for i in range(1, len(string) + 1):
            if string[i - 1] == sub[j - 1]:
                lengths[i][j] = lengths[i - 1][j - 1] + 1
            else:
                lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1])

    return lengths[len(string)][len(sub)]


class Rouge():
    '''
    Class for computing ROUGE-L score for a set of candidate sentences
    for the MS COCO test set
    '''
    def __init__(self):
        # vrama91: updated the value below based on discussion with Hovey
        self.beta = 1.2

    def calc_score(self, candidate, refs):
        """
        Compute ROUGE-L score given one candidate and references for an image
        :param candidate: list with a single candidate sentence (str)
        :param refs: list of str : reference sentences for the image
        :returns score: float (ROUGE-L of the candidate against references)
        """
        assert(len(candidate) == 1)
        assert(len(refs) > 0)
        prec = []
        rec = []

        # split into tokens
        token_c = candidate[0].split(" ")

        for reference in refs:
            # split into tokens
            token_r = reference.split(" ")
            # longest common subsequence drives both precision and recall
            lcs = my_lcs(token_r, token_c)
            prec.append(lcs / float(len(token_c)))
            rec.append(lcs / float(len(token_r)))

        prec_max = max(prec)
        rec_max = max(rec)

        # F-measure with recall weighted by beta^2 (ROUGE-L definition)
        if prec_max != 0 and rec_max != 0:
            score = ((1 + self.beta**2) * prec_max * rec_max) / \
                float(rec_max + self.beta**2 * prec_max)
        else:
            score = 0.0
        return score

    def compute_score(self, gts, res):
        """
        Computes Rouge-L score given a set of reference and candidate
        sentences for the dataset
        :param gts: dict : reference sentences keyed by image id
        :param res: dict : candidate sentences keyed by image id
        :returns: (corpus mean ROUGE-L, per-image score array)
        """
        assert(gts.keys() == res.keys())

        score = []
        for id in gts.keys():
            hypo = res[id]
            ref = gts[id]

            # FIX: sanity checks moved inside the loop -- previously they sat
            # after the loop and only validated the final (hypo, ref) pair
            assert(type(hypo) is list)
            assert(len(hypo) == 1)
            assert(type(ref) is list)
            assert(len(ref) > 0)

            score.append(self.calc_score(hypo, ref))

        average_score = np.mean(np.array(score))
        return average_score, np.array(score)

    def method(self):
        return "Rouge"
k, v in captions_for_image.items() for c in v]) 37 | 38 | # ====================================================== 39 | # save sentences to temporary file 40 | # ====================================================== 41 | path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__)) 42 | tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname) 43 | tmp_file.write(sentences.encode()) 44 | tmp_file.close() 45 | 46 | # ====================================================== 47 | # tokenize sentence 48 | # ====================================================== 49 | cmd.append(os.path.basename(tmp_file.name)) 50 | p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \ 51 | stdout=subprocess.PIPE) 52 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0] 53 | token_lines = token_lines.decode() 54 | lines = token_lines.split('\n') 55 | # remove temp file 56 | os.remove(tmp_file.name) 57 | 58 | # ====================================================== 59 | # create dictionary for tokenized captions 60 | # ====================================================== 61 | for k, line in zip(image_id, lines): 62 | if not k in final_tokenized_captions_for_image: 63 | final_tokenized_captions_for_image[k] = [] 64 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \ 65 | if w not in PUNCTUATIONS]) 66 | final_tokenized_captions_for_image[k].append(tokenized_caption) 67 | 68 | return final_tokenized_captions_for_image 69 | -------------------------------------------------------------------------------- /eval/tvc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Microsoft Corporation. 3 | Licensed under the MIT license. 
class TvcEval(object):
    """ preload evaluation tools and references for repeated evaluation """
    def __init__(self, ref_path):
        """
        Args:
            ref_path: path to a jsonl file where each line holds one clip
                with a 'clip_id' and a list of reference 'descs'
        """
        self.tokenizer = PTBTokenizer()
        # FIX: close the reference file deterministically (the original
        # passed a bare open(ref_path) and leaked the handle)
        with open(ref_path) as ref_f:
            id2refs = {ex['clip_id']: [_remove_nonascii(cap['desc'].strip())
                                       for cap in ex['descs']]
                       for ex in map(json.loads, ref_f)}
        self.id2refs = self.tokenizer.tokenize(id2refs)
        self.scorers = []
        self.scorers.append((Bleu(4),
                             ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]))
        self.scorers.append((Meteor(), "METEOR"))
        self.scorers.append((Rouge(), "ROUGE_L"))
        self.scorers.append((Cider(), "CIDEr"))

    def __call__(self, json_res):
        """ corpus level metrics, take list of results """
        # one hypothesis (first desc) per clip
        id2hyps = {
            res['clip_id']: [_remove_nonascii(res['descs'][0]['desc'].strip())]
            for res in json_res
        }
        id2hyps = self.tokenizer.tokenize(id2hyps)
        assert len(id2hyps) == len(self.id2refs)

        ret_scores = {}
        for scorer, method in self.scorers:
            print(f"Computing {method} score...")
            # each scorer returns (corpus_score, per_clip_scores)
            score, _ = scorer.compute_score(self.id2refs, id2hyps)
            if isinstance(method, list):
                # BLEU yields one corpus score per n-gram order
                for sc, m in zip(score, method):
                    ret_scores[m] = sc * 100
            else:
                ret_scores[method] = score * 100

        return ret_scores
class Vatex_en_c_Eval(object):
    """ preload evaluation tools and references for repeated evaluation """
    def __init__(self, ref_path):
        """
        Args:
            ref_path: path to a jsonl file where each line holds one clip
                with a 'clip_id' and a list of reference 'descs'
        """
        self.tokenizer = PTBTokenizer()
        # FIX: close the reference file deterministically (the original
        # passed a bare open(ref_path) and leaked the handle)
        with open(ref_path) as ref_f:
            id2refs = {ex['clip_id']: [_remove_nonascii(cap['desc'].strip())
                                       for cap in ex['descs']]
                       for ex in map(json.loads, ref_f)}
        self.id2refs = self.tokenizer.tokenize(id2refs)
        self.scorers = []
        self.scorers.append((Bleu(4),
                             ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]))
        self.scorers.append((Meteor(), "METEOR"))
        self.scorers.append((Rouge(), "ROUGE_L"))
        self.scorers.append((Cider(), "CIDEr"))

    def __call__(self, json_res):
        """ corpus level metrics, take list of results """
        # one hypothesis (first desc) per clip
        id2hyps = {
            res['clip_id']:
            [_remove_nonascii(res['descs'][0]['desc'].strip())]
            for res in json_res}
        id2hyps = self.tokenizer.tokenize(id2hyps)
        assert len(id2hyps) == len(self.id2refs)

        ret_scores = {}
        for scorer, method in self.scorers:
            print(f"Computing {method} score...")
            # each scorer returns (corpus_score, per_clip_scores)
            score, _ = scorer.compute_score(self.id2refs, id2hyps)
            if isinstance(method, list):
                # BLEU yields one corpus score per n-gram order
                for sc, m in zip(score, method):
                    ret_scores[m] = sc * 100
            else:
                ret_scores[method] = score * 100

        return ret_scores
"""
Copyright (c) Microsoft Corporation.
Licensed under the MIT license.

run evaluation of YC2C or inference for submission
generate prediction from JSON file

copied/modified from HERO
(https://github.com/linjieli222/HERO)
"""
import argparse
import json
import os

from horovod import torch as hvd
from transformers import RobertaTokenizer

from model.videoCap import VideoCapGenerator
from eval.yc2c import Yc2cEval
from utils.distributed import all_gather_list
from utils.basic_utils import save_jsonl

from inf_tvc import load_model
from inf_vatex_en_c import load_inf_data, decode


def main(opts):
    hvd.init()
    # rank 0 downloads/caches the tokenizer first; the other ranks block on
    # the all_gather barrier, then load from the shared cache (avoids
    # concurrent downloads of the same files)
    if hvd.rank() == 0:
        toker = RobertaTokenizer.from_pretrained('roberta-base')
        all_gather_list(None)
    else:
        all_gather_list(None)
        toker = RobertaTokenizer.from_pretrained('roberta-base')

    # RoBERTa BOS/EOS special tokens for the caption generator.
    # NOTE(review): the checked-in source showed empty strings here -- almost
    # certainly '<s>'/'</s>' with the angle brackets stripped by an HTML
    # rendering step; confirm against upstream HERO.
    bos = toker.convert_tokens_to_ids(['<s>'])[0]
    eos = toker.convert_tokens_to_ids(['</s>'])[0]

    model_opts, model = load_model(opts.model_dir, opts.ckpt_step, opts)
    loader = load_inf_data(opts, model_opts, mode="video_sub")
    model.eval()
    generator = VideoCapGenerator(
        model, opts.max_gen_step, bos, eos, not opts.no_fp16)
    results = decode(loader, generator, toker)
    # FIX: `import os` moved to the top-level import block
    output_path = os.path.join(opts.model_dir, opts.output)
    save_jsonl(results, output_path)

    # evaluate score if possible (only when the reference file carries
    # ground-truth 'descs'; e.g. not for hidden test-set submissions)
    if (hvd.rank() == 0
            and 'descs' in json.loads(next(iter(open(opts.target_clip))))):
        evaluator = Yc2cEval(opts.target_clip)
        score = evaluator(results)
        print(score)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--sub_txt_db",
                        default="/txt/yc2_subtitles.db",
                        type=str,
                        help="The input video subtitle corpus. (LMDB)")
    parser.add_argument("--vfeat_db",
                        default="/video/yc2", type=str,
                        help="The input video frame features.")
    parser.add_argument("--model_dir", required=True, type=str,
                        help="dir root to trained model")
    parser.add_argument("--ckpt_step", required=True, type=int,
                        help="checkpoint step")
    parser.add_argument("--output", type=str, required=True,
                        help="output file name")

    parser.add_argument("--batch_size", default=16, type=int,
                        help="validation batch size (per GPU)")
    parser.add_argument("--max_gen_step", default=30, type=int,
                        help="max generation steps")

    parser.add_argument('--n_workers', type=int, default=4,
                        help="number of data workers")
    parser.add_argument('--no_pin_mem', action='store_true',
                        help="disable pin memory")
    parser.add_argument("--no_fp16", action='store_true',
                        help="disable fp16")

    parser.add_argument("--target_clip", required=True, type=str,
                        help="jsonl annotation")

    args = parser.parse_args()

    main(args)
# Modified from UNITER
# (https://github.com/ChenRocks/UNITER)
#
# Usage: launch_container.sh TXT_DB VID_DIR OUTPUT PRETRAIN_DIR [--prepro]

TXT_DB=$1
VID_DIR=$2
OUTPUT=$3
PRETRAIN_DIR=$4

# FIX: quote the variable -- unquoted, an unset/spaced value makes the
# `[ -z ... ]` test fragile (it only worked by accident when unset)
if [ -z "$CUDA_VISIBLE_DEVICES" ]; then
    CUDA_VISIBLE_DEVICES='all'
fi

# pass --prepro as the 5th argument to mount the text DB read-write
# (needed when running preprocessing); otherwise mount it read-only
if [ "$5" = "--prepro" ]; then
    RO=""
else
    RO=",readonly"
fi

docker run --gpus '"'device=$CUDA_VISIBLE_DEVICES'"' --ipc=host --network=host --rm -it \
    --mount src=$(pwd),dst=/src,type=bind \
    --mount src=$OUTPUT,dst=/storage,type=bind \
    --mount src=$PRETRAIN_DIR,dst=/pretrain,type=bind,readonly \
    --mount src=$TXT_DB,dst=/txt,type=bind$RO \
    --mount src=$VID_DIR,dst=/video,type=bind,readonly \
    -e NVIDIA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
    -w /src linjieli222/hero
class HeroForVcmr(HeroForPretraining):
    """HERO for Video Corpus Moment Retrieval (TVR, How2R).

    Thin dispatch layer over the pretraining model: both 'vcmr' and 'vr'
    are routed to the parent's 'vsm' (video-subtitle matching) task, which
    is assumed to return (q2video_scores, loss_neg_ctx/st, loss_neg_q/ed)
    style triples -- see HeroForPretraining for the exact contract.
    """

    def __init__(self, config, vfeat_dim, max_frm_seq_len,
                 conv_stride=1, conv_kernel_size=5,
                 ranking_loss_type="hinge", margin=0.1,
                 lw_neg_ctx=0, lw_neg_q=0, lw_st_ed=0.01, drop_svmr_prob=0,
                 use_hard_negative=False, hard_pool_size=20,
                 hard_neg_weight=10, use_all_neg=True):
        super(HeroForVcmr, self).__init__(
            config, vfeat_dim, max_frm_seq_len,
            conv_stride, conv_kernel_size,
            ranking_loss_type, margin,
            lw_neg_ctx, lw_neg_q, lw_st_ed, drop_svmr_prob,
            use_hard_negative, hard_pool_size,
            hard_neg_weight, use_all_neg)
        # at least one of the three loss terms must be active
        assert lw_st_ed > 0 or lw_neg_ctx > 0 or lw_neg_q > 0

    def forward(self, batch, task='vcmr', compute_loss=True):
        if task == "vcmr":
            # moment retrieval == the pretraining VSM task
            return super(HeroForVcmr, self).forward(
                batch, task='vsm', compute_loss=compute_loss)
        elif task == "vr":
            if compute_loss:
                # video retrieval ignores the span (st/ed) loss; the first
                # element is zeroed out but kept for a uniform return shape
                _, loss_neg_ctx, loss_neg_q = super(HeroForVcmr, self).forward(
                    batch, task='vsm', compute_loss=True)
                return torch.zeros_like(loss_neg_ctx), loss_neg_ctx, loss_neg_q
            else:
                q2video_scores, _, _ = super(HeroForVcmr, self).forward(
                    batch, task='vsm', compute_loss=False)
                return q2video_scores
        else:
            raise ValueError(f'Unrecognized task {task}')

    def get_pred_from_raw_query(self, frame_embeddings, c_attn_masks,
                                query_input_ids, query_pos_ids,
                                query_attn_masks, cross=False,
                                val_gather_gpus=False):
        """Encode a raw query, then produce span probabilities and/or
        video-level scores depending on which loss weights are active.

        Returns (q2video_scores, st_prob, ed_prob); entries are None when
        the corresponding loss weight is 0.
        """
        modularized_query = self.encode_txt_inputs(
            query_input_ids, query_pos_ids,
            query_attn_masks, attn_layer=self.q_feat_attn,
            normalized=False)
        if self.lw_st_ed != 0:
            # start/end frame probabilities for moment localization
            st_prob, ed_prob = self.get_pred_from_mod_query(
                frame_embeddings, c_attn_masks,
                modularized_query, cross=cross)
        else:
            st_prob, ed_prob = None, None

        if self.lw_neg_ctx != 0 or self.lw_neg_q != 0:
            # query-vs-video ranking scores (optionally gathered across GPUs)
            q2video_scores = self.get_video_level_scores(
                modularized_query, frame_embeddings, c_attn_masks,
                val_gather_gpus)
        else:
            q2video_scores = None
        return q2video_scores, st_prob, ed_prob
super(HeroForVr, self).forward( 45 | batch, task='tvr', compute_loss=False) 46 | return q2video_scores 47 | else: 48 | raise ValueError(f'Unrecognized task {task}') 49 | 50 | def get_pred_from_raw_query(self, frame_embeddings, c_attn_masks, 51 | query_input_ids, query_pos_ids, 52 | query_attn_masks, cross=False, 53 | val_gather_gpus=False): 54 | modularized_query = self.encode_txt_inputs( 55 | query_input_ids, query_pos_ids, 56 | query_attn_masks, attn_layer=self.q_feat_attn, 57 | normalized=False) 58 | 59 | q2video_scores = self.get_video_level_scores( 60 | modularized_query, frame_embeddings, c_attn_masks, 61 | val_gather_gpus) 62 | return q2video_scores 63 | -------------------------------------------------------------------------------- /optim/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Microsoft Corporation. 3 | Licensed under the MIT license. 4 | 5 | Copied from UNITER 6 | (https://github.com/ChenRocks/UNITER) 7 | """ 8 | from .sched import noam_schedule, warmup_linear, vqa_schedule, get_lr_sched 9 | from .adamw import AdamW 10 | -------------------------------------------------------------------------------- /optim/adamw.py: -------------------------------------------------------------------------------- 1 | """ 2 | AdamW optimizer (weight decay fix) 3 | originally from hugginface (https://github.com/huggingface/transformers). 4 | 5 | Copied from UNITER 6 | (https://github.com/ChenRocks/UNITER) 7 | """ 8 | import math 9 | 10 | import torch 11 | from torch.optim import Optimizer 12 | 13 | 14 | class AdamW(Optimizer): 15 | """ Implements Adam algorithm with weight decay fix. 16 | Parameters: 17 | lr (float): learning rate. Default 1e-3. 18 | betas (tuple of 2 floats): Adams beta parameters (b1, b2). 19 | Default: (0.9, 0.999) 20 | eps (float): Adams epsilon. Default: 1e-6 21 | weight_decay (float): Weight decay. 
Default: 0.0 22 | correct_bias (bool): can be set to False to avoid correcting bias 23 | in Adam (e.g. like in Bert TF repository). Default True. 24 | """ 25 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, 26 | weight_decay=0.0, correct_bias=True): 27 | if lr < 0.0: 28 | raise ValueError( 29 | "Invalid learning rate: {} - should be >= 0.0".format(lr)) 30 | if not 0.0 <= betas[0] < 1.0: 31 | raise ValueError("Invalid beta parameter: {} - " 32 | "should be in [0.0, 1.0[".format(betas[0])) 33 | if not 0.0 <= betas[1] < 1.0: 34 | raise ValueError("Invalid beta parameter: {} - " 35 | "should be in [0.0, 1.0[".format(betas[1])) 36 | if not 0.0 <= eps: 37 | raise ValueError("Invalid epsilon value: {} - " 38 | "should be >= 0.0".format(eps)) 39 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, 40 | correct_bias=correct_bias) 41 | super(AdamW, self).__init__(params, defaults) 42 | 43 | def step(self, closure=None): 44 | """Performs a single optimization step. 45 | Arguments: 46 | closure (callable, optional): A closure that reevaluates the model 47 | and returns the loss. 
48 | """ 49 | loss = None 50 | if closure is not None: 51 | loss = closure() 52 | 53 | for group in self.param_groups: 54 | for p in group['params']: 55 | if p.grad is None: 56 | continue 57 | grad = p.grad.data 58 | if grad.is_sparse: 59 | raise RuntimeError( 60 | 'Adam does not support sparse ' 61 | 'gradients, please consider SparseAdam instead') 62 | 63 | state = self.state[p] 64 | 65 | # State initialization 66 | if len(state) == 0: 67 | state['step'] = 0 68 | # Exponential moving average of gradient values 69 | state['exp_avg'] = torch.zeros_like(p.data) 70 | # Exponential moving average of squared gradient values 71 | state['exp_avg_sq'] = torch.zeros_like(p.data) 72 | 73 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 74 | beta1, beta2 = group['betas'] 75 | 76 | state['step'] += 1 77 | 78 | # Decay the first and second moment running average coefficient 79 | # In-place operations to update the averages at the same time 80 | exp_avg.mul_(beta1).add_(1.0 - beta1, grad) 81 | exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad) 82 | denom = exp_avg_sq.sqrt().add_(group['eps']) 83 | 84 | step_size = group['lr'] 85 | if group['correct_bias']: # No bias correction for Bert 86 | bias_correction1 = 1.0 - beta1 ** state['step'] 87 | bias_correction2 = 1.0 - beta2 ** state['step'] 88 | step_size = (step_size * math.sqrt(bias_correction2) 89 | / bias_correction1) 90 | 91 | p.data.addcdiv_(-step_size, exp_avg, denom) 92 | 93 | # Just adding the square of the weights to the loss function is 94 | # *not* the correct way of using L2 regularization/weight decay 95 | # with Adam, since that will interact with the m and v 96 | # parameters in strange ways. 97 | # 98 | # Instead we want to decay the weights in a manner that doesn't 99 | # interact with the m/v parameters. This is equivalent to 100 | # adding the square of the weights to the loss with plain 101 | # (non-momentum) SGD. 
def build_optimizer(model, opts):
    """Build the fine-tuning optimizer with four parameter groups.

    Parameters under `v_encoder` (the backbone) use the base learning rate
    `opts.learning_rate`; every other trainable parameter ("top" layers)
    uses `opts.lr_mul * opts.learning_rate`. Within each of those two sets,
    bias and LayerNorm parameters are exempt from weight decay.

    Raises:
        ValueError: if `opts.optim` is not one of adam/adamax/adamw.
    """
    no_decay = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')

    def _decayed(name):
        # weight decay is skipped for biases and LayerNorm parameters
        return not any(nd in name for nd in no_decay)

    trainable = [(n, p) for n, p in model.named_parameters()
                 if p.requires_grad]
    # backbone (v_encoder) keeps the base lr; top layers get a larger lr
    backbone = [(n, p) for n, p in trainable if 'v_encoder' in n]
    top = [(n, p) for n, p in trainable if 'v_encoder' not in n]

    top_lr = opts.lr_mul * opts.learning_rate
    optimizer_grouped_parameters = [
        {'params': [p for n, p in top if _decayed(n)],
         'lr': top_lr,
         'weight_decay': opts.weight_decay},
        {'params': [p for n, p in top if not _decayed(n)],
         'lr': top_lr,
         'weight_decay': 0.0},
        {'params': [p for n, p in backbone if _decayed(n)],
         'weight_decay': opts.weight_decay},
        {'params': [p for n, p in backbone if not _decayed(n)],
         'weight_decay': 0.0},
    ]

    # currently Adam only
    if opts.optim == 'adam':
        OptimCls = Adam
    elif opts.optim == 'adamax':
        OptimCls = Adamax
    elif opts.optim == 'adamw':
        OptimCls = AdamW
    else:
        raise ValueError('invalid optimizer')
    optimizer = OptimCls(optimizer_grouped_parameters,
                         lr=opts.learning_rate, betas=opts.betas)
    return optimizer
def noam_schedule(step, warmup_step=4000):
    """LR multiplier from the original Transformer paper: linear warmup
    followed by inverse-square-root decay."""
    if step > warmup_step:
        return (warmup_step ** 0.5) * (step ** -0.5)
    return step / warmup_step


def warmup_linear(step, warmup_step, tot_step):
    """BERT schedule: linear warmup to 1, then linear decay to 0 at
    `tot_step` (clamped at 0 afterwards)."""
    if step < warmup_step:
        return step / warmup_step
    remaining = (tot_step - step) / (tot_step - warmup_step)
    return max(0, remaining)


def vqa_schedule(step, warmup_interval, decay_interval,
                 decay_start, decay_rate):
    """VQA schedule from MCAN: staircase warmup at 1/4, 2/4, 3/4, then a
    multiplier of 1 until `decay_start`, then exponential decay every
    `decay_interval` steps."""
    for k in (1, 2, 3):
        if step < k * warmup_interval:
            return k / 4
    if step >= decay_start:
        num_decay = ceil((step - decay_start) / decay_interval)
        return decay_rate ** num_decay
    return 1


def get_lr_sched(global_step, opts):
    """Current learning rate under the BERT warmup-linear schedule,
    floored at 1e-8 so the optimizer never sees a non-positive lr."""
    mul = warmup_linear(global_step, opts.warmup_steps, opts.num_train_steps)
    lr_this_step = opts.learning_rate * mul
    return lr_this_step if lr_this_step > 0 else 1e-8
# released feature .tar filename: 'resnet' 'slowfast' 'mil-nce-s3d' 'clip-vit'
FEAT_DIR = {
    "resnet": "resnet",
    "slowfast": "slowfast",
    "mil-nce": "mil-nce-s3d",
    "clip-vit": "clip-vit"}


@curry
def load_npz(dir_3d, dir_2d, f_3d):
    """Probe one 3d-feature .npz file and its matching 2d-feature file.

    Returns (vid, frame_len, f_3d, f_2d, folder_name); a corrupted or
    missing file is reported with its path replaced by "" and the usable
    frame length is the min of the two feature lengths.
    """
    vid = f_3d.split("/")[-1].split(".npz")[0]
    folder_name = f_3d.split("/")[-2]
    # the 2d file mirrors the 3d file's relative path under dir_2d
    f_2d = f_3d.replace(dir_3d, dir_2d)
    try:
        feature_3d = np.load(f_3d, allow_pickle=True)
        feat_len_3d = max(0, len(feature_3d["features"]))
    except Exception:
        feat_len_3d = 0
    feat_len_2d = 0
    if feat_len_3d == 0:
        f_3d = ""
        print(f"Corrupted {dir_3d.split('/')[-1]} feature for {vid}")
    if not os.path.exists(f_2d):
        f_2d = ""
        print(f"{dir_2d.split('/')[-1]} files for {vid} does not exists")
    else:
        try:
            feature_2d = np.load(f_2d, allow_pickle=True)
            feat_len_2d = len(feature_2d["features"])
        except Exception:
            feat_len_2d = 0
            f_2d = ""
            print(f"Corrupted {dir_2d.split('/')[-1]} files for {vid}")
    frame_len = min(feat_len_3d, feat_len_2d)
    return vid, frame_len, f_3d, f_2d, folder_name


def main(opts):
    """Walk the 3d feature dir, pair each file with its 2d counterpart in
    parallel, and dump a {vid: (frame_len, f_3d, f_2d, folder)} pickle plus
    pickles listing any failed files."""
    name_2d, name_3d = opts.feat_version.split("_")
    dir_3d = os.path.join(opts.feature_dir, FEAT_DIR[name_3d])
    # FIX: was FEAT_DIR[name_3d] (copy-paste) — with dir_2d == dir_3d the
    # `f_3d.replace(dir_3d, dir_2d)` in load_npz is a no-op and the 2d
    # features are read from the 3d directory.
    dir_2d = os.path.join(opts.feature_dir, FEAT_DIR[name_2d])
    failed_2d_files = []
    failed_3d_files = []
    loaded_file = []
    for root, dirs, curr_files in os.walk(f'{dir_3d}/'):
        for f in curr_files:
            if f.endswith('.npz'):
                f_3d = os.path.join(root, f)
                loaded_file.append(f_3d)
    print(f"Found {len(loaded_file)} {name_3d} files....")
    print(f"sample loaded_file: {loaded_file[:3]}")
    failed_2d_files, failed_3d_files = [], []
    files = {}
    load = load_npz(dir_3d, dir_2d)
    with mp.Pool(opts.nproc) as pool, tqdm(total=len(loaded_file)) as pbar:
        for i, (vid, frame_len, f_3d,
                f_2d, folder_name) in enumerate(
                pool.imap_unordered(load, loaded_file, chunksize=128)):
            files[vid] = (frame_len, f_3d, f_2d, folder_name)
            if f_2d == "":
                video_file = os.path.join(folder_name, vid)
                failed_2d_files.append(video_file)
            if f_3d == "":
                video_file = os.path.join(folder_name, vid)
                failed_3d_files.append(video_file)
            pbar.update(1)
    output_dir = os.path.join(opts.output, opts.dataset)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
    pkl.dump(files, open(os.path.join(
        output_dir, f"{opts.feat_version}_info.pkl"), "wb"))
    if len(failed_3d_files):
        pkl.dump(failed_3d_files, open(os.path.join(
            output_dir, f"failed_{name_3d}_files.pkl"), "wb"))
    if len(failed_2d_files):
        pkl.dump(failed_2d_files, open(os.path.join(
            output_dir, f"failed_{name_2d}_files.pkl"), "wb"))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--feature_dir",
                        default="",
                        type=str, help="The input video feature dir.")
    parser.add_argument("--output", default=None, type=str,
                        help="output dir")
    parser.add_argument('--dataset', type=str,
                        default="")
    parser.add_argument('--feat_version', type=str,
                        choices=[
                            "resnet_slowfast", "resnet_mil-nce",
                            "clip-vit_slowfast", "clip-vit_mil-nce"],
                        default="resnet_slowfast")
    parser.add_argument('--nproc', type=int, default=10,
                        help='number of cores used')
    args = parser.parse_args()
    main(args)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Download TVR annotations/subtitles and preprocess them into LMDB text dbs
# inside the linjieli222/hero docker image.
# Usage: create_txtdb.sh TXT_DB ANN_DIR VIDEO_DB
# FIX: all variable expansions are quoted so paths containing spaces do not
# undergo word splitting.

TXT_DB=$1
ANN_DIR=$2
VIDEO_DB=$3

set -e

# annotations
DataBLOB='https://datarelease.blob.core.windows.net/value-leaderboard/tv_tasks'
TVR='https://raw.githubusercontent.com/jayleicn/TVRetrieval/master/data/'

if [ ! -d "$TXT_DB" ]; then
    mkdir -p "$TXT_DB"
fi
if [ ! -d "$ANN_DIR" ]; then
    mkdir -p "$ANN_DIR"
fi

for SPLIT in 'train' 'val'; do
    if [ ! -f "$ANN_DIR/tvr_${SPLIT}_release.jsonl" ]; then
        echo "downloading ${SPLIT} annotations..."
        wget "$TVR/tvr_${SPLIT}_release.jsonl" -O "$ANN_DIR/tvr_${SPLIT}_release.jsonl"
    fi
done
if [ ! -f "$ANN_DIR/tvr_test_release.jsonl" ]; then
    echo "downloading test annotations..."
    wget "$DataBLOB/tvr_test_release.jsonl" -O "$ANN_DIR/tvr_test_release.jsonl"
fi

for SPLIT in 'train' 'val' 'test'; do
    if [ ! -d "$TXT_DB/tvr_${SPLIT}.db" ]; then
        echo "preprocessing tvr ${SPLIT} annotations..."
        docker run --ipc=host --rm -it \
            --mount "src=$(pwd),dst=/src,type=bind" \
            --mount "src=$TXT_DB,dst=/txt_db,type=bind" \
            --mount "src=$ANN_DIR,dst=/ann,type=bind,readonly" \
            -w /src linjieli222/hero \
            python scripts/prepro_query.py --annotation "/ann/tvr_${SPLIT}_release.jsonl" \
                --output "/txt_db/tvr_${SPLIT}.db" \
                --task tvr
    fi
done


# subtitles require the video dbs (for id2nframe) to already exist
if [ ! -d "$VIDEO_DB" ]; then
    echo "Make sure you have constructed/downloaded the video dbs before processing the subtitles..."
else
    if [ ! -f "$ANN_DIR/tv_subtitles.jsonl" ]; then
        echo "downloading raw subtitle and additional annotations..."
        wget "$DataBLOB/tvr_video2dur_idx.json" -O "$ANN_DIR/vid2dur_idx.json"

        wget "$TVR/tvqa_preprocessed_subtitles.jsonl" -O "$ANN_DIR/tv_subtitles.jsonl"
    fi

    if [ ! -d "$TXT_DB/tv_subtitles.db" ]; then
        echo "preprocessing tv subtitles..."
        docker run --ipc=host --rm -it \
            --mount "src=$(pwd),dst=/src,type=bind" \
            --mount "src=$TXT_DB,dst=/txt_db,type=bind" \
            --mount "src=$ANN_DIR,dst=/ann,type=bind,readonly" \
            --mount "src=$VIDEO_DB,dst=/video_db,type=bind,readonly" \
            -w /src linjieli222/hero \
            /bin/bash -c "python scripts/prepro_sub.py --annotation /ann/tv_subtitles.jsonl --output /txt_db/tv_subtitles.db --vid2nframe /video_db/tv/id2nframe_1.5.json --frame_length 1.5; cp /ann/vid2dur_idx.json /txt_db/tv_subtitles.db/"
        echo "done"
    fi
fi
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Download everything (checkpoint + all task data) into $1.

DOWNLOAD=$1

# checkpoint
bash ./scripts/download_pretrained.sh "$DOWNLOAD"

# data
bash ./scripts/download_tvr.sh "$DOWNLOAD"
bash ./scripts/download_tvqa.sh "$DOWNLOAD"
bash ./scripts/download_tvc.sh "$DOWNLOAD"
bash ./scripts/download_how2.sh "$DOWNLOAD"
bash ./scripts/download_violin.sh "$DOWNLOAD"
bash ./scripts/download_vlep.sh "$DOWNLOAD"
bash ./scripts/download_yc2.sh "$DOWNLOAD"
# FIX: the script is named download_vatex_en.sh (download_vatex.sh does not
# exist in scripts/), so this step previously failed.
bash ./scripts/download_vatex_en.sh "$DOWNLOAD"
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Download HowTo100M (how2) video db, how2r/how2qa text dbs and the
# pretrained HERO checkpoint into $1.
# FIX: variable expansions quoted to survive paths with spaces.

DOWNLOAD=$1

for FOLDER in 'video_db' 'txt_db' 'pretrained' 'finetune'; do
    if [ ! -d "$DOWNLOAD/$FOLDER" ] ; then
        mkdir -p "$DOWNLOAD/$FOLDER"
    fi
done

BLOB='https://datarelease.blob.core.windows.net/value-leaderboard/starter_code_data'

# Use azcopy for video db downloading
if [ -f ~/azcopy/azcopy ]; then
    echo "azcopy exists, skip downloading"
else
    echo "azcopy does not exist, start downloading"
    wget -P ~/azcopy/ https://convaisharables.blob.core.windows.net/azcopy/azcopy
fi
chmod +x ~/azcopy/azcopy

# video dbs
if [ ! -d "$DOWNLOAD/video_db/how2/" ] ; then
    ~/azcopy/azcopy cp "$BLOB/video_db/how2.tar" "$DOWNLOAD/video_db/how2.tar"
    tar -xvf "$DOWNLOAD/video_db/how2.tar" -C "$DOWNLOAD/video_db"
    rm "$DOWNLOAD/video_db/how2.tar"
fi

# text dbs
if [ ! -d "$DOWNLOAD/txt_db/how2_subtitles.db/" ] ; then
    wget "$BLOB/txt_db/how2_subtitles.db.tar" -P "$DOWNLOAD/txt_db/"
    tar -xvf "$DOWNLOAD/txt_db/how2_subtitles.db.tar" -C "$DOWNLOAD/txt_db"
    rm "$DOWNLOAD/txt_db/how2_subtitles.db.tar"
fi
# how2r
for SPLIT in 'train' 'val_1k' 'test_public_1k' ; do
    if [ ! -d "$DOWNLOAD/txt_db/how2r_$SPLIT.db/" ] ; then
        wget "$BLOB/txt_db/how2r_$SPLIT.db.tar" -P "$DOWNLOAD/txt_db/"
        tar -xvf "$DOWNLOAD/txt_db/how2r_$SPLIT.db.tar" -C "$DOWNLOAD/txt_db"
        rm "$DOWNLOAD/txt_db/how2r_$SPLIT.db.tar"
    fi
done
# how2qa
for SPLIT in 'train' 'val' 'test_public' ; do
    if [ ! -d "$DOWNLOAD/txt_db/how2qa_$SPLIT.db/" ] ; then
        wget "$BLOB/txt_db/how2qa_$SPLIT.db.tar" -P "$DOWNLOAD/txt_db/"
        tar -xvf "$DOWNLOAD/txt_db/how2qa_$SPLIT.db.tar" -C "$DOWNLOAD/txt_db"
        rm "$DOWNLOAD/txt_db/how2qa_$SPLIT.db.tar"
    fi
done

HEROBLOB='https://convaisharables.blob.core.windows.net/hero'
# pretrained
if [ ! -f "$DOWNLOAD/pretrained/hero-tv-ht100.pt" ] ; then
    wget "$HEROBLOB/pretrained/hero-tv-ht100.pt" -P "$DOWNLOAD/pretrained/"
fi
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Download the pretrained HERO checkpoint and the converted RoBERTa
# initialization into $1/pretrained.
# FIX: variable expansions quoted to survive paths with spaces.

DOWNLOAD=$1

if [ ! -d "$DOWNLOAD/pretrained" ] ; then
    mkdir -p "$DOWNLOAD/pretrained"
fi

HEROBLOB='https://convaisharables.blob.core.windows.net/hero'

# This will overwrite models
wget "$HEROBLOB/pretrained/hero-tv-ht100.pt" -O "$DOWNLOAD/pretrained/hero-tv-ht100.pt"

# converted RoBERTa
if [ ! -f "$DOWNLOAD/pretrained/pretrain-tv-init.bin" ] ; then
    wget "$HEROBLOB/pretrained/pretrain-tv-init.bin" -O "$DOWNLOAD/pretrained/pretrain-tv-init.bin"
fi
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Download the TV video db, TVC text dbs, subtitles, pretrained checkpoint
# and the raw TVC jsonl files (for evaluation/inference) into $1.
# FIX: variable expansions quoted to survive paths with spaces.

DOWNLOAD=$1

for FOLDER in 'video_db' 'txt_db' 'pretrained' 'finetune'; do
    if [ ! -d "$DOWNLOAD/$FOLDER" ] ; then
        mkdir -p "$DOWNLOAD/$FOLDER"
    fi
done

BLOB='https://datarelease.blob.core.windows.net/value-leaderboard/starter_code_data'

# Use azcopy for video db downloading
if [ -f ~/azcopy/azcopy ]; then
    echo "azcopy exists, skip downloading"
else
    echo "azcopy does not exist, start downloading"
    wget -P ~/azcopy/ https://convaisharables.blob.core.windows.net/azcopy/azcopy
fi
chmod +x ~/azcopy/azcopy

# video dbs
if [ ! -d "$DOWNLOAD/video_db/tv/" ] ; then
    ~/azcopy/azcopy cp "$BLOB/video_db/tv.tar" "$DOWNLOAD/video_db/tv.tar"
    tar -xvf "$DOWNLOAD/video_db/tv.tar" -C "$DOWNLOAD/video_db"
    rm "$DOWNLOAD/video_db/tv.tar"
fi

# text dbs
for SPLIT in 'train' 'val' ; do
    if [ ! -d "$DOWNLOAD/txt_db/tvc_$SPLIT.db/" ] ; then
        wget "$BLOB/txt_db/tvc_$SPLIT.db.tar" -P "$DOWNLOAD/txt_db/"
        tar -xvf "$DOWNLOAD/txt_db/tvc_$SPLIT.db.tar" -C "$DOWNLOAD/txt_db"
        rm "$DOWNLOAD/txt_db/tvc_$SPLIT.db.tar"
    fi
done
if [ ! -d "$DOWNLOAD/txt_db/tv_subtitles.db/" ] ; then
    wget "$BLOB/txt_db/tv_subtitles.db.tar" -P "$DOWNLOAD/txt_db/"
    tar -xvf "$DOWNLOAD/txt_db/tv_subtitles.db.tar" -C "$DOWNLOAD/txt_db"
    rm "$DOWNLOAD/txt_db/tv_subtitles.db.tar"
fi

HEROBLOB='https://convaisharables.blob.core.windows.net/hero'
# pretrained
if [ ! -f "$DOWNLOAD/pretrained/hero-tv-ht100.pt" ] ; then
    wget "$HEROBLOB/pretrained/hero-tv-ht100.pt" -P "$DOWNLOAD/pretrained/"
fi

# raw_data (for evaluation and inference)
TVBLOB='https://datarelease.blob.core.windows.net/value-leaderboard/tv_tasks'
TVC='https://raw.githubusercontent.com/jayleicn/TVCaption/master/data/'

wget -nc "$TVC/tvc_val_release.jsonl" -P "$DOWNLOAD/txt_db"
wget -nc "$TVBLOB/tvc_test_release.jsonl" -P "$DOWNLOAD/txt_db"
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Download the TV video db, TVQA text dbs, subtitles and the pretrained
# checkpoint into $1.
# FIX: variable expansions quoted to survive paths with spaces.

DOWNLOAD=$1

for FOLDER in 'video_db' 'txt_db' 'pretrained' 'finetune'; do
    if [ ! -d "$DOWNLOAD/$FOLDER" ] ; then
        mkdir -p "$DOWNLOAD/$FOLDER"
    fi
done

BLOB='https://datarelease.blob.core.windows.net/value-leaderboard/starter_code_data'

# Use azcopy for video db downloading
if [ -f ~/azcopy/azcopy ]; then
    echo "azcopy exists, skip downloading"
else
    echo "azcopy does not exist, start downloading"
    wget -P ~/azcopy/ https://convaisharables.blob.core.windows.net/azcopy/azcopy
fi
chmod +x ~/azcopy/azcopy

# video dbs
if [ ! -d "$DOWNLOAD/video_db/tv/" ] ; then
    ~/azcopy/azcopy cp "$BLOB/video_db/tv.tar" "$DOWNLOAD/video_db/tv.tar"
    tar -xvf "$DOWNLOAD/video_db/tv.tar" -C "$DOWNLOAD/video_db"
    rm "$DOWNLOAD/video_db/tv.tar"
fi

# text dbs
for SPLIT in 'train' 'val' 'test'; do
    if [ ! -d "$DOWNLOAD/txt_db/tvqa_$SPLIT.db/" ] ; then
        wget "$BLOB/txt_db/tvqa_$SPLIT.db.tar" -P "$DOWNLOAD/txt_db/"
        tar -xvf "$DOWNLOAD/txt_db/tvqa_$SPLIT.db.tar" -C "$DOWNLOAD/txt_db"
        rm "$DOWNLOAD/txt_db/tvqa_$SPLIT.db.tar"
    fi
done
if [ ! -d "$DOWNLOAD/txt_db/tv_subtitles.db/" ] ; then
    wget "$BLOB/txt_db/tv_subtitles.db.tar" -P "$DOWNLOAD/txt_db/"
    tar -xvf "$DOWNLOAD/txt_db/tv_subtitles.db.tar" -C "$DOWNLOAD/txt_db"
    rm "$DOWNLOAD/txt_db/tv_subtitles.db.tar"
fi

HEROBLOB='https://convaisharables.blob.core.windows.net/hero'
# pretrained
if [ ! -f "$DOWNLOAD/pretrained/hero-tv-ht100.pt" ] ; then
    wget "$HEROBLOB/pretrained/hero-tv-ht100.pt" -P "$DOWNLOAD/pretrained/"
fi
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Download the TV video db, TVR text dbs, subtitles and the pretrained
# checkpoint into $1.
# FIX: variable expansions quoted to survive paths with spaces.

DOWNLOAD=$1

for FOLDER in 'video_db' 'txt_db' 'pretrained' 'finetune'; do
    if [ ! -d "$DOWNLOAD/$FOLDER" ] ; then
        mkdir -p "$DOWNLOAD/$FOLDER"
    fi
done

BLOB='https://datarelease.blob.core.windows.net/value-leaderboard/starter_code_data'

# Use azcopy for video db downloading
if [ -f ~/azcopy/azcopy ]; then
    echo "azcopy exists, skip downloading"
else
    echo "azcopy does not exist, start downloading"
    wget -P ~/azcopy/ https://convaisharables.blob.core.windows.net/azcopy/azcopy
fi
chmod +x ~/azcopy/azcopy

# video dbs
if [ ! -d "$DOWNLOAD/video_db/tv/" ] ; then
    ~/azcopy/azcopy cp "$BLOB/video_db/tv.tar" "$DOWNLOAD/video_db/tv.tar"
    tar -xvf "$DOWNLOAD/video_db/tv.tar" -C "$DOWNLOAD/video_db"
    rm "$DOWNLOAD/video_db/tv.tar"
fi

# text dbs
for SPLIT in 'train' 'val' 'test'; do
    if [ ! -d "$DOWNLOAD/txt_db/tvr_$SPLIT.db/" ] ; then
        wget "$BLOB/txt_db/tvr_$SPLIT.db.tar" -P "$DOWNLOAD/txt_db/"
        tar -xvf "$DOWNLOAD/txt_db/tvr_$SPLIT.db.tar" -C "$DOWNLOAD/txt_db"
        rm "$DOWNLOAD/txt_db/tvr_$SPLIT.db.tar"
    fi
done
if [ ! -d "$DOWNLOAD/txt_db/tv_subtitles.db/" ] ; then
    wget "$BLOB/txt_db/tv_subtitles.db.tar" -P "$DOWNLOAD/txt_db/"
    tar -xvf "$DOWNLOAD/txt_db/tv_subtitles.db.tar" -C "$DOWNLOAD/txt_db"
    rm "$DOWNLOAD/txt_db/tv_subtitles.db.tar"
fi

HEROBLOB='https://convaisharables.blob.core.windows.net/hero'
# pretrained
if [ ! -f "$DOWNLOAD/pretrained/hero-tv-ht100.pt" ] ; then
    wget "$HEROBLOB/pretrained/hero-tv-ht100.pt" -P "$DOWNLOAD/pretrained/"
fi
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Download the VATEX video db, vatex_en_r text dbs, pretrained checkpoint
# and vatex_en_c raw jsonl files (for evaluation/inference) into $1.
# FIX: variable expansions quoted to survive paths with spaces.

DOWNLOAD=$1

for FOLDER in 'video_db' 'txt_db' 'pretrained' 'finetune'; do
    if [ ! -d "$DOWNLOAD/$FOLDER" ] ; then
        mkdir -p "$DOWNLOAD/$FOLDER"
    fi
done

BLOB='https://datarelease.blob.core.windows.net/value-leaderboard/starter_code_data'

# Use azcopy for video db downloading
if [ -f ~/azcopy/azcopy ]; then
    echo "azcopy exists, skip downloading"
else
    echo "azcopy does not exist, start downloading"
    wget -P ~/azcopy/ https://convaisharables.blob.core.windows.net/azcopy/azcopy
fi
chmod +x ~/azcopy/azcopy

# video dbs
if [ ! -d "$DOWNLOAD/video_db/vatex/" ] ; then
    ~/azcopy/azcopy cp "$BLOB/video_db/vatex.tar" "$DOWNLOAD/video_db/vatex.tar"
    tar -xvf "$DOWNLOAD/video_db/vatex.tar" -C "$DOWNLOAD/video_db"
    rm "$DOWNLOAD/video_db/vatex.tar"
fi

# text dbs
if [ ! -d "$DOWNLOAD/txt_db/vatex_subtitles.db/" ] ; then
    wget "$BLOB/txt_db/vatex_subtitles.db.tar" -P "$DOWNLOAD/txt_db/"
    tar -xvf "$DOWNLOAD/txt_db/vatex_subtitles.db.tar" -C "$DOWNLOAD/txt_db"
    rm "$DOWNLOAD/txt_db/vatex_subtitles.db.tar"
fi
# vatex_en_r
for SPLIT in 'train' 'val' 'test_public' ; do
    if [ ! -d "$DOWNLOAD/txt_db/vatex_en_r_$SPLIT.db/" ] ; then
        wget "$BLOB/txt_db/vatex_en_r_$SPLIT.db.tar" -P "$DOWNLOAD/txt_db/"
        tar -xvf "$DOWNLOAD/txt_db/vatex_en_r_$SPLIT.db.tar" -C "$DOWNLOAD/txt_db"
        rm "$DOWNLOAD/txt_db/vatex_en_r_$SPLIT.db.tar"
    fi
done

HEROBLOB='https://convaisharables.blob.core.windows.net/hero'
# pretrained
if [ ! -f "$DOWNLOAD/pretrained/hero-tv-ht100.pt" ] ; then
    wget "$HEROBLOB/pretrained/hero-tv-ht100.pt" -P "$DOWNLOAD/pretrained/"
fi

VATEXCBLOB='https://datarelease.blob.core.windows.net/value-leaderboard/vatex_en_c'
# vatex_en_c raw data (evaluation and inference)
for SPLIT in 'test_public' 'test_private'; do
    wget -nc "$VATEXCBLOB/vatex_en_c_${SPLIT}_release.jsonl" -O "$DOWNLOAD/txt_db/vatex_en_c_${SPLIT}_release.jsonl"
done
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Download the VIOLIN video db, violin text dbs, subtitles and the
# pretrained checkpoint into $1.
# FIX: variable expansions quoted to survive paths with spaces.

DOWNLOAD=$1

for FOLDER in 'video_db' 'txt_db' 'pretrained' 'finetune'; do
    if [ ! -d "$DOWNLOAD/$FOLDER" ] ; then
        mkdir -p "$DOWNLOAD/$FOLDER"
    fi
done

BLOB='https://datarelease.blob.core.windows.net/value-leaderboard/starter_code_data'

# Use azcopy for video db downloading
if [ -f ~/azcopy/azcopy ]; then
    echo "azcopy exists, skip downloading"
else
    echo "azcopy does not exist, start downloading"
    wget -P ~/azcopy/ https://convaisharables.blob.core.windows.net/azcopy/azcopy
fi
chmod +x ~/azcopy/azcopy

# video dbs
if [ ! -d "$DOWNLOAD/video_db/violin/" ] ; then
    ~/azcopy/azcopy cp "$BLOB/video_db/violin.tar" "$DOWNLOAD/video_db/violin.tar"
    tar -xvf "$DOWNLOAD/video_db/violin.tar" -C "$DOWNLOAD/video_db"
    rm "$DOWNLOAD/video_db/violin.tar"
fi

# text dbs
for SPLIT in 'train' 'val' 'test' 'test_private'; do
    if [ ! -d "$DOWNLOAD/txt_db/violin_$SPLIT.db/" ] ; then
        wget "$BLOB/txt_db/violin_$SPLIT.db.tar" -P "$DOWNLOAD/txt_db/"
        tar -xvf "$DOWNLOAD/txt_db/violin_$SPLIT.db.tar" -C "$DOWNLOAD/txt_db"
        rm "$DOWNLOAD/txt_db/violin_$SPLIT.db.tar"
    fi
done
if [ ! -d "$DOWNLOAD/txt_db/violin_subtitles.db/" ] ; then
    wget "$BLOB/txt_db/violin_subtitles.db.tar" -P "$DOWNLOAD/txt_db/"
    tar -xvf "$DOWNLOAD/txt_db/violin_subtitles.db.tar" -C "$DOWNLOAD/txt_db"
    rm "$DOWNLOAD/txt_db/violin_subtitles.db.tar"
fi

HEROBLOB='https://convaisharables.blob.core.windows.net/hero'
# pretrained
if [ ! -f "$DOWNLOAD/pretrained/hero-tv-ht100.pt" ] ; then
    wget "$HEROBLOB/pretrained/hero-tv-ht100.pt" -P "$DOWNLOAD/pretrained/"
fi
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Download the VLEP video db, vlep text dbs, subtitles and the pretrained
# checkpoint into $1.
# FIX: variable expansions quoted to survive paths with spaces.

DOWNLOAD=$1

for FOLDER in 'video_db' 'txt_db' 'pretrained' 'finetune'; do
    if [ ! -d "$DOWNLOAD/$FOLDER" ] ; then
        mkdir -p "$DOWNLOAD/$FOLDER"
    fi
done

BLOB='https://datarelease.blob.core.windows.net/value-leaderboard/starter_code_data'

# Use azcopy for video db downloading
if [ -f ~/azcopy/azcopy ]; then
    echo "azcopy exists, skip downloading"
else
    echo "azcopy does not exist, start downloading"
    wget -P ~/azcopy/ https://convaisharables.blob.core.windows.net/azcopy/azcopy
fi
chmod +x ~/azcopy/azcopy

# video dbs
if [ ! -d "$DOWNLOAD/video_db/vlep/" ] ; then
    ~/azcopy/azcopy cp "$BLOB/video_db/vlep.tar" "$DOWNLOAD/video_db/vlep.tar"
    tar -xvf "$DOWNLOAD/video_db/vlep.tar" -C "$DOWNLOAD/video_db"
    rm "$DOWNLOAD/video_db/vlep.tar"
fi

# text dbs (note: vlep uses a 'dev' split rather than 'val')
for SPLIT in 'train' 'dev' 'test' ; do
    if [ ! -d "$DOWNLOAD/txt_db/vlep_$SPLIT.db/" ] ; then
        wget "$BLOB/txt_db/vlep_$SPLIT.db.tar" -P "$DOWNLOAD/txt_db/"
        tar -xvf "$DOWNLOAD/txt_db/vlep_$SPLIT.db.tar" -C "$DOWNLOAD/txt_db"
        rm "$DOWNLOAD/txt_db/vlep_$SPLIT.db.tar"
    fi
done
if [ ! -d "$DOWNLOAD/txt_db/vlep_subtitles.db/" ] ; then
    wget "$BLOB/txt_db/vlep_subtitles.db.tar" -P "$DOWNLOAD/txt_db/"
    tar -xvf "$DOWNLOAD/txt_db/vlep_subtitles.db.tar" -C "$DOWNLOAD/txt_db"
    rm "$DOWNLOAD/txt_db/vlep_subtitles.db.tar"
fi

HEROBLOB='https://convaisharables.blob.core.windows.net/hero'
# pretrained
if [ ! -f "$DOWNLOAD/pretrained/hero-tv-ht100.pt" ] ; then
    wget "$HEROBLOB/pretrained/hero-tv-ht100.pt" -P "$DOWNLOAD/pretrained/"
fi
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Download the YouCook2 video db, yc2r text dbs, pretrained checkpoint
# and yc2c raw jsonl files (for evaluation/inference) into $1.
# FIX: variable expansions quoted; the pretrained checkpoint now comes from
# HEROBLOB, consistent with every sibling download script (it previously
# used $BLOB, the starter_code_data container, where the checkpoint is not
# published).

DOWNLOAD=$1

for FOLDER in 'video_db' 'txt_db' 'pretrained' 'finetune'; do
    if [ ! -d "$DOWNLOAD/$FOLDER" ] ; then
        mkdir -p "$DOWNLOAD/$FOLDER"
    fi
done

BLOB='https://datarelease.blob.core.windows.net/value-leaderboard/starter_code_data'

# Use azcopy for video db downloading
if [ -f ~/azcopy/azcopy ]; then
    echo "azcopy exists, skip downloading"
else
    echo "azcopy does not exist, start downloading"
    wget -P ~/azcopy/ https://convaisharables.blob.core.windows.net/azcopy/azcopy
fi
chmod +x ~/azcopy/azcopy

# video dbs
if [ ! -d "$DOWNLOAD/video_db/yc2/" ] ; then
    ~/azcopy/azcopy cp "$BLOB/video_db/yc2.tar" "$DOWNLOAD/video_db/yc2.tar"
    tar -xvf "$DOWNLOAD/video_db/yc2.tar" -C "$DOWNLOAD/video_db"
    rm "$DOWNLOAD/video_db/yc2.tar"
fi

# text dbs
if [ ! -d "$DOWNLOAD/txt_db/yc2_subtitles.db/" ] ; then
    wget "$BLOB/txt_db/yc2_subtitles.db.tar" -P "$DOWNLOAD/txt_db/"
    tar -xvf "$DOWNLOAD/txt_db/yc2_subtitles.db.tar" -C "$DOWNLOAD/txt_db"
    rm "$DOWNLOAD/txt_db/yc2_subtitles.db.tar"
fi
# yc2r
for SPLIT in 'train' 'val' 'test' ; do
    if [ ! -d "$DOWNLOAD/txt_db/yc2r_$SPLIT.db/" ] ; then
        wget "$BLOB/txt_db/yc2r_$SPLIT.db.tar" -P "$DOWNLOAD/txt_db/"
        tar -xvf "$DOWNLOAD/txt_db/yc2r_$SPLIT.db.tar" -C "$DOWNLOAD/txt_db"
        rm "$DOWNLOAD/txt_db/yc2r_$SPLIT.db.tar"
    fi
done

HEROBLOB='https://convaisharables.blob.core.windows.net/hero'
# pretrained
if [ ! -f "$DOWNLOAD/pretrained/hero-tv-ht100.pt" ] ; then
    wget "$HEROBLOB/pretrained/hero-tv-ht100.pt" -P "$DOWNLOAD/pretrained/"
fi

BLOB='https://datarelease.blob.core.windows.net/value-leaderboard'
YC2C="$BLOB/yc2c"
# yc2c raw data (evaluation and inference)
for SPLIT in 'val' 'test' ; do
    wget -nc "$YC2C/yc2c_${SPLIT}_release.jsonl" -O "$DOWNLOAD/txt_db/yc2c_${SPLIT}_release.jsonl"
done
#!/usr/bin/env bash
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Preprocess TVC annotations + subtitles into LMDB text dbs inside the
# linjieli222/hero docker image.
# FIX: variable expansions quoted to survive paths with spaces.

DATA=$1 # txt_db

for SPLIT in 'val' 'train'; do
    CMD="python scripts/prepro_tvc.py \
        --annotation /txt/tvc_${SPLIT}_release.jsonl \
        --subtitles /txt/tvqa_preprocessed_subtitles.jsonl \
        --output /txt/tvc_${SPLIT}_new.db"

    docker run --ipc=host --rm \
        --mount "src=$(pwd),dst=/src,type=bind" \
        --mount "src=$DATA,dst=/txt,type=bind" \
        -w /src linjieli222/hero \
        bash -c "$CMD"
done
import sys
import os
currentdir = os.path.dirname(os.path.realpath(__file__))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

import argparse
import json

from horovod import torch as hvd
from transformers import RobertaTokenizer

from eval.yc2c import Yc2cEval
from utils.distributed import all_gather_list
from utils.basic_utils import save_jsonl

from os.path import exists
# FIX: `pred_agg_eval` does not exist in this repo; VideoCapGenerator and
# decode live in this directory (two_stream_eval), imported the same way
# as inf_tvc/inf_vatex_en_c below.
from videocap_generator import VideoCapGenerator
from inf_tvc import load_model
from inf_vatex_en_c import decode, load_inf_data


def main(opts):
    """Two-stream greedy-decoding inference for YC2C.

    Loads the required video-only captioning model and, if
    `opts.sub_only_model_dir` exists, a subtitle-only model whose token
    scores are averaged with the first model's inside VideoCapGenerator.
    Predictions are saved to `<video_only_model_dir>/<opts.output>` and
    scored on rank 0 when the target jsonl carries ground-truth `descs`.
    """
    hvd.init()
    # rank 0 fetches the tokenizer first so the other ranks hit the cache
    if hvd.rank() == 0:
        toker = RobertaTokenizer.from_pretrained('roberta-base')
        all_gather_list(None)
    else:
        all_gather_list(None)
        toker = RobertaTokenizer.from_pretrained('roberta-base')
    # FIX: both calls previously passed [''] (the RoBERTa special-token
    # markup was stripped from the source), making bos == eos; RoBERTa's
    # BOS/EOS tokens are '<s>' and '</s>'.
    bos = toker.convert_tokens_to_ids(['<s>'])[0]
    eos = toker.convert_tokens_to_ids(['</s>'])[0]

    video_only_model_opts, video_only_model = load_model(
        opts.video_only_model_dir, opts.video_only_ckpt_step,
        opts)
    video_only_dataloader = load_inf_data(
        opts, video_only_model_opts, mode="video_only")

    if exists(opts.sub_only_model_dir):
        sub_only_model_opts, sub_only_model = load_model(
            opts.sub_only_model_dir,
            opts.sub_only_ckpt_step, opts)
        sub_only_dataloader = load_inf_data(
            opts, sub_only_model_opts, mode="sub_only")
    else:
        sub_only_model, sub_only_dataloader = None, None

    generator = VideoCapGenerator(
        video_only_model, opts.max_gen_step,
        bos, eos, not opts.no_fp16,
        model2=sub_only_model)

    results = decode(
        video_only_dataloader, sub_only_dataloader,
        generator, toker)
    output_path = os.path.join(
        opts.video_only_model_dir, opts.output)
    save_jsonl(results, output_path)

    # evaluate score if possible (the first line of the target file tells
    # whether ground-truth descriptions are available)
    if (hvd.rank() == 0
            and 'descs' in json.loads(next(iter(open(opts.target_clip))))):
        evaluator = Yc2cEval(opts.target_clip)
        score = evaluator(results)
        print(score)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--sub_txt_db",
                        default="/txt/yc2_subtitles.db",
                        type=str,
                        help="The input video subtitle corpus. (LMDB)")
    parser.add_argument("--vfeat_db",
                        default="/video/yc2", type=str,
                        help="The input video frame features.")
    parser.add_argument("--video_only_model_dir", required=True, type=str,
                        help="dir root to trained model")
    parser.add_argument("--video_only_ckpt_step", required=True, type=int,
                        help="checkpoint step")
    parser.add_argument("--sub_only_model_dir", default="", type=str,
                        help="dir root to trained model")
    parser.add_argument("--sub_only_ckpt_step", default=-1, type=int,
                        help="checkpoint step")
    parser.add_argument("--output", type=str, required=True,
                        help="output file name")

    parser.add_argument("--batch_size", default=16, type=int,
                        help="validation batch size (per GPU)")
    parser.add_argument("--max_gen_step", default=30, type=int,
                        help="max generation steps")

    parser.add_argument('--n_workers', type=int, default=4,
                        help="number of data workers")
    parser.add_argument('--no_pin_mem', action='store_true',
                        help="disable pin memory")
    parser.add_argument("--no_fp16", action='store_true',
                        help="disable fp16")

    parser.add_argument("--target_clip", required=True, type=str,
                        help="jsonl annotation")

    args = parser.parse_args()

    main(args)
--------------------------------------------------------------------------------
"""
Copyright (c) Microsoft Corporation.
Licensed under the MIT license.

copied/modified from HERO
(https://github.com/linjieli222/HERO)
"""
import torch
from model.videoCap import _to_fp16


class VideoCapGenerator(object):
    """Greedy caption decoder for one model, or a two-model ensemble.

    model2, when given, is a second captioning model whose per-step scores
    are averaged with model1's (equal weights) before the argmax.
    """
    def __init__(self, model1, max_step, bos, eos, fp16, model2=None):
        self.model1 = model1            # primary captioning model
        self.model2 = model2            # optional second model for ensembling
        self.max_step = max_step        # max number of generated tokens
        self.bos = bos                  # begin-of-sequence token id
        self.eos = eos                  # end-of-sequence token id (cut point)
        self.fp16 = fp16                # run inputs/encoder outputs in half precision
    
    def greedy_decode(self, batch1, batch2=None):
        """
        run greedy decoding
        NOTE: Speed can potentially be improved by keeping past
        decoder hidden states and only run `step-wise` forward.
        Also, maybe can add early stop when all sequences reaches eos
        instead of running until max_step.
        """
        if self.fp16:
            batch1 = _to_fp16(batch1)
            if batch2 is not None:
                batch2 = _to_fp16(batch2)
        encoder_outputs1, enc_mask1 = self.model1.encode(batch1)  # (N, Lv, D)
        if self.fp16:
            encoder_outputs1 = encoder_outputs1.half()
        batch_size = enc_mask1.size(0)
        bos = torch.tensor([self.bos]).expand(batch_size).cuda()
        # .to(bos) casts the zeros buffer to bos's dtype (long) and device
        input_ids = torch.zeros(batch_size, self.max_step).to(bos)
        pos_ids = torch.arange(0, self.max_step+1).unsqueeze(0).cuda()
        last_out = bos

        if batch2 is not None:
            encoder_outputs2, enc_mask2 = self.model2.encode(
                batch2)  # (N, Lv, D)
            if self.fp16:
                encoder_outputs2 = encoder_outputs2.half()
        # re-runs the full prefix through the decoder every step (see NOTE)
        for step in range(self.max_step):
            input_ids[:, step] = last_out
            score = self.model1.decode(encoder_outputs1, enc_mask1,
                                       input_ids[:, :step+1],
                                       pos_ids[:, :step+1],
                                       None, compute_loss=False)
            if batch2 is not None:
                score2 = self.model2.decode(
                    encoder_outputs2, enc_mask2,
                    input_ids[:, :step+1],
                    pos_ids[:, :step+1],
                    None, compute_loss=False)
                # equal-weight ensemble of the two models' scores
                score = score/2. + score2/2.
            output_ids = score.max(dim=-1)[1]
            last_out = output_ids[:, -1]

        outputs = [self.cut_eos(ids) for ids in output_ids.tolist()]
        return outputs

    def cut_eos(self, ids):
        # truncate a generated id sequence at the first eos (eos excluded)
        out_ids = []
        for i in ids:
            if i == self.eos:
                break
            out_ids.append(i)
        return out_ids
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VALUE-Leaderboard/StarterCode/fe600a7dd552227a5d0297ab953a52d5ea667c9a/utils/__init__.py
--------------------------------------------------------------------------------
/utils/const.py:
--------------------------------------------------------------------------------
"""
Copyright (c) Microsoft Corporation.
Licensed under the MIT license.

constants

copied/modified from HERO
(https://github.com/linjieli222/HERO)
"""
# feature dimension per (visual backbone, text/video feature) combination
VFEAT_DIM = {"resnet_slowfast": 4352, "resnet_mil-nce": 3072,
             "clip-vit_slowfast": 2816,
             "clip-vit_mil-nce": 1536,
             "resnet": 2048,
             "slowfast": 2304,
             "clip-vit": 512,
             "mil-nce": 1024}
# VFEAT_DIM = 4352
MAX_FRM_SEQ_LEN = 100
# IoU thresholds used for VCMR evaluation
VCMR_IOU_THDS = (0.5, 0.7)
--------------------------------------------------------------------------------
/utils/logger.py:
--------------------------------------------------------------------------------
"""
Copyright (c) Microsoft Corporation.
Licensed under the MIT license.
4 | 5 | some functions are modified from UNITER 6 | (https://github.com/ChenRocks/UNITER) 7 | 8 | helper for logging 9 | NOTE: loggers are global objects use with caution 10 | 11 | copied/modified from HERO 12 | (https://github.com/linjieli222/HERO) 13 | """ 14 | import logging 15 | 16 | import tensorboardX 17 | 18 | 19 | _LOG_FMT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' 20 | _DATE_FMT = '%m/%d/%Y %H:%M:%S' 21 | logging.basicConfig(format=_LOG_FMT, datefmt=_DATE_FMT, level=logging.INFO) 22 | LOGGER = logging.getLogger('__main__') # this is the global logger 23 | 24 | 25 | def add_log_to_file(log_path): 26 | fh = logging.FileHandler(log_path) 27 | formatter = logging.Formatter(_LOG_FMT, datefmt=_DATE_FMT) 28 | fh.setFormatter(formatter) 29 | LOGGER.addHandler(fh) 30 | 31 | 32 | class TensorboardLogger(object): 33 | def __init__(self): 34 | self._logger = None 35 | self._global_step = 0 36 | 37 | def create(self, path): 38 | self._logger = tensorboardX.SummaryWriter(path) 39 | 40 | def noop(self, *args, **kwargs): 41 | return 42 | 43 | def step(self): 44 | self._global_step += 1 45 | 46 | @property 47 | def global_step(self): 48 | return self._global_step 49 | 50 | @global_step.setter 51 | def global_step(self, step): 52 | self._global_step = step 53 | 54 | def log_scaler_dict(self, log_dict, prefix=''): 55 | """ log a dictionary of scalar values""" 56 | if self._logger is None: 57 | return 58 | if prefix: 59 | prefix = f'{prefix}_' 60 | for name, value in log_dict.items(): 61 | if isinstance(value, dict): 62 | self.log_scaler_dict(value, self._global_step, 63 | prefix=f'{prefix}{name}') 64 | else: 65 | self._logger.add_scalar(f'{prefix}{name}', value, 66 | self._global_step) 67 | 68 | def __getattr__(self, name): 69 | if self._logger is None: 70 | return self.noop 71 | return self._logger.__getattribute__(name) 72 | 73 | 74 | TB_LOGGER = TensorboardLogger() 75 | 76 | 77 | class RunningMeter(object): 78 | """ running meteor of a scalar value 79 | 
(useful for monitoring training loss) 80 | """ 81 | def __init__(self, name, val=None, smooth=0.99): 82 | self._name = name 83 | self._sm = smooth 84 | self._val = val 85 | 86 | def __call__(self, value): 87 | self._val = (value if self._val is None 88 | else value*(1-self._sm) + self._val*self._sm) 89 | 90 | def __str__(self): 91 | return f'{self._name}: {self._val:.4f}' 92 | 93 | @property 94 | def val(self): 95 | return self._val 96 | 97 | @property 98 | def name(self): 99 | return self._name 100 | -------------------------------------------------------------------------------- /utils/misc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) Microsoft Corporation. 3 | Licensed under the MIT license. 4 | 5 | Copied from UNITER 6 | (https://github.com/ChenRocks/UNITER) 7 | 8 | Misc utilities 9 | 10 | copied/modified from HERO 11 | (https://github.com/linjieli222/HERO) 12 | """ 13 | import random 14 | 15 | import torch 16 | import numpy as np 17 | 18 | from utils.logger import LOGGER 19 | 20 | 21 | class Struct(object): 22 | def __init__(self, dict_): 23 | self.__dict__.update(dict_) 24 | 25 | 26 | class NoOp(object): 27 | """ useful for distributed training No-Ops """ 28 | def __getattr__(self, name): 29 | return self.noop 30 | 31 | def noop(self, *args, **kwargs): 32 | return 33 | 34 | 35 | def set_dropout(model, drop_p): 36 | for name, module in model.named_modules(): 37 | # we might want to tune dropout for smaller dataset 38 | if isinstance(module, torch.nn.Dropout): 39 | if module.p != drop_p: 40 | module.p = drop_p 41 | LOGGER.info(f'{name} set to {drop_p}') 42 | 43 | 44 | def set_random_seed(seed): 45 | random.seed(seed) 46 | np.random.seed(seed) 47 | torch.manual_seed(seed) 48 | torch.cuda.manual_seed_all(seed) 49 | --------------------------------------------------------------------------------