├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── config
    ├── FT_only_configs
    │   ├── train-all-multitask-8gpu.json
    │   ├── train-caption-multitask-8gpu.json
    │   ├── train-how2qa-8gpu.json
    │   ├── train-how2qa_sub_only-8gpu.json
    │   ├── train-how2qa_video_only-8gpu.json
    │   ├── train-how2r-8gpu.json
    │   ├── train-how2r_sub_only-8gpu.json
    │   ├── train-how2r_video_only-8gpu.json
    │   ├── train-qa-multitask-8gpu.json
    │   ├── train-retrieval-multitask-8gpu.json
    │   ├── train-tv_domain-multitask-8gpu.json
    │   ├── train-tvc-8gpu.json
    │   ├── train-tvc_sub_only-8gpu.json
    │   ├── train-tvc_video_only_8gpu.json
    │   ├── train-tvqa-8gpu.json
    │   ├── train-tvqa_sub_only-8gpu.json
    │   ├── train-tvqa_video_only-8gpu.json
    │   ├── train-tvr-8gpu.json
    │   ├── train-tvr_sub_only-8gpu.json
    │   ├── train-tvr_video_only-8gpu.json
    │   ├── train-vatex_en_c-8gpu.json
    │   ├── train-vatex_en_c_sub_only-8gpu.json
    │   ├── train-vatex_en_c_video_only-8gpu.json
    │   ├── train-vatex_en_r-8gpu.json
    │   ├── train-vatex_en_r_sub_only-8gpu.json
    │   ├── train-vatex_en_r_video_only-8gpu.json
    │   ├── train-violin-8gpu.json
    │   ├── train-violin_sub_only-8gpu.json
    │   ├── train-violin_video_only-8gpu.json
    │   ├── train-vlep-8gpu.json
    │   ├── train-vlep_sub_only-8gpu.json
    │   ├── train-vlep_video_only-8gpu.json
    │   ├── train-yc2c-8gpu.json
    │   ├── train-yc2c_sub_only-8gpu.json
    │   ├── train-yc2c_video_only-8gpu.json
    │   ├── train-yc2r-4gpu.json
    │   ├── train-yc2r_sub_only-4gpu.json
    │   ├── train-yc2r_video_only-4gpu.json
    │   └── train-youtube_domain-multitask-8gpu.json
    ├── config.py
    ├── model_config
    │   ├── hero_finetune.json
    │   ├── hero_pretrain.json
    │   ├── hero_videoCap.json
    │   ├── video_sub_feature_add_finetune.json
    │   ├── video_sub_feature_concat_finetune.json
    │   └── video_sub_sequence_finetune.json
    ├── pretrain-tv-16gpu.json
    ├── train-all-multitask-8gpu.json
    ├── train-caption-multitask-8gpu.json
    ├── train-how2qa-8gpu.json
    ├── train-how2r-8gpu.json
    ├── train-qa-multitask-8gpu.json
    ├── train-retrieval-multitask-8gpu.json
    ├── train-tv_domain-multitask-8gpu.json
    ├── train-tvc-8gpu.json
    ├── train-tvqa-8gpu.json
    ├── train-tvr-8gpu.json
    ├── train-vatex_en_c-8gpu.json
    ├── train-vatex_en_r-8gpu.json
    ├── train-violin-8gpu.json
    ├── train-vlep-8gpu.json
    ├── train-yc2c-8gpu.json
    ├── train-yc2r-4gpu.json
    └── train-youtube_domain-multitask-8gpu.json
├── data
    ├── __init__.py
    ├── data.py
    ├── fom.py
    ├── loader.py
    ├── mfm.py
    ├── mlm.py
    ├── tvc.py
    ├── vcmr.py
    ├── videoCap.py
    ├── videoQA.py
    ├── violin.py
    ├── vlep.py
    ├── vr.py
    └── vsm.py
├── eval
    ├── pycocoevalcap
    │   ├── README.md
    │   ├── __init__.py
    │   ├── bleu
    │   │   ├── LICENSE
    │   │   ├── __init__.py
    │   │   ├── bleu.py
    │   │   └── bleu_scorer.py
    │   ├── cider
    │   │   ├── __init__.py
    │   │   ├── cider.py
    │   │   └── cider_scorer.py
    │   ├── license.txt
    │   ├── meteor
    │   │   ├── __init__.py
    │   │   ├── meteor.py
    │   │   └── tests
    │   │   │   └── test_meteor.py
    │   ├── rouge
    │   │   ├── __init__.py
    │   │   └── rouge.py
    │   └── tokenizer
    │   │   ├── __init__.py
    │   │   └── ptbtokenizer.py
    ├── tvc.py
    ├── vatex_en_c.py
    └── yc2c.py
├── eval_vcmr.py
├── eval_videoQA.py
├── eval_violin.py
├── eval_vr.py
├── inf_tvc.py
├── inf_vatex_en_c.py
├── inf_yc2c.py
├── launch_container.sh
├── load_data.py
├── model
    ├── __init__.py
    ├── embed.py
    ├── encoder.py
    ├── layers.py
    ├── model.py
    ├── modeling_utils.py
    ├── multitask.py
    ├── pretrain.py
    ├── vcmr.py
    ├── videoCap.py
    ├── videoQA.py
    └── vr.py
├── optim
    ├── __init__.py
    ├── adamw.py
    ├── misc.py
    └── sched.py
├── scripts
    ├── collect_video_feature_paths.py
    ├── convert_video_db_single_feature.py
    ├── convert_videodb.py
    ├── create_txtdb.sh
    ├── download_all.sh
    ├── download_how2.sh
    ├── download_pretrained.sh
    ├── download_tvc.sh
    ├── download_tvqa.sh
    ├── download_tvr.sh
    ├── download_vatex_en.sh
    ├── download_violin.sh
    ├── download_vlep.sh
    ├── download_yc2.sh
    ├── prepro_query.py
    ├── prepro_sub.py
    ├── prepro_tvc.py
    └── prepro_tvc.sh
├── train_all_multitask.py
├── train_captioning.py
├── train_qa.py
├── train_retrieval.py
├── two_stream_eval
    ├── eval_vcmr.py
    ├── eval_videoQA.py
    ├── eval_violin.py
    ├── eval_vr.py
    ├── inf_tvc.py
    ├── inf_vatex_en_c.py
    ├── inf_yc2c.py
    └── videocap_generator.py
└── utils
    ├── __init__.py
    ├── basic_utils.py
    ├── const.py
    ├── distributed.py
    ├── logger.py
    ├── misc.py
    ├── save.py
    ├── tvr_eval_utils.py
    └── tvr_standalone_eval.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # philly
 2 | philly/
 3 | .pt*
 4 | .amlt*
 5 | # ctags
 6 | tags
 7 | 
 8 | # capeval tmp files
 9 | eval/pycocoevalcap/tokenizer/tmp*
10 | demo/
11 | 
12 | # compiled files #
13 | __pycache__
14 | *.pyc
15 | 
16 | # Packages #
17 | ############
18 | # it's better to unpack these files and commit the raw source
19 | # git has its own built in compression methods
20 | *.7z
21 | *.dmg
22 | *.gz
23 | *.iso
24 | *.jar
25 | *.rar
26 | *.tar
27 | *.zip
28 | 
29 | # Logs and databases #
30 | ######################
31 | *.log
32 | *.sql
33 | *.sqlite
34 | .ipynb_checkpoints/
35 | *.swp
36 | *.vscode/
37 | *.idea/
38 | 
39 | # OS generated files #
40 | ######################
41 | .DS_Store
42 | .DS_Store?
43 | ._*
44 | .Spotlight-V100
45 | .Trashes
46 | ehthumbs.db
47 | Thumbs.db
48 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvcr.io/nvidia/pytorch:19.10-py3
 2 | 
 3 | # basic python packages
 4 | RUN pip install transformers==2.0.0 \
 5 |                 tensorboardX==1.7 ipdb==0.12 lz4==2.1.9 lmdb==0.97
 6 | 
 7 | ####### horovod for multi-GPU (distributed) training #######
 8 | # horovod
 9 | RUN HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_NCCL_LINK=SHARED HOROVOD_WITH_PYTORCH=1 \
10 |     pip install --no-cache-dir horovod==0.18.2 &&\
11 |     ldconfig
12 | 
13 | # ssh
14 | RUN apt-get update &&\
15 |     apt-get install -y --no-install-recommends openssh-client openssh-server &&\
16 |     mkdir -p /var/run/sshd
17 | 
18 | # Allow OpenSSH to talk to containers without asking for confirmation
19 | RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
20 |     echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
21 |     mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
22 | 
23 | # captioning
24 | 
25 | # captioning eval tool (java for PTBtokenizer and METEOR)
26 | RUN apt-get install -y --no-install-recommends openjdk-8-jdk && apt-get clean
27 | 
28 | # binaries for cococap eval
29 | ARG PYCOCOEVALCAP=https://github.com/tylin/coco-caption/raw/master/pycocoevalcap
30 | RUN mkdir /workspace/cococap_bin/ && \
31 |     wget $PYCOCOEVALCAP/meteor/meteor-1.5.jar -P /workspace/cococap_bin/ && \
32 |     wget $PYCOCOEVALCAP/meteor/data/paraphrase-en.gz -P /workspace/cococap_bin/ && \
33 |     wget $PYCOCOEVALCAP/tokenizer/stanford-corenlp-3.4.1.jar -P /workspace/cococap_bin/
34 | 
35 | # add new command here
36 | 
37 | WORKDIR /src
38 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 VALUE Benchmark Starter Code
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-caption-multitask-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoCap",
 5 |             "name": "tvc_video_sub_train",
 6 |             "sub_txt_db": "/txt/tv_subtitles.db",
 7 |             "vfeat_db": "/video/tv",
 8 |             "cap_txt_db": ["/txt/tvc_train.db"],
 9 |             "batch_size": 4,
10 |             "ratio": 2
11 |         },
12 |         {
13 |             "task": "videoCap",
14 |             "name": "vatex_en_c_video_sub_train",
15 |             "sub_txt_db": "/txt/vatex_subtitles.db",
16 |             "vfeat_db": "/video/vatex",
17 |             "cap_txt_db": ["/txt/vatex_en_r_train.db", "/txt/vatex_en_r_val.db"],
18 |             "batch_size": 128,
19 |             "ratio": 2
20 |         },
21 |         {
22 |             "task": "videoCap",
23 |             "name": "yc2c_video_sub_train",
24 |             "sub_txt_db": "/txt/yc2_subtitles.db",
25 |             "vfeat_db": "/video/yc2",
26 |             "cap_txt_db": ["/txt/yc2r_train.db"],
27 |             "batch_size": 16,
28 |             "ratio": 1
29 |         }
30 |     ],
31 |     "val_datasets": [
32 |         {
33 |             "task": "videoCap",
34 |             "name": "tvc_video_sub_val",
35 |             "sub_txt_db": "/txt/tv_subtitles.db",
36 |             "vfeat_db": "/video/tv",
37 |             "batch_size": 8,
38 |             "gt_anno": "/txt/tvc_val_release.jsonl"
39 |         },
40 |         {
41 |             "task": "videoCap",
42 |             "name": "vatex_en_c_video_sub_val",
43 |             "sub_txt_db": "/txt/vatex_subtitles.db",
44 |             "vfeat_db": "/video/vatex",
45 |             "batch_size": 128,
46 |             "gt_anno": "/txt/vatex_en_c_test_public_release.jsonl"
47 |         },
48 |         {
49 |             "task": "videoCap",
50 |             "name": "yc2c_video_sub_val",
51 |             "sub_txt_db": "/txt/yc2_subtitles.db",
52 |             "vfeat_db": "/video/yc2",
53 |             "batch_size": 16,
54 |             "gt_anno": "/txt/yc2c_val_release.jsonl"
55 |         }
56 |     ],
57 |     "compressed_db": false,
58 |     "model_config": "config/model_config/hero_videoCap.json",
59 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
60 |     "load_partial_pretrained": true,
61 |     "skip_layer_loading": true,
62 |     "output_dir": "/storage/MT_FT_only/caption_multi-task_default",
63 |     "max_clip_len": 100,
64 |     "max_txt_len": 60,
65 |     "max_cap_per_vid": -1,
66 |     "max_gen_step": 30,
67 |     "vfeat_version": "resnet_slowfast",
68 |     "vfeat_interval": 1.5,
69 |     "train_batch_size": 4,
70 |     "val_batch_size": 8,
71 |     "gradient_accumulation_steps": 1,
72 |     "learning_rate": 1e-4,
73 |     "lr_mul": 10.0,
74 |     "valid_steps": 500,
75 |     "num_train_steps": 30000,
76 |     "optim": "adamw",
77 |     "betas": [0.9, 0.98],
78 |     "lsr": 0.1,
79 |     "dropout": 0.1,
80 |     "weight_decay": 0.01,
81 |     "grad_norm": 1.0,
82 |     "warmup_steps": 3000,
83 |     "sub_ctx_len": 1,
84 |     "seed": 77,
85 |     "no_fp16": false,
86 |     "n_workers": 4,
87 |     "pin_mem": true
88 | }


--------------------------------------------------------------------------------
/config/FT_only_configs/train-how2qa-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoQA",
 5 |             "name": "how2qa_video_sub_train",
 6 |             "sub_txt_db": "/txt/how2_subtitles.db",
 7 |             "vfeat_db": "/video/how2",
 8 |             "query_txt_db": "/txt/how2qa_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoQA",
14 |             "name": "how2qa_video_sub_val",
15 |             "sub_txt_db": "/txt/how2_subtitles.db",
16 |             "vfeat_db": "/video/how2",
17 |             "query_txt_db": "/txt/how2qa_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/ST_FT_only/how2qa_default",
26 |     "max_clip_len": 100,
27 |     "max_txt_len": 120,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "train_batch_size": 4,
31 |     "val_batch_size": 10,
32 |     "gradient_accumulation_steps": 2,
33 |     "learning_rate": 5e-05,
34 |     "valid_steps": 200,
35 |     "save_steps": 200,
36 |     "num_train_steps": 2000,
37 |     "optim": "adamw",
38 |     "betas": [
39 |         0.9,
40 |         0.98
41 |     ],
42 |     "dropout": 0.1,
43 |     "weight_decay": 0.01,
44 |     "lr_mul": 10.0,
45 |     "grad_norm": 1.0,
46 |     "warmup_steps": 200,
47 |     "lw_st_ed": 0.4,
48 |     "sub_ctx_len": 0,
49 |     "seed": 77,
50 |     "no_fp16": false,
51 |     "n_workers": 4,
52 |     "no_pin_mem": false,
53 |     "rank": 0
54 | }
55 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-how2qa_sub_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoQA",
 5 |             "name": "how2qa_sub_only_train",
 6 |             "sub_txt_db": "/txt/how2_subtitles.db",
 7 |             "vfeat_db": "/video/how2",
 8 |             "query_txt_db": "/txt/how2qa_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoQA",
14 |             "name": "how2qa_sub_only_val",
15 |             "sub_txt_db": "/txt/how2_subtitles.db",
16 |             "vfeat_db": "/video/how2",
17 |             "query_txt_db": "/txt/how2qa_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/multi_channel_ablation_sub_only/how2qa_default",
26 |     "max_clip_len": 100,
27 |     "max_txt_len": 120,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "train_batch_size": 4,
31 |     "val_batch_size": 10,
32 |     "gradient_accumulation_steps": 2,
33 |     "learning_rate": 5e-05,
34 |     "valid_steps": 200,
35 |     "save_steps": 200,
36 |     "num_train_steps": 2000,
37 |     "optim": "adamw",
38 |     "betas": [
39 |         0.9,
40 |         0.98
41 |     ],
42 |     "dropout": 0.1,
43 |     "weight_decay": 0.01,
44 |     "lr_mul": 10.0,
45 |     "grad_norm": 1.0,
46 |     "warmup_steps": 200,
47 |     "lw_st_ed": 0.4,
48 |     "sub_ctx_len": 0,
49 |     "seed": 77,
50 |     "no_fp16": false,
51 |     "n_workers": 4,
52 |     "no_pin_mem": false,
53 |     "rank": 0
54 | }
55 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-how2qa_video_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoQA",
 5 |             "name": "how2qa_video_only_train",
 6 |             "sub_txt_db": "/txt/how2_subtitles.db",
 7 |             "vfeat_db": "/video/how2",
 8 |             "query_txt_db": "/txt/how2qa_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoQA",
14 |             "name": "how2qa_video_only_val",
15 |             "sub_txt_db": "/txt/how2_subtitles.db",
16 |             "vfeat_db": "/video/how2",
17 |             "query_txt_db": "/txt/how2qa_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/multi_channel_ablation_video_only/how2qa_default",
26 |     "max_clip_len": 100,
27 |     "max_txt_len": 120,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "train_batch_size": 4,
31 |     "val_batch_size": 10,
32 |     "gradient_accumulation_steps": 2,
33 |     "learning_rate": 5e-05,
34 |     "valid_steps": 200,
35 |     "save_steps": 200,
36 |     "num_train_steps": 2000,
37 |     "optim": "adamw",
38 |     "betas": [
39 |         0.9,
40 |         0.98
41 |     ],
42 |     "dropout": 0.1,
43 |     "weight_decay": 0.01,
44 |     "lr_mul": 10.0,
45 |     "grad_norm": 1.0,
46 |     "warmup_steps": 200,
47 |     "lw_st_ed": 0.4,
48 |     "sub_ctx_len": 0,
49 |     "seed": 77,
50 |     "no_fp16": false,
51 |     "n_workers": 4,
52 |     "no_pin_mem": false,
53 |     "rank": 0
54 | }
55 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-how2r-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "vcmr",
 5 |             "name": "how2r_video_sub_train",
 6 |             "sub_txt_db": "/txt/how2_subtitles.db",
 7 |             "vfeat_db": "/video/how2",
 8 |             "query_txt_db": "/txt/how2r_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "vcmr",
14 |             "name": "how2r_video_sub_val",
15 |             "sub_txt_db": "/txt/how2_subtitles.db",
16 |             "vfeat_db": "/video/how2",
17 |             "query_txt_db": "/txt/how2r_val_1k.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/ST_FT_only/how2r_default",
26 |     "eval_with_query_type": true,
27 |     "max_before_nms": 200,
28 |     "max_after_nms": 100,
29 |     "distributed_eval": true,
30 |     "nms_thd": -1,
31 |     "q2c_alpha": 20,
32 |     "max_vcmr_video": 100,
33 |     "full_eval_tasks": [
34 |         "VCMR",
35 |         "SVMR",
36 |         "VR"
37 |     ],
38 |     "max_clip_len": 100,
39 |     "max_txt_len": 60,
40 |     "vfeat_version": "resnet_slowfast",
41 |     "vfeat_interval": 1.5,
42 |     "min_pred_l": 3,
43 |     "max_pred_l": 20,
44 |     "drop_svmr_prob": 0.9,
45 |     "train_batch_size": 32,
46 |     "val_batch_size": 20,
47 |     "vcmr_eval_video_batch_size": 50,
48 |     "vcmr_eval_batch_size": 80,
49 |     "gradient_accumulation_steps":2,
50 |     "learning_rate": 1e-04,
51 |     "valid_steps": 200,
52 |     "save_steps": 200,
53 |     "num_train_steps": 6000,
54 |     "optim": "adamw",
55 |     "betas": [
56 |         0.9,
57 |         0.98
58 |     ],
59 |     "dropout": 0.1,
60 |     "weight_decay": 0.01,
61 |     "grad_norm": 1.0,
62 |     "warmup_steps": 600,
63 |     "lw_neg_q": 8.0,
64 |     "lw_neg_ctx": 8.0,
65 |     "lw_st_ed": 0.01,
66 |     "ranking_loss_type": "hinge",
67 |     "margin": 0.1,
68 |     "hard_pool_size": [
69 |         20
70 |     ],
71 |     "hard_neg_weights": [
72 |         10
73 |     ],
74 |     "hard_negtiave_start_step": [
75 |         2000
76 |     ],
77 |     "train_span_start_step": 0,
78 |     "sub_ctx_len": 0,
79 |     "use_all_neg": true,
80 |     "seed": 77,
81 |     "no_fp16": false,
82 |     "n_workers": 4,
83 |     "no_pin_mem": false,
84 |     "rank": 0
85 | }
86 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-how2r_sub_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "vcmr",
 5 |             "name": "how2r_sub_only_train",
 6 |             "sub_txt_db": "/txt/how2_subtitles.db",
 7 |             "vfeat_db": "/video/how2",
 8 |             "query_txt_db": "/txt/how2r_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "vcmr",
14 |             "name": "how2r_sub_only_val",
15 |             "sub_txt_db": "/txt/how2_subtitles.db",
16 |             "vfeat_db": "/video/how2",
17 |             "query_txt_db": "/txt/how2r_val_1k.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/multi_channel_ablation_sub_only/how2r_default",
26 |     "eval_with_query_type": true,
27 |     "max_before_nms": 200,
28 |     "max_after_nms": 100,
29 |     "distributed_eval": true,
30 |     "nms_thd": -1,
31 |     "q2c_alpha": 20,
32 |     "max_vcmr_video": 100,
33 |     "full_eval_tasks": [
34 |         "VCMR",
35 |         "SVMR",
36 |         "VR"
37 |     ],
38 |     "max_clip_len": 100,
39 |     "max_txt_len": 60,
40 |     "vfeat_version": "resnet_slowfast",
41 |     "vfeat_interval": 1.5,
42 |     "min_pred_l": 3,
43 |     "max_pred_l": 20,
44 |     "drop_svmr_prob": 0.9,
45 |     "train_batch_size": 32,
46 |     "val_batch_size": 20,
47 |     "vcmr_eval_video_batch_size": 50,
48 |     "vcmr_eval_batch_size": 80,
49 |     "gradient_accumulation_steps":2,
50 |     "learning_rate": 1e-04,
51 |     "valid_steps": 200,
52 |     "save_steps": 200,
53 |     "num_train_steps": 3000,
54 |     "optim": "adamw",
55 |     "betas": [
56 |         0.9,
57 |         0.98
58 |     ],
59 |     "dropout": 0.1,
60 |     "weight_decay": 0.01,
61 |     "grad_norm": 1.0,
62 |     "warmup_steps": 300,
63 |     "lw_neg_q": 8.0,
64 |     "lw_neg_ctx": 8.0,
65 |     "lw_st_ed": 0.01,
66 |     "ranking_loss_type": "hinge",
67 |     "margin": 0.1,
68 |     "hard_pool_size": [
69 |         20
70 |     ],
71 |     "hard_neg_weights": [
72 |         10
73 |     ],
74 |     "hard_negtiave_start_step": [
75 |         1000
76 |     ],
77 |     "train_span_start_step": 0,
78 |     "sub_ctx_len": 0,
79 |     "use_all_neg": true,
80 |     "seed": 77,
81 |     "no_fp16": false,
82 |     "n_workers": 4,
83 |     "no_pin_mem": false,
84 |     "rank": 0
85 | }
86 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-how2r_video_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "vcmr",
 5 |             "name": "how2r_video_only_train",
 6 |             "sub_txt_db": "/txt/how2_subtitles.db",
 7 |             "vfeat_db": "/video/how2",
 8 |             "query_txt_db": "/txt/how2r_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "vcmr",
14 |             "name": "how2r_video_only_val",
15 |             "sub_txt_db": "/txt/how2_subtitles.db",
16 |             "vfeat_db": "/video/how2",
17 |             "query_txt_db": "/txt/how2r_val_1k.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/how2r_default",
26 |     "eval_with_query_type": true,
27 |     "max_before_nms": 200,
28 |     "max_after_nms": 100,
29 |     "distributed_eval": true,
30 |     "nms_thd": -1,
31 |     "q2c_alpha": 20,
32 |     "max_vcmr_video": 100,
33 |     "full_eval_tasks": [
34 |         "VCMR",
35 |         "SVMR",
36 |         "VR"
37 |     ],
38 |     "max_clip_len": 100,
39 |     "max_txt_len": 60,
40 |     "vfeat_version": "resnet_slowfast",
41 |     "vfeat_interval": 1.5,
42 |     "min_pred_l": 3,
43 |     "max_pred_l": 20,
44 |     "drop_svmr_prob": 0.9,
45 |     "train_batch_size": 32,
46 |     "val_batch_size": 20,
47 |     "vcmr_eval_video_batch_size": 50,
48 |     "vcmr_eval_batch_size": 80,
49 |     "gradient_accumulation_steps":2,
50 |     "learning_rate": 1e-04,
51 |     "valid_steps": 200,
52 |     "save_steps": 200,
53 |     "num_train_steps": 6000,
54 |     "optim": "adamw",
55 |     "betas": [
56 |         0.9,
57 |         0.98
58 |     ],
59 |     "dropout": 0.1,
60 |     "weight_decay": 0.01,
61 |     "grad_norm": 1.0,
62 |     "warmup_steps": 600,
63 |     "lw_neg_q": 8.0,
64 |     "lw_neg_ctx": 8.0,
65 |     "lw_st_ed": 0.01,
66 |     "ranking_loss_type": "hinge",
67 |     "margin": 0.1,
68 |     "hard_pool_size": [
69 |         20
70 |     ],
71 |     "hard_neg_weights": [
72 |         10
73 |     ],
74 |     "hard_negative_start_step": [
75 |         2000
76 |     ],
77 |     "train_span_start_step": 0,
78 |     "sub_ctx_len": 0,
79 |     "use_all_neg": true,
80 |     "seed": 77,
81 |     "no_fp16": false,
82 |     "n_workers": 4,
83 |     "no_pin_mem": false,
84 |     "rank": 0
85 | }
86 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-qa-multitask-8gpu.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "train_datasets": [
  3 |         {
  4 |             "task": "videoQA",
  5 |             "name": "tvqa_video_sub_train",
  6 |             "sub_txt_db": "/txt/tv_subtitles.db",
  7 |             "vfeat_db": "/video/tv",
  8 |             "query_txt_db": "/txt/tvqa_train.db",
  9 |             "ratio": 5
 10 |         },
 11 |         {
 12 |             "task": "videoQA",
 13 |             "name": "how2qa_video_sub_train",
 14 |             "sub_txt_db": "/txt/how2_subtitles.db",
 15 |             "vfeat_db": "/video/how2",
 16 |             "query_txt_db": "/txt/how2qa_train.db",
 17 |             "ratio": 1
 18 |         },
 19 |         {
 20 |             "task": "violin",
 21 |             "name": "violin_video_sub_train",
 22 |             "sub_txt_db": "/txt/violin_subtitles.db",
 23 |             "vfeat_db": "/video/violin",
 24 |             "query_txt_db": "/txt/violin_train.db",
 25 |             "ratio": 3
 26 |         },
 27 |         {
 28 |             "task": "videoQA",
 29 |             "name": "vlep_video_sub_train",
 30 |             "sub_txt_db": "/txt/vlep_subtitles.db/",
 31 |             "vfeat_db": "/video/vlep",
 32 |             "query_txt_db": "/txt/vlep_train.db",
 33 |             "ratio": 1
 34 |         }
 35 |     ],
 36 |     "val_datasets": [
 37 |         {
 38 |             "task": "videoQA",
 39 |             "name": "tvqa_video_sub_val",
 40 |             "sub_txt_db": "/txt/tv_subtitles.db",
 41 |             "vfeat_db": "/video/tv",
 42 |             "query_txt_db": "/txt/tvqa_val.db"
 43 |         },
 44 |         {
 45 |             "task": "videoQA",
 46 |             "name": "how2qa_video_sub_val",
 47 |             "sub_txt_db": "/txt/how2_subtitles.db",
 48 |             "vfeat_db": "/video/how2",
 49 |             "query_txt_db": "/txt/how2qa_val.db"
 50 |         },
 51 |         {
 52 |             "task": "violin",
 53 |             "name": "violin_video_sub_val",
 54 |             "sub_txt_db": "/txt/violin_subtitles.db",
 55 |             "vfeat_db": "/video/violin",
 56 |             "query_txt_db": "/txt/violin_val.db"
 57 |         },
 58 |         {
 59 |             "task": "videoQA",
 60 |             "name": "vlep_video_sub_dev",
 61 |             "sub_txt_db": "/txt/vlep_subtitles.db/",
 62 |             "vfeat_db": "/video/vlep",
 63 |             "query_txt_db": "/txt/vlep_dev.db"
 64 |         }
 65 |     ],
 66 |     "compressed_db": false,
 67 |     "model_config": "config/model_config/hero_finetune.json",
 68 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
 69 |     "load_partial_pretrained": true,
 70 |     "skip_layer_loading": true,
 71 |     "output_dir": "/storage/MT_FT_only/qa_multi-task_default",
 72 |     "max_clip_len": 100,
 73 |     "max_txt_len": 120,
 74 |     "vfeat_version": "resnet_slowfast",
 75 |     "vfeat_interval": 1.5,
 76 |     "train_batch_size": 4,
 77 |     "val_batch_size": 10,
 78 |     "gradient_accumulation_steps": 2,
 79 |     "learning_rate": 5e-05,
 80 |     "valid_steps": 200,
 81 |     "save_steps": 200,
 82 |     "num_train_steps": 20000,
 83 |     "optim": "adamw",
 84 |     "betas": [
 85 |         0.9,
 86 |         0.98
 87 |     ],
 88 |     "dropout": 0.1,
 89 |     "weight_decay": 0.01,
 90 |     "lr_mul": 10.0,
 91 |     "grad_norm": 1.0,
 92 |     "warmup_steps": 2000,
 93 |     "lw_st_ed": 0.4,
 94 |     "sub_ctx_len": 0,
 95 |     "seed": 77,
 96 |     "no_fp16": false,
 97 |     "n_workers": 4,
 98 |     "no_pin_mem": false,
 99 |     "rank": 0
100 | }


--------------------------------------------------------------------------------
/config/FT_only_configs/train-retrieval-multitask-8gpu.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "train_datasets": [
  3 |         {
  4 |             "task": "vcmr",
  5 |             "name": "tvr_video_sub_train",
  6 |             "sub_txt_db": "/txt/tv_subtitles.db",
  7 |             "vfeat_db": "/video/tv",
  8 |             "query_txt_db": "/txt/tvr_train.db",
  9 |             "batch_size": 32,
 10 |             "ratio": 2
 11 |         },
 12 |         {
 13 |             "task": "vcmr",
 14 |             "name": "how2r_video_sub_train",
 15 |             "sub_txt_db": "/txt/how2_subtitles.db",
 16 |             "vfeat_db": "/video/how2",
 17 |             "query_txt_db": "/txt/how2r_train.db",
 18 |             "batch_size": 32,
 19 |             "ratio": 1
 20 |         },
 21 |         {
 22 |             "task": "vr",
 23 |             "name": "vatex_en_r_video_sub_train",
 24 |             "sub_txt_db": "/txt/vatex_subtitles.db/",
 25 |             "vfeat_db": "/video/vatex",
 26 |             "query_txt_db": "/txt/vatex_en_r_train.db",
 27 |             "batch_size": 64,
 28 |             "ratio": 3
 29 |         },
 30 |         {
 31 |             "task": "vr",
 32 |             "name": "yc2r_video_sub_train",
 33 |             "sub_txt_db": "/txt/yc2_subtitles.db/",
 34 |             "vfeat_db": "/video/yc2",
 35 |             "query_txt_db": "/txt/yc2r_train.db",
 36 |             "batch_size": 48,
 37 |             "ratio": 1
 38 |         }
 39 |     ],
 40 |     "val_datasets": [
 41 |         {
 42 |             "task": "vcmr",
 43 |             "name": "tvr_video_sub_val",
 44 |             "sub_txt_db": "/txt/tv_subtitles.db",
 45 |             "vfeat_db": "/video/tv",
 46 |             "query_txt_db": "/txt/tvr_val.db"
 47 |         },
 48 |         {
 49 |             "task": "vcmr",
 50 |             "name": "how2r_video_sub_val",
 51 |             "sub_txt_db": "/txt/how2_subtitles.db",
 52 |             "vfeat_db": "/video/how2",
 53 |             "query_txt_db": "/txt/how2r_val_1k.db"
 54 |         },
 55 |         {
 56 |             "task": "vr",
 57 |             "name": "vatex_en_r_video_sub_val",
 58 |             "sub_txt_db": "/txt/vatex_subtitles.db/",
 59 |             "vfeat_db": "/video/vatex",
 60 |             "query_txt_db": "/txt/vatex_en_r_val.db"
 61 |         },
 62 |         {
 63 |             "task": "vr",
 64 |             "name": "yc2r_video_sub_val",
 65 |             "sub_txt_db": "/txt/yc2_subtitles.db/",
 66 |             "vfeat_db": "/video/yc2",
 67 |             "query_txt_db": "/txt/yc2r_val.db"
 68 |         }
 69 |     ],
 70 |     "compressed_db": false,
 71 |     "model_config": "config/model_config/hero_finetune.json",
 72 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
 73 |     "load_partial_pretrained": true,
 74 |     "skip_layer_loading": true,
 75 |     "output_dir": "/storage/MT_FT_only/retrieval_multi-task_default",
 76 |     "eval_with_query_type": true,
 77 |     "max_before_nms": 200,
 78 |     "max_after_nms": 100,
 79 |     "distributed_eval": true,
 80 |     "nms_thd": -1,
 81 |     "q2c_alpha": 20,
 82 |     "max_vcmr_video": 100,
 83 |     "full_eval_tasks": [
 84 |         "VCMR",
 85 |         "SVMR",
 86 |         "VR"
 87 |     ],
 88 |     "max_clip_len": 100,
 89 |     "max_txt_len": 60,
 90 |     "vfeat_version": "resnet_slowfast",
 91 |     "vfeat_interval": 1.5,
 92 |     "min_pred_l": 2,
 93 |     "max_pred_l": 16,
 94 |     "drop_svmr_prob": 0.8,
 95 |     "train_batch_size": 32,
 96 |     "val_batch_size": 20,
 97 |     "vcmr_eval_video_batch_size": 50,
 98 |     "vcmr_eval_batch_size": 80,
 99 |     "vr_eval_video_batch_size": 50,
100 |     "vr_eval_batch_size": 80,
101 |     "gradient_accumulation_steps":2,
102 |     "learning_rate": 1e-04,
103 |     "valid_steps": 400,
104 |     "save_steps": 400,
105 |     "num_train_steps": 20000,
106 |     "optim": "adamw",
107 |     "betas": [
108 |         0.9,
109 |         0.98
110 |     ],
111 |     "dropout": 0.1,
112 |     "weight_decay": 0.01,
113 |     "grad_norm": 1.0,
114 |     "warmup_steps": 2000,
115 |     "lw_neg_q": 8.0,
116 |     "lw_neg_ctx": 8.0,
117 |     "lw_st_ed": 0.01,
118 |     "ranking_loss_type": "hinge",
119 |     "margin": 0.1,
120 |     "hard_pool_size": [
121 |         20
122 |     ],
123 |     "hard_neg_weights": [
124 |         10
125 |     ],
126 |     "hard_negative_start_step": [
127 |         8000
128 |     ],
129 |     "train_span_start_step": 0,
130 |     "sub_ctx_len": 0,
131 |     "use_all_neg": true,
132 |     "seed": 77,
133 |     "no_fp16": false,
134 |     "n_workers": 4,
135 |     "no_pin_mem": false,
136 |     "rank": 0
137 | }
138 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-tv_domain-multitask-8gpu.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "train_datasets": [
  3 |         {
  4 |             "task": "vcmr",
  5 |             "name": "tvr_video_sub_train",
  6 |             "sub_txt_db": "/txt/tv_subtitles.db",
  7 |             "vfeat_db": "/video/tv",
  8 |             "query_txt_db": "/txt/tvr_train.db",
  9 |             "batch_size": 32,
 10 |             "ratio": 5
 11 |         },
 12 |         {
 13 |             "task": "videoCap",
 14 |             "name": "tvc_video_sub_train",
 15 |             "sub_txt_db": "/txt/tv_subtitles.db",
 16 |             "vfeat_db": "/video/tv",
 17 |             "cap_txt_db": ["/txt/tvc_train.db"],
 18 |             "batch_size": 4,
 19 |             "ratio": 5
 20 |         },
 21 |         {
 22 |             "task": "videoQA",
 23 |             "name": "tvqa_video_sub_train",
 24 |             "sub_txt_db": "/txt/tv_subtitles.db",
 25 |             "vfeat_db": "/video/tv",
 26 |             "query_txt_db": "/txt/tvqa_train.db",
 27 |             "batch_size": 4,
 28 |             "ratio": 5
 29 |         },
 30 |         {
 31 |             "task": "violin",
 32 |             "name": "violin_video_sub_train",
 33 |             "sub_txt_db": "/txt/violin_subtitles.db",
 34 |             "vfeat_db": "/video/violin",
 35 |             "query_txt_db": "/txt/violin_train.db",
 36 |             "batch_size": 4,
 37 |             "ratio": 3
 38 |         },
 39 |         {
 40 |             "task": "videoQA",
 41 |             "name": "vlep_video_sub_train",
 42 |             "sub_txt_db": "/txt/vlep_subtitles.db/",
 43 |             "vfeat_db": "/video/vlep",
 44 |             "query_txt_db": "/txt/vlep_train.db",
 45 |             "batch_size": 4,
 46 |             "ratio": 1
 47 |         }
 48 |     ],
 49 |     "val_datasets": [
 50 |         {
 51 |             "task": "vcmr",
 52 |             "name": "tvr_video_sub_val",
 53 |             "sub_txt_db": "/txt/tv_subtitles.db",
 54 |             "vfeat_db": "/video/tv",
 55 |             "batch_size": 20,
 56 |             "query_txt_db": "/txt/tvr_val.db"
 57 |         },
 58 |         {
 59 |             "task": "videoCap",
 60 |             "name": "tvc_video_sub_val",
 61 |             "sub_txt_db": "/txt/tv_subtitles.db",
 62 |             "vfeat_db": "/video/tv",
 63 |             "batch_size": 8,
 64 |             "gt_anno": "/txt/tvc_val_release.jsonl"
 65 |         },
 66 |         {
 67 |             "task": "videoQA",
 68 |             "name": "tvqa_video_sub_val",
 69 |             "sub_txt_db": "/txt/tv_subtitles.db",
 70 |             "vfeat_db": "/video/tv",
 71 |             "batch_size": 10,
 72 |             "query_txt_db": "/txt/tvqa_val.db"
 73 |         },
 74 |         {
 75 |             "task": "violin",
 76 |             "name": "violin_video_sub_val",
 77 |             "sub_txt_db": "/txt/violin_subtitles.db",
 78 |             "vfeat_db": "/video/violin",
 79 |             "batch_size": 10,
 80 |             "query_txt_db": "/txt/violin_val.db"
 81 |         },
 82 |         {
 83 |             "task": "videoQA",
 84 |             "name": "vlep_video_sub_dev",
 85 |             "sub_txt_db": "/txt/vlep_subtitles.db/",
 86 |             "vfeat_db": "/video/vlep",
 87 |             "batch_size": 10,
 88 |             "query_txt_db": "/txt/vlep_dev.db"
 89 |         }
 90 |     ],
 91 |     "compressed_db": false,
 92 |     "model_config": "config/model_config/hero_finetune.json",
 93 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
 94 |     "load_partial_pretrained": true,
 95 |     "skip_layer_loading": true,
 96 |     "output_dir": "/storage/MT_FT_only/tv-domain_multi-task_default",
 97 |     "eval_with_query_type": true,
 98 |     "max_before_nms": 200,
 99 |     "max_after_nms": 100,
100 |     "distributed_eval": true,
101 |     "nms_thd": -1,
102 |     "q2c_alpha": 20,
103 |     "max_vcmr_video": 100,
104 |     "full_eval_tasks": [
105 |         "VCMR",
106 |         "SVMR",
107 |         "VR"
108 |     ],
109 |     "max_clip_len": 100,
110 |     "max_txt_len": 60,
111 |     "vfeat_version": "resnet_slowfast",
112 |     "vfeat_interval": 1.5,
113 |     "min_pred_l": 2,
114 |     "max_pred_l": 16,
115 |     "drop_svmr_prob": 0.8,
116 |     "train_batch_size": 32,
117 |     "val_batch_size": 20,
118 |     "vcmr_eval_video_batch_size": 50,
119 |     "vcmr_eval_batch_size": 80,
120 |     "vr_eval_video_batch_size": 50,
121 |     "vr_eval_batch_size": 80,
122 |     "gradient_accumulation_steps":2,
123 |     "learning_rate": 1e-04,
124 |     "valid_steps": 400,
125 |     "save_steps": 400,
126 |     "num_train_steps": 30000,
127 |     "optim": "adamw",
128 |     "betas": [
129 |         0.9,
130 |         0.98
131 |     ],
132 |     "dropout": 0.1,
133 |     "weight_decay": 0.01,
134 |     "grad_norm": 1.0,
135 |     "warmup_steps": 3000,
136 |     "lw_neg_q": 8.0,
137 |     "lw_neg_ctx": 8.0,
138 |     "lw_st_ed": 0.01,
139 |     "ranking_loss_type": "hinge",
140 |     "margin": 0.1,
141 |     "hard_pool_size": [
142 |         20
143 |     ],
144 |     "hard_neg_weights": [
145 |         10
146 |     ],
147 |     "hard_negative_start_step": [
148 |         8000
149 |     ],
150 |     "train_span_start_step": 0,
151 |     "sub_ctx_len": 0,
152 |     "use_all_neg": true,
153 |     "seed": 77,
154 |     "no_fp16": false,
155 |     "n_workers": 1,
156 |     "no_pin_mem": false,
157 |     "rank": 0,
158 |     "max_cap_per_vid": -1,
159 |     "max_gen_step": 30,
160 |     "lr_mul": 10.0,
161 |     "lsr": 0.1,
162 |     "qa_lw_st_ed": 0.4
163 | }
164 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-tvc-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoCap",
 5 |             "name": "tvc_video_sub_train",
 6 |             "sub_txt_db": "/txt/tv_subtitles.db",
 7 |             "vfeat_db": "/video/tv",
 8 |             "cap_txt_db": ["/txt/tvc_train.db"]
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoCap",
14 |             "name": "tvc_video_sub_val",
15 |             "sub_txt_db": "/txt/tv_subtitles.db",
16 |             "vfeat_db": "/video/tv",
17 |             "gt_anno": "/txt/tvc_val_release.jsonl"
18 |         }
19 |     ],
20 |     "model_config": "/src/config/model_config/hero_videoCap.json",
21 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
22 |     "load_partial_pretrained": true,
23 |     "skip_layer_loading": true,
24 |     "output_dir": "/storage/ST_FT_only/tvc_default",
25 |     "max_clip_len": 100,
26 |     "max_txt_len": 60,
27 |     "max_cap_per_vid": -1,
28 |     "max_gen_step": 30,
29 |     "vfeat_version": "resnet_slowfast",
30 |     "vfeat_interval": 1.5,
31 |     "compressed_db": false,
32 |     "train_batch_size": 4,
33 |     "val_batch_size": 8,
34 |     "gradient_accumulation_steps": 1,
35 |     "learning_rate": 1e-4,
36 |     "lr_mul": 10.0,
37 |     "valid_steps": 500,
38 |     "num_train_steps": 7000,
39 |     "optim": "adamw",
40 |     "betas": [0.9, 0.98],
41 |     "lsr": 0.1,
42 |     "dropout": 0.1,
43 |     "weight_decay": 0.01,
44 |     "grad_norm": 1.0,
45 |     "warmup_steps": 700,
46 |     "sub_ctx_len": 1,
47 |     "seed": 77,
48 |     "no_fp16": false,
49 |     "n_workers": 4,
50 |     "pin_mem": true
51 | }
52 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-tvc_sub_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoCap",
 5 |             "name": "tvc_sub_only_train",
 6 |             "sub_txt_db": "/txt/tv_subtitles.db",
 7 |             "vfeat_db": "/video/tv",
 8 |             "cap_txt_db": ["/txt/tvc_train.db"]
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoCap",
14 |             "name": "tvc_sub_only_val",
15 |             "sub_txt_db": "/txt/tv_subtitles.db",
16 |             "vfeat_db": "/video/tv",
17 |             "gt_anno": "/txt/tvc_val_release.jsonl"
18 |         }
19 |     ],
20 |     "model_config": "/src/config/model_config/hero_videoCap.json",
21 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
22 |     "load_partial_pretrained": true,
23 |     "skip_layer_loading": true,
24 |     "output_dir": "/storage/multi_channel_ablation_sub_only/tvc_default",
25 |     "max_clip_len": 100,
26 |     "max_txt_len": 60,
27 |     "max_cap_per_vid": -1,
28 |     "max_gen_step": 30,
29 |     "vfeat_version": "resnet_slowfast",
30 |     "vfeat_interval": 1.5,
31 |     "compressed_db": false,
32 |     "train_batch_size": 4,
33 |     "val_batch_size": 8,
34 |     "gradient_accumulation_steps": 1,
35 |     "learning_rate": 1e-4,
36 |     "lr_mul": 10.0,
37 |     "valid_steps": 500,
38 |     "num_train_steps": 7000,
39 |     "optim": "adamw",
40 |     "betas": [0.9, 0.98],
41 |     "lsr": 0.1,
42 |     "dropout": 0.1,
43 |     "weight_decay": 0.01,
44 |     "grad_norm": 1.0,
45 |     "warmup_steps": 700,
46 |     "sub_ctx_len": 1,
47 |     "seed": 77,
48 |     "no_fp16": false,
49 |     "n_workers": 4,
50 |     "pin_mem": true
51 | }
52 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-tvc_video_only_8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoCap",
 5 |             "name": "tvc_video_only_train",
 6 |             "sub_txt_db": null,
 7 |             "vfeat_db": "/video/tv",
 8 |             "cap_txt_db": ["/txt/tvc_train.db"]
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoCap",
14 |             "name": "tvc_video_only_val",
15 |             "sub_txt_db": null,
16 |             "vfeat_db": "/video/tv",
17 |             "gt_anno": "/txt/tvc_val_release.jsonl"
18 |         }
19 |     ],
20 |     "model_config": "/src/config/model_config/hero_videoCap.json",
21 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
22 |     "load_partial_pretrained": true,
23 |     "skip_layer_loading": true,
24 |     "output_dir": "/storage/multi_channel_ablation_video_only/tvc_default",
25 |     "max_clip_len": 100,
26 |     "max_txt_len": 60,
27 |     "max_cap_per_vid": -1,
28 |     "max_gen_step": 30,
29 |     "vfeat_version": "resnet_slowfast",
30 |     "vfeat_interval": 1.5,
31 |     "compressed_db": false,
32 |     "train_batch_size": 4,
33 |     "val_batch_size": 8,
34 |     "gradient_accumulation_steps": 1,
35 |     "learning_rate": 1e-4,
36 |     "lr_mul": 10.0,
37 |     "valid_steps": 500,
38 |     "num_train_steps": 7000,
39 |     "optim": "adamw",
40 |     "betas": [0.9, 0.98],
41 |     "lsr": 0.1,
42 |     "dropout": 0.1,
43 |     "weight_decay": 0.01,
44 |     "grad_norm": 1.0,
45 |     "warmup_steps": 700,
46 |     "sub_ctx_len": 1,
47 |     "seed": 77,
48 |     "no_fp16": false,
49 |     "n_workers": 4,
50 |     "pin_mem": true
51 | }
52 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-tvqa-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoQA",
 5 |             "name": "tvqa_video_sub_train",
 6 |             "sub_txt_db": "/txt/tv_subtitles.db",
 7 |             "vfeat_db": "/video/tv",
 8 |             "query_txt_db": "/txt/tvqa_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoQA",
14 |             "name": "tvqa_video_sub_val",
15 |             "sub_txt_db": "/txt/tv_subtitles.db",
16 |             "vfeat_db": "/video/tv",
17 |             "query_txt_db": "/txt/tvqa_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/ST_FT_only/tvqa_default",
26 |     "max_clip_len": 100,
27 |     "max_txt_len": 120,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "train_batch_size": 4,
31 |     "val_batch_size": 10,
32 |     "gradient_accumulation_steps": 2,
33 |     "learning_rate": 5e-05,
34 |     "valid_steps": 200,
35 |     "save_steps": 200,
36 |     "num_train_steps": 10000,
37 |     "optim": "adamw",
38 |     "betas": [
39 |         0.9,
40 |         0.98
41 |     ],
42 |     "dropout": 0.1,
43 |     "weight_decay": 0.01,
44 |     "lr_mul": 10.0,
45 |     "grad_norm": 1.0,
46 |     "warmup_steps": 1000,
47 |     "lw_st_ed": 0.4,
48 |     "sub_ctx_len": 0,
49 |     "seed": 77,
50 |     "no_fp16": false,
51 |     "n_workers": 4,
52 |     "no_pin_mem": false,
53 |     "rank": 0
54 | }
55 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-tvqa_sub_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoQA",
 5 |             "name": "tvqa_sub_only_train",
 6 |             "sub_txt_db": "/txt/tv_subtitles.db",
 7 |             "vfeat_db": "/video/tv",
 8 |             "query_txt_db": "/txt/tvqa_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoQA",
14 |             "name": "tvqa_sub_only_val",
15 |             "sub_txt_db": "/txt/tv_subtitles.db",
16 |             "vfeat_db": "/video/tv",
17 |             "query_txt_db": "/txt/tvqa_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/multi_channel_ablation_sub_only/tvqa_default",
26 |     "max_clip_len": 100,
27 |     "max_txt_len": 120,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "train_batch_size": 4,
31 |     "val_batch_size": 10,
32 |     "gradient_accumulation_steps": 2,
33 |     "learning_rate": 5e-05,
34 |     "valid_steps": 200,
35 |     "save_steps": 200,
36 |     "num_train_steps": 10000,
37 |     "optim": "adamw",
38 |     "betas": [
39 |         0.9,
40 |         0.98
41 |     ],
42 |     "dropout": 0.1,
43 |     "weight_decay": 0.01,
44 |     "lr_mul": 10.0,
45 |     "grad_norm": 1.0,
46 |     "warmup_steps": 1000,
47 |     "lw_st_ed": 0.4,
48 |     "sub_ctx_len": 0,
49 |     "seed": 77,
50 |     "no_fp16": false,
51 |     "n_workers": 4,
52 |     "no_pin_mem": false,
53 |     "rank": 0
54 | }
55 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-tvqa_video_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoQA",
 5 |             "name": "tvqa_video_only_train",
 6 |             "sub_txt_db": "/txt/tv_subtitles.db",
 7 |             "vfeat_db": "/video/tv",
 8 |             "query_txt_db": "/txt/tvqa_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoQA",
14 |             "name": "tvqa_video_only_val",
15 |             "sub_txt_db": "/txt/tv_subtitles.db",
16 |             "vfeat_db": "/video/tv",
17 |             "query_txt_db": "/txt/tvqa_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/multi_channel_ablation_video_only/tvqa_default",
26 |     "max_clip_len": 100,
27 |     "max_txt_len": 120,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "train_batch_size": 4,
31 |     "val_batch_size": 10,
32 |     "gradient_accumulation_steps": 2,
33 |     "learning_rate": 5e-05,
34 |     "valid_steps": 200,
35 |     "save_steps": 200,
36 |     "num_train_steps": 10000,
37 |     "optim": "adamw",
38 |     "betas": [
39 |         0.9,
40 |         0.98
41 |     ],
42 |     "dropout": 0.1,
43 |     "weight_decay": 0.01,
44 |     "lr_mul": 10.0,
45 |     "grad_norm": 1.0,
46 |     "warmup_steps": 1000,
47 |     "lw_st_ed": 0.4,
48 |     "sub_ctx_len": 0,
49 |     "seed": 77,
50 |     "no_fp16": false,
51 |     "n_workers": 4,
52 |     "no_pin_mem": false,
53 |     "rank": 0
54 | }
55 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-tvr-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "vcmr",
 5 |             "name": "tvr_video_sub_train",
 6 |             "sub_txt_db": "/txt/tv_subtitles.db",
 7 |             "vfeat_db": "/video/tv",
 8 |             "query_txt_db": "/txt/tvr_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "vcmr",
14 |             "name": "tvr_video_sub_val",
15 |             "sub_txt_db": "/txt/tv_subtitles.db",
16 |             "vfeat_db": "/video/tv",
17 |             "query_txt_db": "/txt/tvr_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/ST_FT_only/tvr_default",
26 |     "eval_with_query_type": true,
27 |     "max_before_nms": 200,
28 |     "max_after_nms": 100,
29 |     "distributed_eval": true,
30 |     "nms_thd": -1,
31 |     "q2c_alpha": 20,
32 |     "max_vcmr_video": 100,
33 |     "full_eval_tasks": [
34 |         "VCMR",
35 |         "SVMR",
36 |         "VR"
37 |     ],
38 |     "max_clip_len": 100,
39 |     "max_txt_len": 60,
40 |     "vfeat_version": "resnet_slowfast",
41 |     "vfeat_interval": 1.5,
42 |     "min_pred_l": 2,
43 |     "max_pred_l": 16,
44 |     "drop_svmr_prob": 0.8,
45 |     "train_batch_size": 32,
46 |     "val_batch_size": 20,
47 |     "vcmr_eval_video_batch_size": 50,
48 |     "vcmr_eval_batch_size": 80,
49 |     "gradient_accumulation_steps":2,
50 |     "learning_rate": 1e-04,
51 |     "valid_steps": 400,
52 |     "save_steps": 400,
53 |     "num_train_steps": 10000,
54 |     "optim": "adamw",
55 |     "betas": [
56 |         0.9,
57 |         0.98
58 |     ],
59 |     "dropout": 0.1,
60 |     "weight_decay": 0.01,
61 |     "grad_norm": 1.0,
62 |     "warmup_steps": 1000,
63 |     "lw_neg_q": 8.0,
64 |     "lw_neg_ctx": 8.0,
65 |     "lw_st_ed": 0.01,
66 |     "ranking_loss_type": "hinge",
67 |     "margin": 0.1,
68 |     "hard_pool_size": [
69 |         20
70 |     ],
71 |     "hard_neg_weights": [
72 |         10
73 |     ],
74 |     "hard_negative_start_step": [
75 |         4000
76 |     ],
77 |     "train_span_start_step": 0,
78 |     "sub_ctx_len": 0,
79 |     "use_all_neg": true,
80 |     "seed": 77,
81 |     "no_fp16": false,
82 |     "n_workers": 4,
83 |     "no_pin_mem": false,
84 |     "rank": 0
85 | }
86 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-tvr_sub_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "vcmr",
 5 |             "name": "tvr_sub_only_train",
 6 |             "sub_txt_db": "/txt/tv_subtitles.db",
 7 |             "vfeat_db": "/video/tv",
 8 |             "query_txt_db": "/txt/tvr_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "vcmr",
14 |             "name": "tvr_sub_only_val",
15 |             "sub_txt_db": "/txt/tv_subtitles.db",
16 |             "vfeat_db": "/video/tv",
17 |             "query_txt_db": "/txt/tvr_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/multi_channel_ablation_sub_only/tvr_default",
26 |     "eval_with_query_type": true,
27 |     "max_before_nms": 200,
28 |     "max_after_nms": 100,
29 |     "distributed_eval": true,
30 |     "nms_thd": -1,
31 |     "q2c_alpha": 20,
32 |     "max_vcmr_video": 100,
33 |     "full_eval_tasks": [
34 |         "VCMR",
35 |         "SVMR",
36 |         "VR"
37 |     ],
38 |     "max_clip_len": 100,
39 |     "max_txt_len": 60,
40 |     "vfeat_version": "resnet_slowfast",
41 |     "vfeat_interval": 1.5,
42 |     "min_pred_l": 2,
43 |     "max_pred_l": 16,
44 |     "drop_svmr_prob": 0.8,
45 |     "train_batch_size": 32,
46 |     "val_batch_size": 20,
47 |     "vcmr_eval_video_batch_size": 50,
48 |     "vcmr_eval_batch_size": 80,
49 |     "gradient_accumulation_steps":2,
50 |     "learning_rate": 1e-04,
51 |     "valid_steps": 400,
52 |     "save_steps": 400,
53 |     "num_train_steps": 10000,
54 |     "optim": "adamw",
55 |     "betas": [
56 |         0.9,
57 |         0.98
58 |     ],
59 |     "dropout": 0.1,
60 |     "weight_decay": 0.01,
61 |     "grad_norm": 1.0,
62 |     "warmup_steps": 1000,
63 |     "lw_neg_q": 8.0,
64 |     "lw_neg_ctx": 8.0,
65 |     "lw_st_ed": 0.01,
66 |     "ranking_loss_type": "hinge",
67 |     "margin": 0.1,
68 |     "hard_pool_size": [
69 |         20
70 |     ],
71 |     "hard_neg_weights": [
72 |         10
73 |     ],
74 |     "hard_negative_start_step": [
75 |         4000
76 |     ],
77 |     "train_span_start_step": 0,
78 |     "sub_ctx_len": 0,
79 |     "use_all_neg": true,
80 |     "seed": 77,
81 |     "no_fp16": false,
82 |     "n_workers": 4,
83 |     "no_pin_mem": false,
84 |     "rank": 0
85 | }
86 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-tvr_video_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "vcmr",
 5 |             "name": "tvr_video_only_train",
 6 |             "sub_txt_db": "/txt/tv_subtitles.db",
 7 |             "vfeat_db": "/video/tv",
 8 |             "query_txt_db": "/txt/tvr_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "vcmr",
14 |             "name": "tvr_video_only_val",
15 |             "sub_txt_db": "/txt/tv_subtitles.db",
16 |             "vfeat_db": "/video/tv",
17 |             "query_txt_db": "/txt/tvr_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/multi_channel_ablation_video_only/tvr_default",
26 |     "eval_with_query_type": true,
27 |     "max_before_nms": 200,
28 |     "max_after_nms": 100,
29 |     "distributed_eval": true,
30 |     "nms_thd": -1,
31 |     "q2c_alpha": 20,
32 |     "max_vcmr_video": 100,
33 |     "full_eval_tasks": [
34 |         "VCMR",
35 |         "SVMR",
36 |         "VR"
37 |     ],
38 |     "max_clip_len": 100,
39 |     "max_txt_len": 60,
40 |     "vfeat_version": "resnet_slowfast",
41 |     "vfeat_interval": 1.5,
42 |     "min_pred_l": 2,
43 |     "max_pred_l": 16,
44 |     "drop_svmr_prob": 0.8,
45 |     "train_batch_size": 32,
46 |     "val_batch_size": 20,
47 |     "vcmr_eval_video_batch_size": 50,
48 |     "vcmr_eval_batch_size": 80,
49 |     "gradient_accumulation_steps":2,
50 |     "learning_rate": 1e-04,
51 |     "valid_steps": 400,
52 |     "save_steps": 400,
53 |     "num_train_steps": 10000,
54 |     "optim": "adamw",
55 |     "betas": [
56 |         0.9,
57 |         0.98
58 |     ],
59 |     "dropout": 0.1,
60 |     "weight_decay": 0.01,
61 |     "grad_norm": 1.0,
62 |     "warmup_steps": 1000,
63 |     "lw_neg_q": 8.0,
64 |     "lw_neg_ctx": 8.0,
65 |     "lw_st_ed": 0.01,
66 |     "ranking_loss_type": "hinge",
67 |     "margin": 0.1,
68 |     "hard_pool_size": [
69 |         20
70 |     ],
71 |     "hard_neg_weights": [
72 |         10
73 |     ],
74 |     "hard_negative_start_step": [
75 |         4000
76 |     ],
77 |     "train_span_start_step": 0,
78 |     "sub_ctx_len": 0,
79 |     "use_all_neg": true,
80 |     "seed": 77,
81 |     "no_fp16": false,
82 |     "n_workers": 4,
83 |     "no_pin_mem": false,
84 |     "rank": 0
85 | }
86 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-vatex_en_c-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoCap",
 5 |             "name": "vatex_en_c_video_sub_train",
 6 |             "sub_txt_db": "/txt/vatex_subtitles.db",
 7 |             "vfeat_db": "/video/vatex",
 8 |             "cap_txt_db": ["/txt/vatex_en_r_train.db", "/txt/vatex_en_r_val.db"]
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoCap",
14 |             "name": "vatex_en_c_video_sub_val",
15 |             "sub_txt_db": "/txt/vatex_subtitles.db",
16 |             "vfeat_db": "/video/vatex",
17 |             "gt_anno": "/txt/vatex_en_c_test_public_release.jsonl"
18 |         }
19 |     ],
20 |     "model_config": "config/model_config/hero_videoCap.json",
21 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
22 |     "load_partial_pretrained": true,
23 |     "skip_layer_loading": true,
24 |     "output_dir": "/storage/ST_FT_only/vatex_en_c_default",
25 |     "max_clip_len": 100,
26 |     "max_txt_len": 60,
27 |     "max_gen_step": 30,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "compressed_db": false,
31 |     "train_batch_size": 128,
32 |     "val_batch_size": 128,
33 |     "gradient_accumulation_steps": 1,
34 |     "learning_rate": 1e-4,
35 |     "lr_mul": 10.0,
36 |     "valid_steps": 500,
37 |     "num_train_steps": 7000,
38 |     "optim": "adamw",
39 |     "betas": [0.9, 0.98],
40 |     "lsr": 0.1,
41 |     "dropout": 0.1,
42 |     "weight_decay": 0.01,
43 |     "grad_norm": 1.0,
44 |     "warmup_steps": 700,
45 |     "sub_ctx_len": 1,
46 |     "seed": 77,
47 |     "no_fp16": false,
48 |     "n_workers": 4,
49 |     "pin_mem": true
50 | }
51 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-vatex_en_c_sub_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoCap",
 5 |             "name": "vatex_en_c_sub_only_train",
 6 |             "sub_txt_db": "/txt/vatex_subtitles.db",
 7 |             "vfeat_db": "/video/vatex",
 8 |             "cap_txt_db": ["/txt/vatex_en_r_train.db", "/txt/vatex_en_r_val.db"]
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoCap",
14 |             "name": "vatex_en_c_sub_only_val",
15 |             "sub_txt_db": "/txt/vatex_subtitles.db",
16 |             "vfeat_db": "/video/vatex",
17 |             "gt_anno": "/txt/vatex_en_c_test_public_release.jsonl"
18 |         }
19 |     ],
20 |     "model_config": "config/model_config/hero_videoCap.json",
21 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
22 |     "load_partial_pretrained": true,
23 |     "skip_layer_loading": true,
24 |     "output_dir": "/storage/multi_channel_ablation_sub_only/vatex_en_c_default",
25 |     "max_clip_len": 100,
26 |     "max_txt_len": 60,
27 |     "max_gen_step": 30,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "compressed_db": false,
31 |     "train_batch_size": 128,
32 |     "val_batch_size": 128,
33 |     "gradient_accumulation_steps": 1,
34 |     "learning_rate": 1e-4,
35 |     "lr_mul": 10.0,
36 |     "valid_steps": 500,
37 |     "num_train_steps": 7000,
38 |     "optim": "adamw",
39 |     "betas": [0.9, 0.98],
40 |     "lsr": 0.1,
41 |     "dropout": 0.1,
42 |     "weight_decay": 0.01,
43 |     "grad_norm": 1.0,
44 |     "warmup_steps": 700,
45 |     "sub_ctx_len": 1,
46 |     "seed": 77,
47 |     "no_fp16": false,
48 |     "n_workers": 4,
49 |     "pin_mem": true
50 | }
51 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-vatex_en_c_video_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoCap",
 5 |             "name": "vatex_en_c_video_only_train",
 6 |             "sub_txt_db": null,
 7 |             "vfeat_db": "/video/vatex",
 8 |             "cap_txt_db": ["/txt/vatex_en_r_train.db", "/txt/vatex_en_r_val.db"]
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoCap",
14 |             "name": "vatex_en_c_video_only_val",
15 |             "sub_txt_db": null,
16 |             "vfeat_db": "/video/vatex",
17 |             "gt_anno": "/txt/vatex_en_c_test_public_release.jsonl"
18 |         }
19 |     ],
20 |     "model_config": "config/model_config/hero_videoCap.json",
21 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
22 |     "load_partial_pretrained": true,
23 |     "skip_layer_loading": true,
24 |     "output_dir": "/storage/multi_channel_ablation_video_only/vatex_en_c_default",
25 |     "max_clip_len": 100,
26 |     "max_txt_len": 60,
27 |     "max_gen_step": 30,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "compressed_db": false,
31 |     "train_batch_size": 128,
32 |     "val_batch_size": 128,
33 |     "gradient_accumulation_steps": 1,
34 |     "learning_rate": 1e-4,
35 |     "lr_mul": 10.0,
36 |     "valid_steps": 500,
37 |     "num_train_steps": 7000,
38 |     "optim": "adamw",
39 |     "betas": [0.9, 0.98],
40 |     "lsr": 0.1,
41 |     "dropout": 0.1,
42 |     "weight_decay": 0.01,
43 |     "grad_norm": 1.0,
44 |     "warmup_steps": 700,
45 |     "sub_ctx_len": 1,
46 |     "seed": 77,
47 |     "no_fp16": false,
48 |     "n_workers": 4,
49 |     "pin_mem": true
50 | }
51 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-vatex_en_r-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "vr",
 5 |             "name": "vatex_en_r_video_sub_train",
 6 |             "sub_txt_db": "/txt/vatex_subtitles.db/",
 7 |             "vfeat_db": "/video/vatex",
 8 |             "query_txt_db": "/txt/vatex_en_r_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "vr",
14 |             "name": "vatex_en_r_video_sub_val",
15 |             "sub_txt_db": "/txt/vatex_subtitles.db/",
16 |             "vfeat_db": "/video/vatex",
17 |             "query_txt_db": "/txt/vatex_en_r_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/ST_FT_only/vatex_en_default",
26 |     "distributed_eval": true,
27 |     "max_vr_video": 100,
28 |     "max_clip_len": 100,
29 |     "max_txt_len": 60,
30 |     "vfeat_version": "resnet_slowfast",
31 |     "vfeat_interval": 1.5,
32 |     "train_batch_size": 64,
33 |     "val_batch_size": 20,
34 |     "vr_eval_video_batch_size": 50,
35 |     "vr_eval_q_batch_size": 80,
36 |     "gradient_accumulation_steps": 2,
37 |     "learning_rate": 7e-05,
38 |     "valid_steps": 200,
39 |     "save_steps": 200,
40 |     "num_train_steps": 4000,
41 |     "optim": "adamw",
42 |     "betas": [
43 |         0.9,
44 |         0.98
45 |     ],
46 |     "dropout": 0.1,
47 |     "weight_decay": 0.01,
48 |     "grad_norm": 1.0,
49 |     "warmup_steps": 400,
50 |     "lw_neg_q": 10.0,
51 |     "lw_neg_ctx": 10.0,
52 |     "ranking_loss_type": "hinge",
53 |     "margin": 0.1,
54 |     "hard_pool_size": [
55 |         80
56 |     ],
57 |     "hard_neg_weights": [
58 |         10
59 |     ],
60 |     "hard_negative_start_step": [
61 |         2000
62 |     ],
63 |     "use_all_neg": true,
64 |     "sub_ctx_len": 1,
65 |     "seed": 77,
66 |     "no_fp16": false,
67 |     "n_workers": 4,
68 |     "no_pin_mem": false,
69 |     "rank": 0
70 | }
71 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-vatex_en_r_sub_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "vr",
 5 |             "name": "vatex_en_r_sub_only_train",
 6 |             "sub_txt_db": "/txt/vatex_subtitles.db/",
 7 |             "vfeat_db": "/video/vatex",
 8 |             "query_txt_db": "/txt/vatex_en_r_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "vr",
14 |             "name": "vatex_en_r_sub_only_val",
15 |             "sub_txt_db": "/txt/vatex_subtitles.db/",
16 |             "vfeat_db": "/video/vatex",
17 |             "query_txt_db": "/txt/vatex_en_r_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/multi_channel_ablation_sub_only/vatex_en_r_default",
26 |     "distributed_eval": true,
27 |     "max_vr_video": 100,
28 |     "max_clip_len": 100,
29 |     "max_txt_len": 60,
30 |     "vfeat_version": "resnet_slowfast",
31 |     "vfeat_interval": 1.5,
32 |     "train_batch_size": 64,
33 |     "val_batch_size": 20,
34 |     "vr_eval_video_batch_size": 50,
35 |     "vr_eval_q_batch_size": 80,
36 |     "gradient_accumulation_steps": 2,
37 |     "learning_rate": 7e-05,
38 |     "valid_steps": 200,
39 |     "save_steps": 200,
40 |     "num_train_steps": 4000,
41 |     "optim": "adamw",
42 |     "betas": [
43 |         0.9,
44 |         0.98
45 |     ],
46 |     "dropout": 0.1,
47 |     "weight_decay": 0.01,
48 |     "grad_norm": 1.0,
49 |     "warmup_steps": 400,
50 |     "lw_neg_q": 10.0,
51 |     "lw_neg_ctx": 10.0,
52 |     "ranking_loss_type": "hinge",
53 |     "margin": 0.1,
54 |     "hard_pool_size": [
55 |         80
56 |     ],
57 |     "hard_neg_weights": [
58 |         10
59 |     ],
60 |     "hard_negative_start_step": [
61 |         2000
62 |     ],
63 |     "use_all_neg": true,
64 |     "sub_ctx_len": 1,
65 |     "seed": 77,
66 |     "no_fp16": false,
67 |     "n_workers": 4,
68 |     "no_pin_mem": false,
69 |     "rank": 0
70 | }
71 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-vatex_en_r_video_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "vr",
 5 |             "name": "vatex_en_r_video_only_train",
 6 |             "sub_txt_db": "/txt/vatex_subtitles.db/",
 7 |             "vfeat_db": "/video/vatex",
 8 |             "query_txt_db": "/txt/vatex_en_r_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "vr",
14 |             "name": "vatex_en_r_video_only_val",
15 |             "sub_txt_db": "/txt/vatex_subtitles.db/",
16 |             "vfeat_db": "/video/vatex",
17 |             "query_txt_db": "/txt/vatex_en_r_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/multi_channel_ablation_video_only/vatex_en_r_default",
26 |     "distributed_eval": true,
27 |     "max_vr_video": 100,
28 |     "max_clip_len": 100,
29 |     "max_txt_len": 60,
30 |     "vfeat_version": "resnet_slowfast",
31 |     "vfeat_interval": 1.5,
32 |     "train_batch_size": 64,
33 |     "val_batch_size": 20,
34 |     "vr_eval_video_batch_size": 50,
35 |     "vr_eval_q_batch_size": 80,
36 |     "gradient_accumulation_steps": 2,
37 |     "learning_rate": 7e-05,
38 |     "valid_steps": 200,
39 |     "save_steps": 200,
40 |     "num_train_steps": 4000,
41 |     "optim": "adamw",
42 |     "betas": [
43 |         0.9,
44 |         0.98
45 |     ],
46 |     "dropout": 0.1,
47 |     "weight_decay": 0.01,
48 |     "grad_norm": 1.0,
49 |     "warmup_steps": 400,
50 |     "lw_neg_q": 10.0,
51 |     "lw_neg_ctx": 10.0,
52 |     "ranking_loss_type": "hinge",
53 |     "margin": 0.1,
54 |     "hard_pool_size": [
55 |         80
56 |     ],
57 |     "hard_neg_weights": [
58 |         10
59 |     ],
60 |     "hard_negative_start_step": [
61 |         2000
62 |     ],
63 |     "use_all_neg": true,
64 |     "sub_ctx_len": 1,
65 |     "seed": 77,
66 |     "no_fp16": false,
67 |     "n_workers": 4,
68 |     "no_pin_mem": false,
69 |     "rank": 0
70 | }
71 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-violin-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "violin",
 5 |             "name": "violin_video_sub_train",
 6 |             "sub_txt_db": "/txt/violin_subtitles.db",
 7 |             "vfeat_db": "/video/violin",
 8 |             "query_txt_db": "/txt/violin_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "violin",
14 |             "name": "violin_video_sub_val",
15 |             "sub_txt_db": "/txt/violin_subtitles.db",
16 |             "vfeat_db": "/video/violin",
17 |             "query_txt_db": "/txt/violin_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/ST_FT_only/violin_default",
26 |     "max_clip_len": 100,
27 |     "max_txt_len": 120,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "train_batch_size": 4,
31 |     "val_batch_size": 10,
32 |     "gradient_accumulation_steps": 2,
33 |     "learning_rate": 3e-05,
34 |     "valid_steps": 200,
35 |     "save_steps": 200,
36 |     "num_train_steps": 6000,
37 |     "optim": "adamw",
38 |     "betas": [
39 |         0.9,
40 |         0.98
41 |     ],
42 |     "dropout": 0.1,
43 |     "weight_decay": 0.01,
44 |     "lr_mul": 8.0,
45 |     "grad_norm": 1.0,
46 |     "warmup_steps": 600,
47 |     "sub_ctx_len": 2,
48 |     "seed": 77,
49 |     "no_fp16": false,
50 |     "n_workers": 4,
51 |     "no_pin_mem": false,
52 |     "rank": 0
53 | }
54 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-violin_sub_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "violin",
 5 |             "name": "violin_sub_only_train",
 6 |             "sub_txt_db": "/txt/violin_subtitles.db",
 7 |             "vfeat_db": "/video/violin",
 8 |             "query_txt_db": "/txt/violin_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "violin",
14 |             "name": "violin_sub_only__val",
15 |             "sub_txt_db": "/txt/violin_subtitles.db",
16 |             "vfeat_db": "/video/violin",
17 |             "query_txt_db": "/txt/violin_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/multi_channel_ablation_sub_only/violin_default",
26 |     "max_clip_len": 100,
27 |     "max_txt_len": 120,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "train_batch_size": 4,
31 |     "val_batch_size": 10,
32 |     "gradient_accumulation_steps": 2,
33 |     "learning_rate": 3e-05,
34 |     "valid_steps": 200,
35 |     "save_steps": 200,
36 |     "num_train_steps": 6000,
37 |     "optim": "adamw",
38 |     "betas": [
39 |         0.9,
40 |         0.98
41 |     ],
42 |     "dropout": 0.1,
43 |     "weight_decay": 0.01,
44 |     "lr_mul": 8.0,
45 |     "grad_norm": 1.0,
46 |     "warmup_steps": 600,
47 |     "sub_ctx_len": 2,
48 |     "seed": 77,
49 |     "no_fp16": false,
50 |     "n_workers": 4,
51 |     "no_pin_mem": false,
52 |     "rank": 0
53 | }
54 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-violin_video_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "violin",
 5 |             "name": "violin_video_only_train",
 6 |             "sub_txt_db": "/txt/violin_subtitles.db",
 7 |             "vfeat_db": "/video/violin",
 8 |             "query_txt_db": "/txt/violin_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "violin",
14 |             "name": "violin_video_only_val",
15 |             "sub_txt_db": "/txt/violin_subtitles.db",
16 |             "vfeat_db": "/video/violin",
17 |             "query_txt_db": "/txt/violin_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/multi_channel_ablation_video_only/violin_default",
26 |     "max_clip_len": 100,
27 |     "max_txt_len": 120,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "train_batch_size": 4,
31 |     "val_batch_size": 10,
32 |     "gradient_accumulation_steps": 2,
33 |     "learning_rate": 3e-05,
34 |     "valid_steps": 200,
35 |     "save_steps": 200,
36 |     "num_train_steps": 6000,
37 |     "optim": "adamw",
38 |     "betas": [
39 |         0.9,
40 |         0.98
41 |     ],
42 |     "dropout": 0.1,
43 |     "weight_decay": 0.01,
44 |     "lr_mul": 8.0,
45 |     "grad_norm": 1.0,
46 |     "warmup_steps": 600,
47 |     "sub_ctx_len": 2,
48 |     "seed": 77,
49 |     "no_fp16": false,
50 |     "n_workers": 4,
51 |     "no_pin_mem": false,
52 |     "rank": 0
53 | }
54 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-vlep-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoQA",
 5 |             "name": "vlep_video_sub_train",
 6 |             "sub_txt_db": "/txt/vlep_subtitles.db/",
 7 |             "vfeat_db": "/video/vlep",
 8 |             "query_txt_db": "/txt/vlep_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoQA",
14 |             "name": "vlep_video_sub_dev",
15 |             "sub_txt_db": "/txt/vlep_subtitles.db/",
16 |             "vfeat_db": "/video/vlep",
17 |             "query_txt_db": "/txt/vlep_dev.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/ST_FT_only/vlep_default",
26 |     "max_clip_len": 100,
27 |     "max_txt_len": 120,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "train_batch_size": 4,
31 |     "val_batch_size": 10,
32 |     "gradient_accumulation_steps": 2,
33 |     "learning_rate": 5e-05,
34 |     "valid_steps": 100,
35 |     "save_steps": 200,
36 |     "num_train_steps": 2000,
37 |     "optim": "adamw",
38 |     "betas": [
39 |         0.9,
40 |         0.98
41 |     ],
42 |     "dropout": 0.1,
43 |     "weight_decay": 0.01,
44 |     "lr_mul": 10.0,
45 |     "grad_norm": 1.0,
46 |     "warmup_steps": 200,
47 |     "lw_st_ed": 0.4,
48 |     "sub_ctx_len": 0,
49 |     "seed": 77,
50 |     "no_fp16": false,
51 |     "n_workers": 4,
52 |     "no_pin_mem": false,
53 |     "rank": 0
54 | }
55 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-vlep_sub_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoQA",
 5 |             "name": "vlep_sub_only_train",
 6 |             "sub_txt_db": "/txt/vlep_subtitles.db/",
 7 |             "vfeat_db": "/video/vlep",
 8 |             "query_txt_db": "/txt/vlep_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoQA",
14 |             "name": "vlep_sub_only_dev",
15 |             "sub_txt_db": "/txt/vlep_subtitles.db/",
16 |             "vfeat_db": "/video/vlep",
17 |             "query_txt_db": "/txt/vlep_dev.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/multi_channel_ablation_sub_only/vlep_default",
26 |     "max_clip_len": 100,
27 |     "max_txt_len": 120,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "train_batch_size": 4,
31 |     "val_batch_size": 10,
32 |     "gradient_accumulation_steps": 2,
33 |     "learning_rate": 5e-05,
34 |     "valid_steps": 100,
35 |     "save_steps": 200,
36 |     "num_train_steps": 2000,
37 |     "optim": "adamw",
38 |     "betas": [
39 |         0.9,
40 |         0.98
41 |     ],
42 |     "dropout": 0.1,
43 |     "weight_decay": 0.01,
44 |     "lr_mul": 10.0,
45 |     "grad_norm": 1.0,
46 |     "warmup_steps": 200,
47 |     "lw_st_ed": 0.4,
48 |     "sub_ctx_len": 0,
49 |     "seed": 77,
50 |     "no_fp16": false,
51 |     "n_workers": 4,
52 |     "no_pin_mem": false,
53 |     "rank": 0
54 | }
55 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-vlep_video_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoQA",
 5 |             "name": "vlep_video_only_train",
 6 |             "sub_txt_db": "/txt/vlep_subtitles.db/",
 7 |             "vfeat_db": "/video/vlep",
 8 |             "query_txt_db": "/txt/vlep_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoQA",
14 |             "name": "vlep_video_only_dev",
15 |             "sub_txt_db": "/txt/vlep_subtitles.db/",
16 |             "vfeat_db": "/video/vlep",
17 |             "query_txt_db": "/txt/vlep_dev.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/multi_channel_ablation_video_only/vlep_default",
26 |     "max_clip_len": 100,
27 |     "max_txt_len": 120,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "train_batch_size": 4,
31 |     "val_batch_size": 10,
32 |     "gradient_accumulation_steps": 2,
33 |     "learning_rate": 5e-05,
34 |     "valid_steps": 100,
35 |     "save_steps": 200,
36 |     "num_train_steps": 2000,
37 |     "optim": "adamw",
38 |     "betas": [
39 |         0.9,
40 |         0.98
41 |     ],
42 |     "dropout": 0.1,
43 |     "weight_decay": 0.01,
44 |     "lr_mul": 10.0,
45 |     "grad_norm": 1.0,
46 |     "warmup_steps": 200,
47 |     "lw_st_ed": 0.4,
48 |     "sub_ctx_len": 0,
49 |     "seed": 77,
50 |     "no_fp16": false,
51 |     "n_workers": 4,
52 |     "no_pin_mem": false,
53 |     "rank": 0
54 | }
55 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-yc2c-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoCap",
 5 |             "name": "yc2c_video_sub_train",
 6 |             "sub_txt_db": "/txt/yc2_subtitles.db",
 7 |             "vfeat_db": "/video/yc2",
 8 |             "cap_txt_db": ["/txt/yc2r_train.db"]
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoCap",
14 |             "name": "yc2c_video_sub_val",
15 |             "sub_txt_db": "/txt/yc2_subtitles.db",
16 |             "vfeat_db": "/video/yc2",
17 |             "gt_anno": "/txt/yc2c_val_release.jsonl"
18 |         }
19 |     ],
20 |     "model_config": "config/model_config/hero_videoCap.json",
21 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
22 |     "load_partial_pretrained": true,
23 |     "skip_layer_loading": true,
24 |     "output_dir": "/storage/ST_FT_only/yc2c_default",
25 |     "max_clip_len": 100,
26 |     "max_txt_len": 60,
27 |     "max_gen_step": 30,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "compressed_db": false,
31 |     "train_batch_size": 16,
32 |     "val_batch_size": 16,
33 |     "gradient_accumulation_steps": 1,
34 |     "learning_rate": 1e-4,
35 |     "lr_mul": 10.0,
36 |     "valid_steps": 500,
37 |     "num_train_steps": 7000,
38 |     "optim": "adamw",
39 |     "betas": [0.9, 0.98],
40 |     "lsr": 0.1,
41 |     "dropout": 0.1,
42 |     "weight_decay": 0.01,
43 |     "grad_norm": 1.0,
44 |     "warmup_steps": 700,
45 |     "sub_ctx_len": 1,
46 |     "seed": 77,
47 |     "no_fp16": false,
48 |     "n_workers": 4,
49 |     "pin_mem": true
50 | }
51 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-yc2c_sub_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoCap",
 5 |             "name": "yc2c_sub_only_train",
 6 |             "sub_txt_db": "/txt/yc2_subtitles.db",
 7 |             "vfeat_db": "/video/yc2",
 8 |             "cap_txt_db": ["/txt/yc2r_train.db"]
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoCap",
14 |             "name": "yc2c_sub_only_val",
15 |             "sub_txt_db": "/txt/yc2_subtitles.db",
16 |             "vfeat_db": "/video/yc2",
17 |             "gt_anno": "/txt/yc2c_val_release.jsonl"
18 |         }
19 |     ],
20 |     "model_config": "config/model_config/hero_videoCap.json",
21 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
22 |     "load_partial_pretrained": true,
23 |     "skip_layer_loading": true,
24 |     "output_dir": "/storage/multi_channel_ablation_sub_only/yc2c_default",
25 |     "max_clip_len": 100,
26 |     "max_txt_len": 60,
27 |     "max_gen_step": 30,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "compressed_db": false,
31 |     "train_batch_size": 16,
32 |     "val_batch_size": 16,
33 |     "gradient_accumulation_steps": 1,
34 |     "learning_rate": 1e-4,
35 |     "lr_mul": 10.0,
36 |     "valid_steps": 500,
37 |     "num_train_steps": 7000,
38 |     "optim": "adamw",
39 |     "betas": [0.9, 0.98],
40 |     "lsr": 0.1,
41 |     "dropout": 0.1,
42 |     "weight_decay": 0.01,
43 |     "grad_norm": 1.0,
44 |     "warmup_steps": 700,
45 |     "sub_ctx_len": 1,
46 |     "seed": 77,
47 |     "no_fp16": false,
48 |     "n_workers": 4,
49 |     "pin_mem": true
50 | }
51 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-yc2c_video_only-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoCap",
 5 |             "name": "yc2c_video_only_train",
 6 |             "sub_txt_db": null,
 7 |             "vfeat_db": "/video/yc2",
 8 |             "cap_txt_db": ["/txt/yc2r_train.db"]
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoCap",
14 |             "name": "yc2c_video_only_val",
15 |             "sub_txt_db": null,
16 |             "vfeat_db": "/video/yc2",
17 |             "gt_anno": "/txt/yc2c_val_release.jsonl"
18 |         }
19 |     ],
20 |     "model_config": "config/model_config/hero_videoCap.json",
21 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
22 |     "load_partial_pretrained": true,
23 |     "skip_layer_loading": true,
24 |     "output_dir": "/storage/multi_channel_ablation_video_only/yc2c_default",
25 |     "max_clip_len": 100,
26 |     "max_txt_len": 60,
27 |     "max_gen_step": 30,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "compressed_db": false,
31 |     "train_batch_size": 16,
32 |     "val_batch_size": 16,
33 |     "gradient_accumulation_steps": 1,
34 |     "learning_rate": 1e-4,
35 |     "lr_mul": 10.0,
36 |     "valid_steps": 500,
37 |     "num_train_steps": 7000,
38 |     "optim": "adamw",
39 |     "betas": [0.9, 0.98],
40 |     "lsr": 0.1,
41 |     "dropout": 0.1,
42 |     "weight_decay": 0.01,
43 |     "grad_norm": 1.0,
44 |     "warmup_steps": 700,
45 |     "sub_ctx_len": 1,
46 |     "seed": 77,
47 |     "no_fp16": false,
48 |     "n_workers": 4,
49 |     "pin_mem": true
50 | }
51 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-yc2r-4gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "vr",
 5 |             "name": "yc2r_video_sub_train",
 6 |             "sub_txt_db": "/txt/yc2_subtitles.db/",
 7 |             "vfeat_db": "/video/yc2",
 8 |             "query_txt_db": "/txt/yc2r_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "vr",
14 |             "name": "yc2r_video_sub_val",
15 |             "sub_txt_db": "/txt/yc2_subtitles.db/",
16 |             "vfeat_db": "/video/yc2",
17 |             "query_txt_db": "/txt/yc2r_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/ST_FT_only/yc2r_default",
26 |     "distributed_eval": true,
27 |     "max_vr_video": 100,
28 |     "max_clip_len": 100,
29 |     "max_txt_len": 60,
30 |     "vfeat_version": "resnet_slowfast",
31 |     "vfeat_interval": 1.5,
32 |     "train_batch_size": 48,
33 |     "val_batch_size": 20,
34 |     "vr_eval_video_batch_size": 50,
35 |     "vr_eval_q_batch_size": 80,
36 |     "gradient_accumulation_steps": 2,
37 |     "learning_rate": 7e-05,
38 |     "valid_steps": 200,
39 |     "save_steps": 200,
40 |     "num_train_steps": 4000,
41 |     "optim": "adamw",
42 |     "betas": [
43 |         0.9,
44 |         0.98
45 |     ],
46 |     "dropout": 0.1,
47 |     "weight_decay": 0.01,
48 |     "grad_norm": 1.0,
49 |     "warmup_steps": 400,
50 |     "lw_neg_q": 10.0,
51 |     "lw_neg_ctx": 10.0,
52 |     "ranking_loss_type": "hinge",
53 |     "margin": 0.1,
54 |     "hard_pool_size": [
55 |         80
56 |     ],
57 |     "hard_neg_weights": [
58 |         10
59 |     ],
60 |     "hard_negative_start_step": [
61 |         2000
62 |     ],
63 |     "use_all_neg": true,
64 |     "sub_ctx_len": 1,
65 |     "seed": 77,
66 |     "no_fp16": false,
67 |     "n_workers": 4,
68 |     "no_pin_mem": false,
69 |     "rank": 0
70 | }
71 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-yc2r_sub_only-4gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "vr",
 5 |             "name": "yc2r_sub_only_train",
 6 |             "sub_txt_db": "/txt/yc2_subtitles.db/",
 7 |             "vfeat_db": "/video/yc2",
 8 |             "query_txt_db": "/txt/yc2r_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "vr",
14 |             "name": "yc2r_sub_only_val",
15 |             "sub_txt_db": "/txt/yc2_subtitles.db/",
16 |             "vfeat_db": "/video/yc2",
17 |             "query_txt_db": "/txt/yc2r_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/multi_channel_ablation_sub_only/yc2r_default",
26 |     "distributed_eval": true,
27 |     "max_vr_video": 100,
28 |     "max_clip_len": 100,
29 |     "max_txt_len": 60,
30 |     "vfeat_version": "resnet_slowfast",
31 |     "vfeat_interval": 1.5,
32 |     "train_batch_size": 40,
33 |     "val_batch_size": 20,
34 |     "vr_eval_video_batch_size": 50,
35 |     "vr_eval_q_batch_size": 80,
36 |     "gradient_accumulation_steps": 2,
37 |     "learning_rate": 7e-05,
38 |     "valid_steps": 200,
39 |     "save_steps": 200,
40 |     "num_train_steps": 4000,
41 |     "optim": "adamw",
42 |     "betas": [
43 |         0.9,
44 |         0.98
45 |     ],
46 |     "dropout": 0.1,
47 |     "weight_decay": 0.01,
48 |     "grad_norm": 1.0,
49 |     "warmup_steps": 400,
50 |     "lw_neg_q": 10.0,
51 |     "lw_neg_ctx": 10.0,
52 |     "ranking_loss_type": "hinge",
53 |     "margin": 0.1,
54 |     "hard_pool_size": [
55 |         80
56 |     ],
57 |     "hard_neg_weights": [
58 |         10
59 |     ],
60 |     "hard_negative_start_step": [
61 |         2000
62 |     ],
63 |     "use_all_neg": true,
64 |     "sub_ctx_len": 1,
65 |     "seed": 77,
66 |     "no_fp16": false,
67 |     "n_workers": 4,
68 |     "no_pin_mem": false,
69 |     "rank": 0
70 | }
71 | 


--------------------------------------------------------------------------------
/config/FT_only_configs/train-yc2r_video_only-4gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "vr",
 5 |             "name": "yc2r_video_only_train",
 6 |             "sub_txt_db": "/txt/yc2_subtitles.db/",
 7 |             "vfeat_db": "/video/yc2",
 8 |             "query_txt_db": "/txt/yc2r_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "vr",
14 |             "name": "yc2r_video_only_val",
15 |             "sub_txt_db": "/txt/yc2_subtitles.db/",
16 |             "vfeat_db": "/video/yc2",
17 |             "query_txt_db": "/txt/yc2r_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
23 |     "load_partial_pretrained": true,
24 |     "skip_layer_loading": true,
25 |     "output_dir": "/storage/multi_channel_ablation_video_only/yc2r_default",
26 |     "distributed_eval": true,
27 |     "max_vr_video": 100,
28 |     "max_clip_len": 100,
29 |     "max_txt_len": 60,
30 |     "vfeat_version": "resnet_slowfast",
31 |     "vfeat_interval": 1.5,
32 |     "train_batch_size": 64,
33 |     "val_batch_size": 20,
34 |     "vr_eval_video_batch_size": 50,
35 |     "vr_eval_q_batch_size": 80,
36 |     "gradient_accumulation_steps": 2,
37 |     "learning_rate": 7e-05,
38 |     "valid_steps": 200,
39 |     "save_steps": 200,
40 |     "num_train_steps": 4000,
41 |     "optim": "adamw",
42 |     "betas": [
43 |         0.9,
44 |         0.98
45 |     ],
46 |     "dropout": 0.1,
47 |     "weight_decay": 0.01,
48 |     "grad_norm": 1.0,
49 |     "warmup_steps": 400,
50 |     "lw_neg_q": 10.0,
51 |     "lw_neg_ctx": 10.0,
52 |     "ranking_loss_type": "hinge",
53 |     "margin": 0.1,
54 |     "hard_pool_size": [
55 |         80
56 |     ],
57 |     "hard_neg_weights": [
58 |         10
59 |     ],
60 |     "hard_negative_start_step": [
61 |         2000
62 |     ],
63 |     "use_all_neg": true,
64 |     "sub_ctx_len": 1,
65 |     "seed": 77,
66 |     "no_fp16": false,
67 |     "n_workers": 4,
68 |     "no_pin_mem": false,
69 |     "rank": 0
70 | }
71 | 


--------------------------------------------------------------------------------
/config/model_config/hero_finetune.json:
--------------------------------------------------------------------------------
 1 | {"f_config":{   
 2 |     "attention_probs_dropout_prob": 0.1,
 3 |     "hidden_act": "gelu",
 4 |     "hidden_dropout_prob": 0.1,
 5 |     "hidden_size": 768,
 6 |     "initializer_range": 0.02,
 7 |     "intermediate_size": 3072,
 8 |     "max_position_embeddings": 514,
 9 |     "num_attention_heads": 12,
10 |     "num_hidden_layers": 6,
11 |     "type_vocab_size": 2,
12 |     "vocab_size": 50272
13 |   },
14 |   "c_config": {
15 |     "attention_probs_dropout_prob": 0.1,
16 |     "hidden_act": "gelu",
17 |     "hidden_dropout_prob": 0.1,
18 |     "hidden_size": 768,
19 |     "initializer_range": 0.02,
20 |     "intermediate_size": 3072,
21 |     "max_position_embeddings": 514,
22 |     "num_attention_heads": 12,
23 |     "num_hidden_layers": 3,
24 |     "type_vocab_size": 2
25 |   },
26 |   "q_config": {
27 |     "attention_probs_dropout_prob": 0.1,
28 |     "hidden_act": "gelu",
29 |     "hidden_dropout_prob": 0.1,
30 |     "hidden_size": 768,
31 |     "initializer_range": 0.02,
32 |     "intermediate_size": 3072,
33 |     "num_attention_heads": 12,
34 |     "max_position_embeddings": 514,
35 |     "num_hidden_layers": 0,
36 |     "type_vocab_size": 1,
37 |     "vocab_size": 50272
38 |   },
39 |   "d_config": {
40 |     "attention_probs_dropout_prob": 0.1,
41 |     "hidden_act": "gelu",
42 |     "hidden_dropout_prob": 0.1,
43 |     "hidden_size": 768,
44 |     "initializer_range": 0.02,
45 |     "intermediate_size": 3072,
46 |     "max_position_embeddings": 1024,
47 |     "num_attention_heads": 12,
48 |     "num_hidden_layers": 2,
49 |     "type_vocab_size": 1,
50 |     "vocab_size": 50272
51 |   }
52 | }
53 | 


--------------------------------------------------------------------------------
/config/model_config/hero_pretrain.json:
--------------------------------------------------------------------------------
 1 | {"f_config":{   
 2 |     "attention_probs_dropout_prob": 0.1,
 3 |     "hidden_act": "gelu",
 4 |     "hidden_dropout_prob": 0.1,
 5 |     "hidden_size": 768,
 6 |     "initializer_range": 0.02,
 7 |     "intermediate_size": 3072,
 8 |     "max_position_embeddings": 514,
 9 |     "num_attention_heads": 12,
10 |     "num_hidden_layers": 6,
11 |     "type_vocab_size": 1,
12 |     "vocab_size": 50265
13 |   },
14 |   "c_config": {
15 |     "attention_probs_dropout_prob": 0.1,
16 |     "hidden_act": "gelu",
17 |     "hidden_dropout_prob": 0.1,
18 |     "hidden_size": 768,
19 |     "initializer_range": 0.02,
20 |     "intermediate_size": 3072,
21 |     "max_position_embeddings": 514,
22 |     "num_attention_heads": 12,
23 |     "num_hidden_layers": 3,
24 |     "type_vocab_size": 2
25 |   },
26 |   "q_config": { 
27 |     "attention_probs_dropout_prob": 0.1,
28 |     "hidden_act": "gelu",
29 |     "hidden_dropout_prob": 0.1,
30 |     "hidden_size": 768,
31 |     "initializer_range": 0.02,
32 |     "intermediate_size": 3072,
33 |     "num_attention_heads": 12,
34 |     "max_position_embeddings": 514,
35 |     "num_hidden_layers": 0,
36 |     "type_vocab_size": 1,
37 |     "vocab_size": 50265
38 |   }
39 | }


--------------------------------------------------------------------------------
/config/model_config/hero_videoCap.json:
--------------------------------------------------------------------------------
 1 | { "model": "hero",
 2 |   "f_config":{   
 3 |     "attention_probs_dropout_prob": 0.1,
 4 |     "hidden_act": "gelu",
 5 |     "hidden_dropout_prob": 0.1,
 6 |     "hidden_size": 768,
 7 |     "initializer_range": 0.02,
 8 |     "intermediate_size": 3072,
 9 |     "max_position_embeddings": 514,
10 |     "num_attention_heads": 12,
11 |     "num_hidden_layers": 6,
12 |     "type_vocab_size": 2,
13 |     "vocab_size": 50272
14 |   },
15 |   "c_config": {
16 |     "attention_probs_dropout_prob": 0.1,
17 |     "hidden_act": "gelu",
18 |     "hidden_dropout_prob": 0.1,
19 |     "hidden_size": 768,
20 |     "initializer_range": 0.02,
21 |     "intermediate_size": 3072,
22 |     "max_position_embeddings": 514,
23 |     "num_attention_heads": 12,
24 |     "num_hidden_layers": 3,
25 |     "type_vocab_size": 2
26 |   },
27 |   "d_config": {
28 |     "attention_probs_dropout_prob": 0.1,
29 |     "hidden_act": "gelu",
30 |     "hidden_dropout_prob": 0.1,
31 |     "hidden_size": 768,
32 |     "initializer_range": 0.02,
33 |     "intermediate_size": 3072,
34 |     "max_position_embeddings": 1024,
35 |     "num_attention_heads": 12,
36 |     "num_hidden_layers": 2,
37 |     "type_vocab_size": 1,
38 |     "vocab_size": 50272
39 |   }
40 | }
41 | 


--------------------------------------------------------------------------------
/config/model_config/video_sub_feature_add_finetune.json:
--------------------------------------------------------------------------------
 1 | { "model": "video_sub_feature_fusion",
 2 |   "video_sub_fusion_method": "add",
 3 |   "f_config":{   
 4 |     "attention_probs_dropout_prob": 0.1,
 5 |     "hidden_act": "gelu",
 6 |     "hidden_dropout_prob": 0.1,
 7 |     "hidden_size": 768,
 8 |     "initializer_range": 0.02,
 9 |     "intermediate_size": 3072,
10 |     "max_position_embeddings": 514,
11 |     "num_attention_heads": 12,
12 |     "num_hidden_layers": 6,
13 |     "type_vocab_size": 2,
14 |     "vocab_size": 50272
15 |   },
16 |   "c_config": {
17 |     "attention_probs_dropout_prob": 0.1,
18 |     "hidden_act": "gelu",
19 |     "hidden_dropout_prob": 0.1,
20 |     "hidden_size": 768,
21 |     "initializer_range": 0.02,
22 |     "intermediate_size": 3072,
23 |     "max_position_embeddings": 514,
24 |     "num_attention_heads": 12,
25 |     "num_hidden_layers": 3,
26 |     "type_vocab_size": 2
27 |   },
28 |   "q_config": {
29 |     "attention_probs_dropout_prob": 0.1,
30 |     "hidden_act": "gelu",
31 |     "hidden_dropout_prob": 0.1,
32 |     "hidden_size": 768,
33 |     "initializer_range": 0.02,
34 |     "intermediate_size": 3072,
35 |     "num_attention_heads": 12,
36 |     "max_position_embeddings": 514,
37 |     "num_hidden_layers": 0,
38 |     "type_vocab_size": 1,
39 |     "vocab_size": 50272
40 |   },
41 |   "d_config": {
42 |     "attention_probs_dropout_prob": 0.1,
43 |     "hidden_act": "gelu",
44 |     "hidden_dropout_prob": 0.1,
45 |     "hidden_size": 768,
46 |     "initializer_range": 0.02,
47 |     "intermediate_size": 3072,
48 |     "max_position_embeddings": 1024,
49 |     "num_attention_heads": 12,
50 |     "num_hidden_layers": 2,
51 |     "type_vocab_size": 1,
52 |     "vocab_size": 50272
53 |   }
54 | }
55 | 


--------------------------------------------------------------------------------
/config/model_config/video_sub_feature_concat_finetune.json:
--------------------------------------------------------------------------------
 1 | { "model": "video_sub_feature_fusion",
 2 |   "video_sub_fusion_method": "concat",
 3 |   "f_config":{   
 4 |     "attention_probs_dropout_prob": 0.1,
 5 |     "hidden_act": "gelu",
 6 |     "hidden_dropout_prob": 0.1,
 7 |     "hidden_size": 768,
 8 |     "initializer_range": 0.02,
 9 |     "intermediate_size": 3072,
10 |     "max_position_embeddings": 514,
11 |     "num_attention_heads": 12,
12 |     "num_hidden_layers": 6,
13 |     "type_vocab_size": 2,
14 |     "vocab_size": 50272
15 |   },
16 |   "c_config": {
17 |     "attention_probs_dropout_prob": 0.1,
18 |     "hidden_act": "gelu",
19 |     "hidden_dropout_prob": 0.1,
20 |     "hidden_size": 768,
21 |     "initializer_range": 0.02,
22 |     "intermediate_size": 3072,
23 |     "max_position_embeddings": 514,
24 |     "num_attention_heads": 12,
25 |     "num_hidden_layers": 3,
26 |     "type_vocab_size": 2
27 |   },
28 |   "q_config": {
29 |     "attention_probs_dropout_prob": 0.1,
30 |     "hidden_act": "gelu",
31 |     "hidden_dropout_prob": 0.1,
32 |     "hidden_size": 768,
33 |     "initializer_range": 0.02,
34 |     "intermediate_size": 3072,
35 |     "num_attention_heads": 12,
36 |     "max_position_embeddings": 514,
37 |     "num_hidden_layers": 0,
38 |     "type_vocab_size": 1,
39 |     "vocab_size": 50272
40 |   },
41 |   "d_config": {
42 |     "attention_probs_dropout_prob": 0.1,
43 |     "hidden_act": "gelu",
44 |     "hidden_dropout_prob": 0.1,
45 |     "hidden_size": 768,
46 |     "initializer_range": 0.02,
47 |     "intermediate_size": 3072,
48 |     "max_position_embeddings": 1024,
49 |     "num_attention_heads": 12,
50 |     "num_hidden_layers": 2,
51 |     "type_vocab_size": 1,
52 |     "vocab_size": 50272
53 |   }
54 | }
55 | 


--------------------------------------------------------------------------------
/config/model_config/video_sub_sequence_finetune.json:
--------------------------------------------------------------------------------
 1 | { "model": "video_sub_sequence_model",
 2 |   "f_config":{   
 3 |     "attention_probs_dropout_prob": 0.1,
 4 |     "hidden_act": "gelu",
 5 |     "hidden_dropout_prob": 0.1,
 6 |     "hidden_size": 768,
 7 |     "initializer_range": 0.02,
 8 |     "intermediate_size": 3072,
 9 |     "max_position_embeddings": 514,
10 |     "num_attention_heads": 12,
11 |     "num_hidden_layers": 6,
12 |     "type_vocab_size": 2,
13 |     "vocab_size": 50272
14 |   },
15 |   "c_config": {
16 |     "attention_probs_dropout_prob": 0.1,
17 |     "hidden_act": "gelu",
18 |     "hidden_dropout_prob": 0.1,
19 |     "hidden_size": 768,
20 |     "initializer_range": 0.02,
21 |     "intermediate_size": 3072,
22 |     "max_position_embeddings": 514,
23 |     "num_attention_heads": 12,
24 |     "num_hidden_layers": 3,
25 |     "type_vocab_size": 2
26 |   },
27 |   "q_config": {
28 |     "attention_probs_dropout_prob": 0.1,
29 |     "hidden_act": "gelu",
30 |     "hidden_dropout_prob": 0.1,
31 |     "hidden_size": 768,
32 |     "initializer_range": 0.02,
33 |     "intermediate_size": 3072,
34 |     "num_attention_heads": 12,
35 |     "max_position_embeddings": 514,
36 |     "num_hidden_layers": 0,
37 |     "type_vocab_size": 1,
38 |     "vocab_size": 50272
39 |   },
40 |   "d_config": {
41 |     "attention_probs_dropout_prob": 0.1,
42 |     "hidden_act": "gelu",
43 |     "hidden_dropout_prob": 0.1,
44 |     "hidden_size": 768,
45 |     "initializer_range": 0.02,
46 |     "intermediate_size": 3072,
47 |     "max_position_embeddings": 1024,
48 |     "num_attention_heads": 12,
49 |     "num_hidden_layers": 2,
50 |     "type_vocab_size": 1,
51 |     "vocab_size": 50272
52 |   }
53 | }
54 | 


--------------------------------------------------------------------------------
/config/pretrain-tv-16gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "txt_db": "/txt",
 3 |     "img_db": "/video",
 4 |     "targets": [
 5 |         {"name": "tv",
 6 |          "sub_txt_db": "tv_subtitles.db",
 7 |          "vfeat_db": "tv",
 8 |          "vfeat_interval": 1.5,
 9 |          "splits": [
10 |              {"name": "all",
11 |               "tasks": ["mlm", "mfm-nce", "fom", "vsm"],
12 |               "train_idx": "pretrain_splits/tv_train.json",
13 |               "val_idx": "pretrain_splits/tv_val.json",
14 |               "ratio": [2, 2, 1, 2]
15 |              }
16 |          ]
17 |         }
18 |     ],
19 |     "targets_ratio": [1],
20 |     "mask_prob": 0.15,
21 |     "compressed_db": false,
22 |     "model_config": "config/model_config/hero_pretrain.json",
23 |     "checkpoint": "/pretrain/pretrain-tv-init.bin",
24 |     "load_partial_pretrained" : true,
25 |     "skip_layer_loading" : true,
26 |     "output_dir": "/storage/default_pretrain_tv",
27 |     "max_clip_len": 100,
28 |     "max_txt_len": 60,
29 |     "vfeat_version": "resnet_slowfast",
30 |     "drop_svmr_prob": 0.8,
31 |     "train_batch_size": 32,
32 |     "val_batch_size": 32,
33 |     "gradient_accumulation_steps": 2,
34 |     "learning_rate": 3e-05,
35 |     "valid_steps": 500,
36 |     "save_steps": 500,
37 |     "num_train_steps": 100000,
38 |     "optim": "adamw",
39 |     "betas": [
40 |         0.9,
41 |         0.98
42 |     ],
43 |     "dropout": 0.1,
44 |     "weight_decay": 0.01,
45 |     "grad_norm": 1.0,
46 |     "warmup_steps": 10000,
47 |     "lw_neg_q": 8.0,
48 |     "lw_neg_ctx": 8.0,
49 |     "lw_st_ed": 0.01,
50 |     "ranking_loss_type": "hinge",
51 |     "margin": 0.1,
52 |     "hard_pool_size": [
53 |         20
54 |     ],
55 |     "hard_neg_weights": [
56 |         10
57 |     ],
58 |     "hard_negative_start_step": [
59 |         20000
60 |     ],
61 |     "train_span_start_step": 0,
62 |     "sub_ctx_len": 0,
63 |     "use_all_neg": true,
64 |     "seed": 77,
65 |     "no_fp16": false,
66 |     "n_workers": 4,
67 |     "no_pin_mem": false,
68 |     "rank": 0
69 | }
70 | 


--------------------------------------------------------------------------------
/config/train-caption-multitask-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoCap",
 5 |             "name": "tvc_video_sub_train",
 6 |             "sub_txt_db": "/txt/tv_subtitles.db",
 7 |             "vfeat_db": "/video/tv",
 8 |             "cap_txt_db": ["/txt/tvc_train.db"],
 9 |             "batch_size": 4,
10 |             "ratio": 2
11 |         },
12 |         {
13 |             "task": "videoCap",
14 |             "name": "vatex_en_c_video_sub_train",
15 |             "sub_txt_db": "/txt/vatex_subtitles.db",
16 |             "vfeat_db": "/video/vatex",
17 |             "cap_txt_db": ["/txt/vatex_en_r_train.db", "/txt/vatex_en_r_val.db"],
18 |             "batch_size": 128,
19 |             "ratio": 2
20 |         },
21 |         {
22 |             "task": "videoCap",
23 |             "name": "yc2c_video_sub_train",
24 |             "sub_txt_db": "/txt/yc2_subtitles.db",
25 |             "vfeat_db": "/video/yc2",
26 |             "cap_txt_db": ["/txt/yc2r_train.db"],
27 |             "batch_size": 16,
28 |             "ratio": 1
29 |         }
30 |     ],
31 |     "val_datasets": [
32 |         {
33 |             "task": "videoCap",
34 |             "name": "tvc_video_sub_val",
35 |             "sub_txt_db": "/txt/tv_subtitles.db",
36 |             "vfeat_db": "/video/tv",
37 |             "batch_size": 8,
38 |             "gt_anno": "/txt/tvc_val_release.jsonl"
39 |         },
40 |         {
41 |             "task": "videoCap",
42 |             "name": "vatex_en_c_video_sub_val",
43 |             "sub_txt_db": "/txt/vatex_subtitles.db",
44 |             "vfeat_db": "/video/vatex",
45 |             "batch_size": 128,
46 |             "gt_anno": "/txt/vatex_en_c_test_public_release.jsonl"
47 |         },
48 |         {
49 |             "task": "videoCap",
50 |             "name": "yc2c_video_sub_val",
51 |             "sub_txt_db": "/txt/yc2_subtitles.db",
52 |             "vfeat_db": "/video/yc2",
53 |             "batch_size": 16,
54 |             "gt_anno": "/txt/yc2c_val_release.jsonl"
55 |         }
56 |     ],
57 |     "compressed_db": false,
58 |     "model_config": "/src/config/model_config/hero_videoCap.json",
59 |     "checkpoint": "/pretrain/hero-tv-ht100.pt",
60 |     "output_dir": "/storage/MT_PT_FT/captioning_multi-task_default",
61 |     "max_clip_len": 100,
62 |     "max_txt_len": 60,
63 |     "max_cap_per_vid": -1,
64 |     "max_gen_step": 30,
65 |     "vfeat_version": "resnet_slowfast",
66 |     "vfeat_interval": 1.5,
67 |     "train_batch_size": 4,
68 |     "val_batch_size": 8,
69 |     "gradient_accumulation_steps": 1,
70 |     "learning_rate": 1e-4,
71 |     "lr_mul": 10.0,
72 |     "valid_steps": 500,
73 |     "num_train_steps": 30000,
74 |     "optim": "adamw",
75 |     "betas": [0.9, 0.98],
76 |     "lsr": 0.1,
77 |     "dropout": 0.1,
78 |     "weight_decay": 0.01,
79 |     "grad_norm": 1.0,
80 |     "warmup_steps": 3000,
81 |     "sub_ctx_len": 1,
82 |     "seed": 77,
83 |     "no_fp16": false,
84 |     "n_workers": 4,
85 |     "pin_mem": true
86 | }
87 | 


--------------------------------------------------------------------------------
/config/train-how2qa-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoQA",
 5 |             "name": "how2qa_video_sub_train",
 6 |             "sub_txt_db": "/txt/how2_subtitles.db",
 7 |             "vfeat_db": "/video/how2",
 8 |             "query_txt_db": "/txt/how2qa_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoQA",
14 |             "name": "how2qa_video_sub_val",
15 |             "sub_txt_db": "/txt/how2_subtitles.db",
16 |             "vfeat_db": "/video/how2",
17 |             "query_txt_db": "/txt/how2qa_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/hero-tv-ht100.pt",
23 |     "output_dir": "/storage/ST_PT_FT/how2qa_default",
24 |     "max_clip_len": 100,
25 |     "max_txt_len": 120,
26 |     "vfeat_version": "resnet_slowfast",
27 |     "vfeat_interval": 1.5,
28 |     "train_batch_size": 4,
29 |     "val_batch_size": 10,
30 |     "gradient_accumulation_steps": 2,
31 |     "learning_rate": 5e-05,
32 |     "valid_steps": 200,
33 |     "save_steps": 200,
34 |     "num_train_steps": 2000,
35 |     "optim": "adamw",
36 |     "betas": [
37 |         0.9,
38 |         0.98
39 |     ],
40 |     "dropout": 0.1,
41 |     "weight_decay": 0.01,
42 |     "lr_mul": 10.0,
43 |     "grad_norm": 1.0,
44 |     "warmup_steps": 200,
45 |     "lw_st_ed": 0.4,
46 |     "sub_ctx_len": 0,
47 |     "seed": 77,
48 |     "no_fp16": false,
49 |     "n_workers": 4,
50 |     "no_pin_mem": false,
51 |     "rank": 0
52 | }
53 | 


--------------------------------------------------------------------------------
/config/train-how2r-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "vcmr",
 5 |             "name": "how2r_video_sub_train",
 6 |             "sub_txt_db": "/txt/how2_subtitles.db",
 7 |             "vfeat_db": "/video/how2",
 8 |             "query_txt_db": "/txt/how2r_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "vcmr",
14 |             "name": "how2r_video_sub_val",
15 |             "sub_txt_db": "/txt/how2_subtitles.db",
16 |             "vfeat_db": "/video/how2",
17 |             "query_txt_db": "/txt/how2r_val_1k.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/hero-tv-ht100.pt",
23 |     "output_dir": "/storage/ST_PT_FT/how2r_default",
24 |     "eval_with_query_type": true,
25 |     "max_before_nms": 200,
26 |     "max_after_nms": 100,
27 |     "distributed_eval": true,
28 |     "nms_thd": -1,
29 |     "q2c_alpha": 20,
30 |     "max_vcmr_video": 100,
31 |     "full_eval_tasks": [
32 |         "VCMR",
33 |         "SVMR",
34 |         "VR"
35 |     ],
36 |     "max_clip_len": 100,
37 |     "max_txt_len": 60,
38 |     "vfeat_version": "resnet_slowfast",
39 |     "vfeat_interval": 1.5,
40 |     "min_pred_l": 3,
41 |     "max_pred_l": 20,
42 |     "drop_svmr_prob": 0.9,
43 |     "train_batch_size": 32,
44 |     "val_batch_size": 20,
45 |     "vcmr_eval_video_batch_size": 50,
46 |     "vcmr_eval_batch_size": 80,
47 |     "gradient_accumulation_steps":2,
48 |     "learning_rate": 1e-04,
49 |     "valid_steps": 200,
50 |     "save_steps": 200,
51 |     "num_train_steps": 3000,
52 |     "optim": "adamw",
53 |     "betas": [
54 |         0.9,
55 |         0.98
56 |     ],
57 |     "dropout": 0.1,
58 |     "weight_decay": 0.01,
59 |     "grad_norm": 1.0,
60 |     "warmup_steps": 300,
61 |     "lw_neg_q": 8.0,
62 |     "lw_neg_ctx": 8.0,
63 |     "lw_st_ed": 0.01,
64 |     "ranking_loss_type": "hinge",
65 |     "margin": 0.1,
66 |     "hard_pool_size": [
67 |         20
68 |     ],
69 |     "hard_neg_weights": [
70 |         10
71 |     ],
72 |     "hard_negtiave_start_step": [
73 |         1000
74 |     ],
75 |     "train_span_start_step": 0,
76 |     "sub_ctx_len": 0,
77 |     "use_all_neg": true,
78 |     "seed": 77,
79 |     "no_fp16": false,
80 |     "n_workers": 4,
81 |     "no_pin_mem": false,
82 |     "rank": 0
83 | }
84 | 


--------------------------------------------------------------------------------
/config/train-qa-multitask-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoQA",
 5 |             "name": "tvqa_video_sub_train",
 6 |             "sub_txt_db": "/txt/tv_subtitles.db",
 7 |             "vfeat_db": "/video/tv",
 8 |             "query_txt_db": "/txt/tvqa_train.db",
 9 |             "ratio": 5
10 |         },
11 |         {
12 |             "task": "videoQA",
13 |             "name": "how2qa_video_sub_train",
14 |             "sub_txt_db": "/txt/how2_subtitles.db",
15 |             "vfeat_db": "/video/how2",
16 |             "query_txt_db": "/txt/how2qa_train.db",
17 |             "ratio": 1
18 |         },
19 |         {
20 |             "task": "violin",
21 |             "name": "violin_video_sub_train",
22 |             "sub_txt_db": "/txt/violin_subtitles.db",
23 |             "vfeat_db": "/video/violin",
24 |             "query_txt_db": "/txt/violin_train.db",
25 |             "ratio": 3
26 |         },
27 |         {
28 |             "task": "videoQA",
29 |             "name": "vlep_video_sub_train",
30 |             "sub_txt_db": "/txt/vlep_subtitles.db/",
31 |             "vfeat_db": "/video/vlep",
32 |             "query_txt_db": "/txt/vlep_train.db",
33 |             "ratio": 1
34 |         }
35 |     ],
36 |     "val_datasets": [
37 |         {
38 |             "task": "videoQA",
39 |             "name": "tvqa_video_sub_val",
40 |             "sub_txt_db": "/txt/tv_subtitles.db",
41 |             "vfeat_db": "/video/tv",
42 |             "query_txt_db": "/txt/tvqa_val.db"
43 |         },
44 |         {
45 |             "task": "videoQA",
46 |             "name": "how2qa_video_sub_val",
47 |             "sub_txt_db": "/txt/how2_subtitles.db",
48 |             "vfeat_db": "/video/how2",
49 |             "query_txt_db": "/txt/how2qa_val.db"
50 |         },
51 |         {
52 |             "task": "violin",
53 |             "name": "violin_video_sub_val",
54 |             "sub_txt_db": "/txt/violin_subtitles.db",
55 |             "vfeat_db": "/video/violin",
56 |             "query_txt_db": "/txt/violin_val.db"
57 |         },
58 |         {
59 |             "task": "videoQA",
60 |             "name": "vlep_video_sub_dev",
61 |             "sub_txt_db": "/txt/vlep_subtitles.db/",
62 |             "vfeat_db": "/video/vlep",
63 |             "query_txt_db": "/txt/vlep_dev.db"
64 |         }
65 |     ],
66 |     "compressed_db": false,
67 |     "model_config": "config/model_config/hero_finetune.json",
68 |     "checkpoint": "/pretrain/hero-tv-ht100.pt",
69 |     "output_dir": "/storage/MT_PT_FT/qa_multi-task_default",
70 |     "max_clip_len": 100,
71 |     "max_txt_len": 120,
72 |     "vfeat_version": "resnet_slowfast",
73 |     "vfeat_interval": 1.5,
74 |     "train_batch_size": 4,
75 |     "val_batch_size": 10,
76 |     "gradient_accumulation_steps": 2,
77 |     "learning_rate": 5e-05,
78 |     "valid_steps": 200,
79 |     "save_steps": 200,
80 |     "num_train_steps": 20000,
81 |     "optim": "adamw",
82 |     "betas": [
83 |         0.9,
84 |         0.98
85 |     ],
86 |     "dropout": 0.1,
87 |     "weight_decay": 0.01,
88 |     "lr_mul": 10.0,
89 |     "grad_norm": 1.0,
90 |     "warmup_steps": 2000,
91 |     "lw_st_ed": 0.4,
92 |     "sub_ctx_len": 0,
93 |     "seed": 77,
94 |     "no_fp16": false,
95 |     "n_workers": 4,
96 |     "no_pin_mem": false,
97 |     "rank": 0
98 | }


--------------------------------------------------------------------------------
/config/train-retrieval-multitask-8gpu.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "train_datasets": [
  3 |         {
  4 |             "task": "vcmr",
  5 |             "name": "tvr_video_sub_train",
  6 |             "sub_txt_db": "/txt/tv_subtitles.db",
  7 |             "vfeat_db": "/video/tv",
  8 |             "query_txt_db": "/txt/tvr_train.db",
  9 |             "batch_size": 32,
 10 |             "ratio": 2
 11 |         },
 12 |         {
 13 |             "task": "vcmr",
 14 |             "name": "how2r_video_sub_train",
 15 |             "sub_txt_db": "/txt/how2_subtitles.db",
 16 |             "vfeat_db": "/video/how2",
 17 |             "query_txt_db": "/txt/how2r_train.db",
 18 |             "batch_size": 32,
 19 |             "ratio": 1
 20 |         },
 21 |         {
 22 |             "task": "vr",
 23 |             "name": "vatex_en_r_video_sub_train",
 24 |             "sub_txt_db": "/txt/vatex_subtitles.db/",
 25 |             "vfeat_db": "/video/vatex",
 26 |             "query_txt_db": "/txt/vatex_en_r_train.db",
 27 |             "batch_size": 64,
 28 |             "ratio": 3
 29 |         },
 30 |         {
 31 |             "task": "vr",
 32 |             "name": "yc2r_video_sub_train",
 33 |             "sub_txt_db": "/txt/yc2_subtitles.db/",
 34 |             "vfeat_db": "/video/yc2",
 35 |             "query_txt_db": "/txt/yc2r_train.db",
 36 |             "batch_size": 48,
 37 |             "ratio": 1
 38 |         }
 39 |     ],
 40 |     "val_datasets": [
 41 |         {
 42 |             "task": "vcmr",
 43 |             "name": "tvr_video_sub_val",
 44 |             "sub_txt_db": "/txt/tv_subtitles.db",
 45 |             "vfeat_db": "/video/tv",
 46 |             "query_txt_db": "/txt/tvr_val.db"
 47 |         },
 48 |         {
 49 |             "task": "vcmr",
 50 |             "name": "how2r_video_sub_val",
 51 |             "sub_txt_db": "/txt/how2_subtitles.db",
 52 |             "vfeat_db": "/video/how2",
 53 |             "query_txt_db": "/txt/how2r_val_1k.db"
 54 |         },
 55 |         {
 56 |             "task": "vr",
 57 |             "name": "vatex_en_r_video_sub_val",
 58 |             "sub_txt_db": "/txt/vatex_subtitles.db/",
 59 |             "vfeat_db": "/video/vatex",
 60 |             "query_txt_db": "/txt/vatex_en_r_val.db"
 61 |         },
 62 |         {
 63 |             "task": "vr",
 64 |             "name": "yc2r_video_sub_val",
 65 |             "sub_txt_db": "/txt/yc2_subtitles.db/",
 66 |             "vfeat_db": "/video/yc2",
 67 |             "query_txt_db": "/txt/yc2r_val.db"
 68 |         }
 69 |     ],
 70 |     "compressed_db": false,
 71 |     "model_config": "config/model_config/hero_finetune.json",
 72 |     "checkpoint": "/pretrain/hero-tv-ht100.pt",
 73 |     "output_dir": "/storage/MT_PT_FT/retrieval_multi-task_default",
 74 |     "eval_with_query_type": true,
 75 |     "max_before_nms": 200,
 76 |     "max_after_nms": 100,
 77 |     "distributed_eval": true,
 78 |     "nms_thd": -1,
 79 |     "q2c_alpha": 20,
 80 |     "max_vcmr_video": 100,
 81 |     "full_eval_tasks": [
 82 |         "VCMR",
 83 |         "SVMR",
 84 |         "VR"
 85 |     ],
 86 |     "max_clip_len": 100,
 87 |     "max_txt_len": 60,
 88 |     "vfeat_version": "resnet_slowfast",
 89 |     "vfeat_interval": 1.5,
 90 |     "min_pred_l": 2,
 91 |     "max_pred_l": 16,
 92 |     "drop_svmr_prob": 0.8,
 93 |     "train_batch_size": 32,
 94 |     "val_batch_size": 20,
 95 |     "vcmr_eval_video_batch_size": 50,
 96 |     "vcmr_eval_batch_size": 80,
 97 |     "vr_eval_video_batch_size": 50,
 98 |     "vr_eval_batch_size": 80,
 99 |     "gradient_accumulation_steps":2,
100 |     "learning_rate": 1e-04,
101 |     "valid_steps": 400,
102 |     "save_steps": 400,
103 |     "num_train_steps": 10000,
104 |     "optim": "adamw",
105 |     "betas": [
106 |         0.9,
107 |         0.98
108 |     ],
109 |     "dropout": 0.1,
110 |     "weight_decay": 0.01,
111 |     "grad_norm": 1.0,
112 |     "warmup_steps": 1000,
113 |     "lw_neg_q": 8.0,
114 |     "lw_neg_ctx": 8.0,
115 |     "lw_st_ed": 0.01,
116 |     "ranking_loss_type": "hinge",
117 |     "margin": 0.1,
118 |     "hard_pool_size": [
119 |         20
120 |     ],
121 |     "hard_neg_weights": [
122 |         10
123 |     ],
124 |     "hard_negative_start_step": [
125 |         4000
126 |     ],
127 |     "train_span_start_step": 0,
128 |     "sub_ctx_len": 0,
129 |     "use_all_neg": true,
130 |     "seed": 77,
131 |     "no_fp16": false,
132 |     "n_workers": 4,
133 |     "no_pin_mem": false,
134 |     "rank": 0
135 | }
136 | 


--------------------------------------------------------------------------------
/config/train-tvc-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoCap",
 5 |             "name": "tvc_video_sub_train",
 6 |             "sub_txt_db": "/txt/tv_subtitles.db",
 7 |             "vfeat_db": "/video/tv",
 8 |             "cap_txt_db": ["/txt/tvc_train.db"]
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoCap",
14 |             "name": "tvc_video_sub_val",
15 |             "sub_txt_db": "/txt/tv_subtitles.db",
16 |             "vfeat_db": "/video/tv",
17 |             "gt_anno": "/txt/tvc_val_release.jsonl"
18 |         }
19 |     ],
20 |     "model_config": "/src/config/model_config/hero_videoCap.json",
21 |     "checkpoint": "/pretrain/hero-tv-ht100.pt",
22 |     "load_partial_pretrained": false,
23 |     "skip_layer_loading": false,
24 |     "output_dir": "/storage/ST_PT_FT/tvc_default",
25 |     "max_clip_len": 100,
26 |     "max_txt_len": 60,
27 |     "max_cap_per_vid": -1,
28 |     "max_gen_step": 30,
29 |     "vfeat_version": "resnet_slowfast",
30 |     "vfeat_interval": 1.5,
31 |     "compressed_db": false,
32 |     "train_batch_size": 4,
33 |     "val_batch_size": 8,
34 |     "gradient_accumulation_steps": 1,
35 |     "learning_rate": 1e-4,
36 |     "lr_mul": 10.0,
37 |     "valid_steps": 500,
38 |     "num_train_steps": 7000,
39 |     "optim": "adamw",
40 |     "betas": [0.9, 0.98],
41 |     "lsr": 0.1,
42 |     "dropout": 0.1,
43 |     "weight_decay": 0.01,
44 |     "grad_norm": 1.0,
45 |     "warmup_steps": 700,
46 |     "sub_ctx_len": 1,
47 |     "seed": 77,
48 |     "no_fp16": false,
49 |     "n_workers": 4,
50 |     "pin_mem": true
51 | }
52 | 


--------------------------------------------------------------------------------
/config/train-tvqa-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoQA",
 5 |             "name": "tvqa_video_sub_train",
 6 |             "sub_txt_db": "/txt/tv_subtitles.db",
 7 |             "vfeat_db": "/video/tv",
 8 |             "query_txt_db": "/txt/tvqa_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoQA",
14 |             "name": "tvqa_video_sub_val",
15 |             "sub_txt_db": "/txt/tv_subtitles.db",
16 |             "vfeat_db": "/video/tv",
17 |             "query_txt_db": "/txt/tvqa_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/hero-tv-ht100.pt",
23 |     "output_dir": "/storage/ST_PT_FT/tvqa_default",
24 |     "max_clip_len": 100,
25 |     "max_txt_len": 120,
26 |     "vfeat_version": "resnet_slowfast",
27 |     "vfeat_interval": 1.5,
28 |     "train_batch_size": 4,
29 |     "val_batch_size": 10,
30 |     "gradient_accumulation_steps": 2,
31 |     "learning_rate": 5e-05,
32 |     "valid_steps": 200,
33 |     "save_steps": 200,
34 |     "num_train_steps": 10000,
35 |     "optim": "adamw",
36 |     "betas": [
37 |         0.9,
38 |         0.98
39 |     ],
40 |     "dropout": 0.1,
41 |     "weight_decay": 0.01,
42 |     "lr_mul": 10.0,
43 |     "grad_norm": 1.0,
44 |     "warmup_steps": 1000,
45 |     "lw_st_ed": 0.4,
46 |     "sub_ctx_len": 0,
47 |     "seed": 77,
48 |     "no_fp16": false,
49 |     "n_workers": 4,
50 |     "no_pin_mem": false,
51 |     "rank": 0
52 | }
53 | 


--------------------------------------------------------------------------------
/config/train-tvr-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "vcmr",
 5 |             "name": "tvr_video_sub_train",
 6 |             "sub_txt_db": "/txt/tv_subtitles.db",
 7 |             "vfeat_db": "/video/tv",
 8 |             "query_txt_db": "/txt/tvr_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "vcmr",
14 |             "name": "tvr_video_sub_val",
15 |             "sub_txt_db": "/txt/tv_subtitles.db",
16 |             "vfeat_db": "/video/tv",
17 |             "query_txt_db": "/txt/tvr_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/hero-tv-ht100.pt",
23 |     "output_dir": "/storage/ST_PT_FT/tvr_default",
24 |     "eval_with_query_type": true,
25 |     "max_before_nms": 200,
26 |     "max_after_nms": 100,
27 |     "distributed_eval": true,
28 |     "nms_thd": -1,
29 |     "q2c_alpha": 20,
30 |     "max_vcmr_video": 100,
31 |     "full_eval_tasks": [
32 |         "VCMR",
33 |         "SVMR",
34 |         "VR"
35 |     ],
36 |     "max_clip_len": 100,
37 |     "max_txt_len": 60,
38 |     "vfeat_version": "resnet_slowfast",
39 |     "vfeat_interval": 1.5,
40 |     "min_pred_l": 2,
41 |     "max_pred_l": 16,
42 |     "drop_svmr_prob": 0.8,
43 |     "train_batch_size": 32,
44 |     "val_batch_size": 20,
45 |     "vcmr_eval_video_batch_size": 50,
46 |     "vcmr_eval_batch_size": 80,
47 |     "gradient_accumulation_steps":2,
48 |     "learning_rate": 1e-04,
49 |     "valid_steps": 200,
50 |     "save_steps": 200,
51 |     "num_train_steps": 5000,
52 |     "optim": "adamw",
53 |     "betas": [
54 |         0.9,
55 |         0.98
56 |     ],
57 |     "dropout": 0.1,
58 |     "weight_decay": 0.01,
59 |     "grad_norm": 1.0,
60 |     "warmup_steps": 500,
61 |     "lw_neg_q": 8.0,
62 |     "lw_neg_ctx": 8.0,
63 |     "lw_st_ed": 0.01,
64 |     "ranking_loss_type": "hinge",
65 |     "margin": 0.1,
66 |     "hard_pool_size": [
67 |         20
68 |     ],
69 |     "hard_neg_weights": [
70 |         10
71 |     ],
72 |     "hard_negative_start_step": [
73 |         2000
74 |     ],
75 |     "train_span_start_step": 0,
76 |     "sub_ctx_len": 0,
77 |     "use_all_neg": true,
78 |     "seed": 77,
79 |     "no_fp16": false,
80 |     "n_workers": 4,
81 |     "no_pin_mem": false,
82 |     "rank": 0
83 | }
84 | 


--------------------------------------------------------------------------------
/config/train-vatex_en_c-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoCap",
 5 |             "name": "vatex_en_c_video_sub_train",
 6 |             "sub_txt_db": "/txt/vatex_subtitles.db",
 7 |             "vfeat_db": "/video/vatex",
 8 |             "cap_txt_db": ["/txt/vatex_en_r_train.db", "/txt/vatex_en_r_val.db"]
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoCap",
14 |             "name": "vatex_en_c_video_sub_val",
15 |             "sub_txt_db": "/txt/vatex_subtitles.db",
16 |             "vfeat_db": "/video/vatex",
17 |             "gt_anno": "/txt/vatex_en_c_test_public_release.jsonl"
18 |         }
19 |     ],
20 |     "model_config": "config/model_config/hero_videoCap.json",
21 |     "checkpoint": "/pretrain/hero-tv-ht100.pt",
22 |     "output_dir": "/storage/ST_PT_FT/vatex_en_c_default",
23 |     "max_clip_len": 100,
24 |     "max_txt_len": 60,
25 |     "max_gen_step": 30,
26 |     "vfeat_version": "resnet_slowfast",
27 |     "vfeat_interval": 1.5,
28 |     "compressed_db": false,
29 |     "train_batch_size": 128,
30 |     "val_batch_size": 128,
31 |     "gradient_accumulation_steps": 1,
32 |     "learning_rate": 1e-4,
33 |     "lr_mul": 10.0,
34 |     "valid_steps": 500,
35 |     "num_train_steps": 7000,
36 |     "optim": "adamw",
37 |     "betas": [0.9, 0.98],
38 |     "lsr": 0.1,
39 |     "dropout": 0.1,
40 |     "weight_decay": 0.01,
41 |     "grad_norm": 1.0,
42 |     "warmup_steps": 700,
43 |     "sub_ctx_len": 1,
44 |     "seed": 77,
45 |     "no_fp16": false,
46 |     "n_workers": 4,
47 |     "pin_mem": true
48 | }
49 | 


--------------------------------------------------------------------------------
/config/train-vatex_en_r-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "vr",
 5 |             "name": "vatex_en_r_video_sub_train",
 6 |             "sub_txt_db": "/txt/vatex_subtitles.db/",
 7 |             "vfeat_db": "/video/vatex",
 8 |             "query_txt_db": "/txt/vatex_en_r_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "vr",
14 |             "name": "vatex_en_r_video_sub_val",
15 |             "sub_txt_db": "/txt/vatex_subtitles.db/",
16 |             "vfeat_db": "/video/vatex",
17 |             "query_txt_db": "/txt/vatex_en_r_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/hero-tv-ht100.pt",
23 |     "output_dir": "/storage/ST_PT_FT/vatex_en_r_default",
24 |     "distributed_eval": true,
25 |     "max_vr_video": 100,
26 |     "max_clip_len": 100,
27 |     "max_txt_len": 60,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "train_batch_size": 64,
31 |     "val_batch_size": 20,
32 |     "vr_eval_video_batch_size": 50,
33 |     "vr_eval_q_batch_size": 80,
34 |     "gradient_accumulation_steps": 2,
35 |     "learning_rate": 7e-05,
36 |     "valid_steps": 200,
37 |     "save_steps": 200,
38 |     "num_train_steps": 4000,
39 |     "optim": "adamw",
40 |     "betas": [
41 |         0.9,
42 |         0.98
43 |     ],
44 |     "dropout": 0.1,
45 |     "weight_decay": 0.01,
46 |     "grad_norm": 1.0,
47 |     "warmup_steps": 400,
48 |     "lw_neg_q": 10.0,
49 |     "lw_neg_ctx": 10.0,
50 |     "ranking_loss_type": "hinge",
51 |     "margin": 0.1,
52 |     "hard_pool_size": [
53 |         80
54 |     ],
55 |     "hard_neg_weights": [
56 |         10
57 |     ],
58 |     "hard_negative_start_step": [
59 |         2000
60 |     ],
61 |     "use_all_neg": true,
62 |     "sub_ctx_len": 1,
63 |     "seed": 77,
64 |     "no_fp16": false,
65 |     "n_workers": 4,
66 |     "no_pin_mem": false,
67 |     "rank": 0
68 | }
69 | 


--------------------------------------------------------------------------------
/config/train-violin-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "violin",
 5 |             "name": "violin_video_sub_train",
 6 |             "sub_txt_db": "/txt/violin_subtitles.db",
 7 |             "vfeat_db": "/video/violin",
 8 |             "query_txt_db": "/txt/violin_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "violin",
14 |             "name": "violin_video_sub_val",
15 |             "sub_txt_db": "/txt/violin_subtitles.db",
16 |             "vfeat_db": "/video/violin",
17 |             "query_txt_db": "/txt/violin_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/hero-tv-ht100.pt",
23 |     "output_dir": "/storage/ST_PT_FT/violin_default",
24 |     "max_clip_len": 100,
25 |     "max_txt_len": 120,
26 |     "vfeat_version": "resnet_slowfast",
27 |     "vfeat_interval": 1.5,
28 |     "train_batch_size": 4,
29 |     "val_batch_size": 10,
30 |     "gradient_accumulation_steps": 2,
31 |     "learning_rate": 3e-05,
32 |     "valid_steps": 200,
33 |     "save_steps": 200,
34 |     "num_train_steps": 6000,
35 |     "optim": "adamw",
36 |     "betas": [
37 |         0.9,
38 |         0.98
39 |     ],
40 |     "dropout": 0.1,
41 |     "weight_decay": 0.01,
42 |     "lr_mul": 8.0,
43 |     "grad_norm": 1.0,
44 |     "warmup_steps": 600,
45 |     "sub_ctx_len": 2,
46 |     "seed": 77,
47 |     "no_fp16": false,
48 |     "n_workers": 4,
49 |     "no_pin_mem": false,
50 |     "rank": 0
51 | }
52 | 


--------------------------------------------------------------------------------
/config/train-vlep-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoQA",
 5 |             "name": "vlep_video_sub_train",
 6 |             "sub_txt_db": "/txt/vlep_subtitles.db/",
 7 |             "vfeat_db": "/video/vlep",
 8 |             "query_txt_db": "/txt/vlep_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoQA",
14 |             "name": "vlep_video_sub_dev",
15 |             "sub_txt_db": "/txt/vlep_subtitles.db/",
16 |             "vfeat_db": "/video/vlep",
17 |             "query_txt_db": "/txt/vlep_dev.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/hero-tv-ht100.pt",
23 |     "output_dir": "/storage/ST_PT_FT/vlep_default",
24 |     "max_clip_len": 100,
25 |     "max_txt_len": 120,
26 |     "vfeat_version": "resnet_slowfast",
27 |     "vfeat_interval": 1.5,
28 |     "train_batch_size": 4,
29 |     "val_batch_size": 10,
30 |     "gradient_accumulation_steps": 2,
31 |     "learning_rate": 5e-05,
32 |     "valid_steps": 100,
33 |     "save_steps": 200,
34 |     "num_train_steps": 1000,
35 |     "optim": "adamw",
36 |     "betas": [
37 |         0.9,
38 |         0.98
39 |     ],
40 |     "dropout": 0.1,
41 |     "weight_decay": 0.01,
42 |     "lr_mul": 10.0,
43 |     "grad_norm": 1.0,
44 |     "warmup_steps": 100,
45 |     "lw_st_ed": 0.4,
46 |     "sub_ctx_len": 0,
47 |     "seed": 77,
48 |     "no_fp16": false,
49 |     "n_workers": 4,
50 |     "no_pin_mem": false,
51 |     "rank": 0
52 | }
53 | 


--------------------------------------------------------------------------------
/config/train-yc2c-8gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "videoCap",
 5 |             "name": "yc2c_video_sub_train",
 6 |             "sub_txt_db": "/txt/yc2_subtitles.db",
 7 |             "vfeat_db": "/video/yc2",
 8 |             "cap_txt_db": ["/txt/yc2r_train.db"]
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "videoCap",
14 |             "name": "yc2c_video_sub_val",
15 |             "sub_txt_db": "/txt/yc2_subtitles.db",
16 |             "vfeat_db": "/video/yc2",
17 |             "gt_anno": "/txt/yc2c_val_release.jsonl"
18 |         }
19 |     ],
20 |     "model_config": "config/model_config/hero_videoCap.json",
21 |     "checkpoint": "/pretrain/hero-tv-ht100.pt",
22 |     "output_dir": "/storage/ST_PT_FT/yc2c_default",
23 |     "max_clip_len": 100,
24 |     "max_txt_len": 60,
25 |     "max_gen_step": 30,
26 |     "vfeat_version": "resnet_slowfast",
27 |     "vfeat_interval": 1.5,
28 |     "compressed_db": false,
29 |     "train_batch_size": 16,
30 |     "val_batch_size": 16,
31 |     "gradient_accumulation_steps": 1,
32 |     "learning_rate": 1e-4,
33 |     "lr_mul": 10.0,
34 |     "valid_steps": 500,
35 |     "num_train_steps": 7000,
36 |     "optim": "adamw",
37 |     "betas": [0.9, 0.98],
38 |     "lsr": 0.1,
39 |     "dropout": 0.1,
40 |     "weight_decay": 0.01,
41 |     "grad_norm": 1.0,
42 |     "warmup_steps": 700,
43 |     "sub_ctx_len": 1,
44 |     "seed": 77,
45 |     "no_fp16": false,
46 |     "n_workers": 4,
47 |     "pin_mem": true
48 | }
49 | 


--------------------------------------------------------------------------------
/config/train-yc2r-4gpu.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_datasets": [
 3 |         {
 4 |             "task": "vr",
 5 |             "name": "yc2r_video_sub_train",
 6 |             "sub_txt_db": "/txt/yc2_subtitles.db/",
 7 |             "vfeat_db": "/video/yc2",
 8 |             "query_txt_db": "/txt/yc2r_train.db"
 9 |         }
10 |     ],
11 |     "val_datasets": [
12 |         {
13 |             "task": "vr",
14 |             "name": "yc2r_video_sub_val",
15 |             "sub_txt_db": "/txt/yc2_subtitles.db/",
16 |             "vfeat_db": "/video/yc2",
17 |             "query_txt_db": "/txt/yc2r_val.db"
18 |         }
19 |     ],
20 |     "compressed_db": false,
21 |     "model_config": "config/model_config/hero_finetune.json",
22 |     "checkpoint": "/pretrain/hero-tv-ht100.pt",
23 |     "output_dir": "/storage/ST_PT_FT/yc2r_video_sub_default",
24 |     "distributed_eval": true,
25 |     "max_vr_video": 100,
26 |     "max_clip_len": 100,
27 |     "max_txt_len": 60,
28 |     "vfeat_version": "resnet_slowfast",
29 |     "vfeat_interval": 1.5,
30 |     "train_batch_size": 48,
31 |     "val_batch_size": 20,
32 |     "vr_eval_video_batch_size": 50,
33 |     "vr_eval_q_batch_size": 80,
34 |     "gradient_accumulation_steps": 2,
35 |     "learning_rate": 7e-05,
36 |     "valid_steps": 200,
37 |     "save_steps": 200,
38 |     "num_train_steps": 4000,
39 |     "optim": "adamw",
40 |     "betas": [
41 |         0.9,
42 |         0.98
43 |     ],
44 |     "dropout": 0.1,
45 |     "weight_decay": 0.01,
46 |     "grad_norm": 1.0,
47 |     "warmup_steps": 400,
48 |     "lw_neg_q": 10.0,
49 |     "lw_neg_ctx": 10.0,
50 |     "ranking_loss_type": "hinge",
51 |     "margin": 0.1,
52 |     "hard_pool_size": [
53 |         80
54 |     ],
55 |     "hard_neg_weights": [
56 |         10
57 |     ],
58 |     "hard_negative_start_step": [
59 |         2000
60 |     ],
61 |     "use_all_neg": true,
62 |     "sub_ctx_len": 1,
63 |     "seed": 77,
64 |     "no_fp16": false,
65 |     "n_workers": 4,
66 |     "no_pin_mem": false,
67 |     "rank": 0
68 | }
69 | 


--------------------------------------------------------------------------------
/data/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) Microsoft Corporation.
 3 | Licensed under the MIT license.
 4 | 
 5 | """
 6 | from .data import (
 7 |     TxtTokLmdb, VideoFeatLmdb, SubTokLmdb,
 8 |     QueryTokLmdb, VideoFeatSubTokDataset, video_collate,
 9 |     VideoFeatDataset, QaQueryTokLmdb, SubOnlyDataset)
10 | from .loader import PrefetchLoader, MetaLoader
11 | from .vcmr import (
12 |     VcmrDataset, vcmr_collate, VcmrEvalDataset, vcmr_eval_collate,
13 |     VcmrFullEvalDataset, vcmr_full_eval_collate,
14 |     VcmrVideoOnlyDataset, VcmrVideoOnlyEvalDataset,
15 |     VcmrVideoOnlyFullEvalDataset,
16 |     VcmrSubOnlyDataset, VcmrSubOnlyEvalDataset,
17 |     VcmrSubOnlyFullEvalDataset)
18 | from .vr import (
19 |     VrDataset, VrEvalDataset, VrSubTokLmdb, VrQueryTokLmdb,
20 |     MsrvttQueryTokLmdb,
21 |     VrFullEvalDataset, vr_collate, vr_eval_collate,
22 |     vr_full_eval_collate,
23 |     VrVideoOnlyDataset, VrVideoOnlyEvalDataset,
24 |     VrVideoOnlyFullEvalDataset,
25 |     VrSubOnlyDataset, VrSubOnlyEvalDataset,
26 |     VrSubOnlyFullEvalDataset)
27 | from .videoQA import (
28 |     VideoQaDataset, video_qa_collate,
29 |     VideoQaEvalDataset, video_qa_eval_collate,
30 |     VideoQaVideoOnlyDataset, VideoQaVideoOnlyEvalDataset,
31 |     VideoQaSubOnlyDataset, VideoQaSubOnlyEvalDataset)
32 | from .vlep import (
33 |     VlepDataset, vlep_collate,
34 |     VlepEvalDataset, vlep_eval_collate,
35 |     VlepVideoOnlyDataset, VlepVideoOnlyEvalDataset,
36 |     VlepSubOnlyDataset, VlepSubOnlyEvalDataset)
37 | from .violin import (
38 |     ViolinDataset, violin_collate,
39 |     ViolinEvalDataset, violin_eval_collate,
40 |     ViolinVideoOnlyDataset, ViolinVideoOnlyEvalDataset,
41 |     ViolinSubOnlyDataset, ViolinSubOnlyEvalDataset)
42 | from .fom import (
43 |     FomDataset, fom_collate,
44 |     FomEvalDataset, fom_eval_collate)
45 | from .vsm import VsmDataset, vsm_collate
46 | from .mlm import (
47 |     VideoMlmDataset, mlm_collate)
48 | from .mfm import MfmDataset, mfm_collate
49 | from .videoCap import (VideoCapTrainDataset, VideoCapValDataset,
50 |                        CaptionTokLmdb,
51 |                        VideoCapEvalDataset,
52 |                        VideoCapVideoOnlyTrainDataset,
53 |                        VideoCapVideoOnlyValDataset,
54 |                        VideoCapVideoOnlyEvalDataset,
55 |                        VideoCapSubOnlyTrainDataset,
56 |                        VideoCapSubOnlyValDataset,
57 |                        VideoCapSubOnlyEvalDataset)
58 | from .tvc import (
59 |     TvcTrainDataset, TvcValDataset, TvcTokLmdb,
60 |     TvcEvalDataset,
61 |     TvcVideoOnlyValDataset, TvcVideoOnlyTrainDataset,
62 |     TvcVideoOnlyEvalDataset,
63 |     TvcSubOnlyValDataset, TvcSubOnlyTrainDataset,
64 |     TvcSubOnlyEvalDataset)
65 | 


--------------------------------------------------------------------------------
/data/fom.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Copyright (c) Microsoft Corporation.
  3 | Licensed under the MIT license.
  4 | 
  5 | Pretrain FOM  dataset
  6 | 
  7 | copied/modified from HERO
  8 | (https://github.com/linjieli222/HERO)
  9 | """
 10 | import copy
 11 | import random
 12 | 
 13 | from torch.utils.data import Dataset
 14 | import torch
 15 | from toolz.sandbox import unzip
 16 | import horovod.torch as hvd
 17 | 
 18 | from .data import VideoFeatSubTokDataset, _check_ngpu, video_collate
 19 | 
 20 | 
 21 | class FomDataset(Dataset):
 22 |     def __init__(self, video_ids, vid_sub_db, random_reorder_p=0.15):
 23 |         assert isinstance(vid_sub_db, VideoFeatSubTokDataset)
 24 |         self.vid_sub_db = vid_sub_db
 25 |         if _check_ngpu() > 1:
 26 |             self.ids = video_ids[hvd.rank()::hvd.size()]
 27 |         else:
 28 |             self.ids = video_ids
 29 |         self.random_reorder_p = random_reorder_p
 30 | 
 31 |     def __len__(self):
 32 |         return len(self.ids)
 33 | 
 34 |     def __getitem__(self, i):
 35 |         vid_ = self.ids[i]
 36 |         (f_sub_input_ids, f_v_feats, f_attn_masks,
 37 |          c_v_feats, c_attn_masks,
 38 |          num_subs, sub2frames) = self.vid_sub_db[vid_]
 39 |         c_pos_ids = [i for i in range(len(c_v_feats))]
 40 |         # Random shuffle 15% of pos_ids
 41 |         orders, targets = random_reorder(
 42 |             list(range(len(c_pos_ids))), self.random_reorder_p)
 43 |         orders = torch.tensor(orders, dtype=torch.long)
 44 |         targets = torch.tensor(targets, dtype=torch.long)
 45 |         video_inputs = (
 46 |             f_sub_input_ids, f_v_feats, f_attn_masks,
 47 |             c_v_feats, c_attn_masks,
 48 |             num_subs, sub2frames)
 49 |         out = (video_inputs, orders, targets)
 50 |         return out
 51 | 
 52 | 
 53 | def fom_collate(inputs):
 54 |     (video_inputs, orders, targets) = map(list, unzip(inputs))
 55 |     batch = video_collate(video_inputs)
 56 | 
 57 |     clip_level_v_feats = batch["c_v_feats"]
 58 |     num_frames = [item.size(0) for item in orders]
 59 | 
 60 |     all_orders = torch.arange(
 61 |         0, clip_level_v_feats.size(1), dtype=torch.long).unsqueeze(0).repeat(
 62 |             clip_level_v_feats.size(0), 1)
 63 |     all_targets = torch.ones_like(all_orders) * -1
 64 |     for i, nframe in enumerate(num_frames):
 65 |         all_orders[i, :nframe] = orders[i]
 66 |         all_targets[i, :nframe] = targets[i]
 67 |     reordered_frame_idx = []
 68 |     binary_targets = []
 69 |     bs, max_vl = all_orders.size()
 70 |     for clip_idx in range(bs):
 71 |         for i in range(num_frames[clip_idx]):
 72 |             if all_targets[clip_idx, i] == -1:
 73 |                 continue
 74 |             for j in range(i+1, num_frames[clip_idx]):
 75 |                 if all_targets[clip_idx, j] == -1:
 76 |                     continue
 77 |                 reordered_frame_idx.append(clip_idx*max_vl+i)
 78 |                 reordered_frame_idx.append(clip_idx*max_vl+j)
 79 |                 if all_targets[clip_idx, i] > all_targets[clip_idx, j]:
 80 |                     binary_targets.append(0)
 81 |                 else:
 82 |                     binary_targets.append(1)
 83 | 
 84 |                 reordered_frame_idx.append(clip_idx*max_vl+j)
 85 |                 reordered_frame_idx.append(clip_idx*max_vl+i)
 86 |                 if all_targets[clip_idx, j] > all_targets[clip_idx, i]:
 87 |                     binary_targets.append(0)
 88 |                 else:
 89 |                     binary_targets.append(1)
 90 |     reordered_frame_idx = torch.tensor(reordered_frame_idx, dtype=torch.long)
 91 |     binary_targets = torch.tensor(binary_targets, dtype=torch.long)
 92 |     batch["shuffled_orders"] = all_orders
 93 |     batch["targets"] = all_targets
 94 |     batch['reordered_frame_idx'] = reordered_frame_idx
 95 |     batch['binary_targets'] = binary_targets
 96 |     return batch
 97 | 
 98 | 
 99 | def random_reorder(pos_ids, random_reorder_p=0.15):
100 |     """
101 |     random reorder frame positions
102 |     """
103 |     selected_pos = []
104 |     target_pos = []
105 |     for i, pos_id in enumerate(pos_ids):
106 |         prob = random.random()
107 |         # mask token with 15% probability
108 |         if prob < random_reorder_p:
109 |             selected_pos.append(i)
110 |             target_pos.append(pos_id)
111 |     target_pos_shuffled = copy.deepcopy(target_pos)
112 |     random.shuffle(target_pos_shuffled)
113 |     output_order = copy.deepcopy(pos_ids)
114 |     output_target = [-1] * len(output_order)
115 |     for i, pos in enumerate(selected_pos):
116 |         output_order[pos] = target_pos_shuffled[i]
117 |         output_target[target_pos_shuffled[i]] = pos
118 |     return output_order, output_target
119 | 
120 | 
121 | class FomEvalDataset(FomDataset):
122 |     def __getitem__(self, i):
123 |         vid = self.ids[i]
124 |         tensors = super().__getitem__(i)
125 |         return (vid, *tensors)
126 | 
127 | 
128 | def fom_eval_collate(inputs):
129 |     vids, batch = [], []
130 |     for id_, *tensors in inputs:
131 |         vids.append(id_)
132 |         batch.append(tensors)
133 |     batch = fom_collate(batch)
134 |     batch['vids'] = vids
135 |     return batch
136 | 


--------------------------------------------------------------------------------
/data/mfm.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Copyright (c) Microsoft Corporation.
  3 | Licensed under the MIT license.
  4 | 
  5 | Pretrain MFM  dataset
  6 | 
  7 | copied/modified from HERO
  8 | (https://github.com/linjieli222/HERO)
  9 | """
 10 | import random
 11 | 
 12 | import torch
 13 | from torch.nn.utils.rnn import pad_sequence
 14 | from torch.utils.data import Dataset
 15 | from toolz.sandbox import unzip
 16 | from cytoolz import concat
 17 | import horovod.torch as hvd
 18 | 
 19 | from .data import VideoFeatSubTokDataset, video_collate, _check_ngpu
 20 | 
 21 | 
 22 | def _get_img_mask(mask_prob, num_frame):
 23 |     img_mask = [random.random() < mask_prob for _ in range(num_frame)]
 24 |     if not any(img_mask):
 25 |         # at least mask 1
 26 |         img_mask[random.choice(range(num_frame))] = True
 27 |     img_mask = torch.tensor(img_mask)
 28 |     return img_mask
 29 | 
 30 | 
 31 | def _get_feat_target(img_feat, img_masks):
 32 |     img_masks_ext = img_masks.unsqueeze(-1).expand_as(img_feat)  # (n, m, d)
 33 |     feat_dim = img_feat.size(-1)
 34 |     feat_targets = img_feat[img_masks_ext].contiguous().view(
 35 |         -1, feat_dim)  # (s, d)
 36 |     return feat_targets
 37 | 
 38 | 
 39 | def _mask_img_feat(img_feat, img_masks):
 40 |     img_masks_ext = img_masks.unsqueeze(-1).expand_as(img_feat)
 41 |     img_feat_masked = img_feat.data.masked_fill(img_masks_ext, 0)
 42 |     return img_feat_masked
 43 | 
 44 | 
 45 | class MfmDataset(Dataset):
 46 |     def __init__(self, video_ids, vid_sub_db, mask_prob=0.15):
 47 |         assert isinstance(vid_sub_db, VideoFeatSubTokDataset)
 48 |         self.mask_prob = mask_prob
 49 |         self.vid_sub_db = vid_sub_db
 50 |         if _check_ngpu() > 1:
 51 |             self.ids = video_ids[hvd.rank()::hvd.size()]
 52 |         else:
 53 |             self.ids = video_ids
 54 | 
 55 |     def __len__(self):
 56 |         return len(self.ids)
 57 | 
 58 |     def __getitem__(self, i):
 59 |         vid = self.ids[i]
 60 |         (all_input_ids, f_v_feats, f_attn_masks,
 61 |          c_v_feats, c_attn_masks,
 62 |          num_subs, sub2frames) = self.vid_sub_db[vid]
 63 | 
 64 |         c_frame_mask = _get_img_mask(self.mask_prob, c_v_feats.size(0))
 65 |         frame_masks = []
 66 |         for i, frames in sub2frames:
 67 |             if len(frames):
 68 |                 frame_masks.append(
 69 |                     c_frame_mask.index_select(0, torch.tensor(frames)))
 70 |             else:
 71 |                 frame_masks.append(torch.zeros(1, dtype=torch.bool))
 72 |         c_pos_ids = torch.tensor(range(len(c_v_feats)), dtype=torch.long)
 73 |         c_frame_mask = c_frame_mask.index_select(0, c_pos_ids)
 74 |         return ((all_input_ids, f_v_feats, f_attn_masks,
 75 |                  c_v_feats, c_attn_masks,
 76 |                  num_subs, sub2frames),
 77 |                 frame_masks, c_frame_mask)
 78 | 
 79 | 
 80 | def mfm_collate(inputs):
 81 |     video_inputs, all_frame_masks, c_frame_masks = map(list, unzip(inputs))
 82 |     batch = video_collate(video_inputs)
 83 | 
 84 |     # mask features
 85 |     frame_masks = pad_sequence(list(concat(all_frame_masks)),
 86 |                                batch_first=True, padding_value=0)
 87 |     c_frame_masks = pad_sequence(c_frame_masks,
 88 |                                  batch_first=True, padding_value=0)
 89 |     f_v_feats = batch['f_v_feats']
 90 |     f_v_feats = _mask_img_feat(f_v_feats, frame_masks)
 91 |     c_v_feats = batch['c_v_feats']
 92 |     feat_targets = _get_feat_target(c_v_feats, c_frame_masks)
 93 |     c_v_feats = _mask_img_feat(c_v_feats, c_frame_masks)
 94 | 
 95 |     batch['f_v_feats'] = f_v_feats
 96 |     batch['f_v_masks'] = frame_masks
 97 |     batch['c_v_feats'] = c_v_feats
 98 |     batch['c_v_masks'] = c_frame_masks
 99 |     batch['feat_targets'] = feat_targets
100 |     return batch
101 | 


--------------------------------------------------------------------------------
/data/vlep.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Copyright (c) Microsoft Corporation.
  3 | Licensed under the MIT license.
  4 | 
  5 | VLEP dataset
  6 | 
  7 | copied/modified from HERO
  8 | (https://github.com/linjieli222/HERO)
  9 | """
 10 | import torch
 11 | from .videoQA import (
 12 |     VideoQaDataset, video_qa_eval_collate, video_qa_collate)
 13 | from .data import (SubOnlyDataset, VideoFeatDataset, QaQueryTokLmdb)
 14 | 
 15 | 
 16 | class VlepDataset(VideoQaDataset):
 17 | 
 18 |     def __getitem__(self, i):
 19 |         vid, qids = self.getids(i)
 20 |         video_inputs = self.video_db.__getitem__(vid)
 21 |         (frame_level_input_ids, frame_level_v_feats,
 22 |          frame_level_attn_masks, frame_level_sub_attn_masks,
 23 |          clip_level_v_feats, clip_level_attn_masks, num_subs,
 24 |          sub_idx2frame_idx) = video_inputs
 25 |         nframes = len(clip_level_v_feats)
 26 | 
 27 |         all_vids = []
 28 |         all_targets = []
 29 |         all_ts_targets = []
 30 |         all_qa_input_ids = []
 31 |         all_qa_attn_masks = []
 32 |         all_video_qa_inputs = []
 33 |         for qid in qids:
 34 |             example = self.query_db[qid]
 35 |             if example['target'] is not None:
 36 |                 target = torch.LongTensor([example['target']])
 37 |             else:
 38 |                 target = torch.LongTensor([-1])
 39 |             if example['ts'] is not None:
 40 |                 st_idx, ed_idx = self.get_st_ed_label(
 41 |                     example['ts'], max_idx=nframes-1)
 42 |                 ts_target = torch.LongTensor(
 43 |                     [st_idx, ed_idx])
 44 |             else:
 45 |                 ts_target = torch.LongTensor([-1, -1])
 46 | 
 47 |             input_ids = example["input_ids"]
 48 |             for a_input_ids in input_ids:
 49 |                 f_sub_qa_input_ids = []
 50 |                 f_sub_qa_attn_masks = []
 51 |                 sub_qa_attn_masks = []
 52 |                 curr_qa_input_id = torch.tensor(
 53 |                     [self.query_db.sep] + a_input_ids)
 54 |                 curr_qa_attn_masks = torch.tensor([1]*len(curr_qa_input_id))
 55 |                 all_qa_input_ids.append(curr_qa_input_id)
 56 |                 all_qa_attn_masks.append(curr_qa_attn_masks)
 57 |                 for f_sub_input_ids, f_attn_masks, sub_attn_masks in zip(
 58 |                         frame_level_input_ids, frame_level_attn_masks,
 59 |                         frame_level_sub_attn_masks):
 60 |                     curr_f_sub_qa_input_ids = torch.cat((
 61 |                         f_sub_input_ids, curr_qa_input_id))
 62 |                     curr_f_sub_qa_attn_masks = torch.cat((
 63 |                         f_attn_masks, curr_qa_attn_masks))
 64 |                     curr_sub_qa_attn_masks = torch.cat(
 65 |                         (sub_attn_masks, curr_qa_attn_masks))
 66 |                     f_sub_qa_input_ids.append(curr_f_sub_qa_input_ids)
 67 |                     f_sub_qa_attn_masks.append(curr_f_sub_qa_attn_masks)
 68 |                     sub_qa_attn_masks.append(curr_sub_qa_attn_masks)
 69 |                 curr_video_qa_inputs = (
 70 |                     f_sub_qa_input_ids, frame_level_v_feats,
 71 |                     f_sub_qa_attn_masks, sub_qa_attn_masks,
 72 |                     clip_level_v_feats, clip_level_attn_masks, num_subs,
 73 |                     sub_idx2frame_idx)
 74 |                 all_video_qa_inputs.append(curr_video_qa_inputs)
 75 |             all_vids.append(vid)
 76 |             all_targets.append(target)
 77 |             all_ts_targets.append(ts_target)
 78 |         out = (all_video_qa_inputs, all_qa_input_ids, all_qa_attn_masks,
 79 |                all_vids, all_targets, all_ts_targets)
 80 |         return out
 81 | 
 82 | 
 83 | vlep_collate = video_qa_collate
 84 | 
 85 | 
 86 | class VlepEvalDataset(VlepDataset):
 87 |     def __getitem__(self, i):
 88 |         vid, qids = self.getids(i)
 89 |         outs = super().__getitem__(i)
 90 |         return qids, outs
 91 | 
 92 | 
 93 | vlep_eval_collate = video_qa_eval_collate
 94 | 
 95 | 
 96 | class VlepVideoOnlyDataset(VlepDataset):
 97 |     def __validate_input_db__(self):
 98 |         assert isinstance(self.query_db, QaQueryTokLmdb)
 99 |         assert isinstance(self.video_db, VideoFeatDataset)
100 | 
101 | 
102 | class VlepVideoOnlyEvalDataset(VlepVideoOnlyDataset):
103 |     def __getitem__(self, i):
104 |         vid, qids = self.getids(i)
105 |         outs = super().__getitem__(i)
106 |         return qids, outs
107 | 
108 | 
109 | class VlepSubOnlyDataset(VlepDataset):
110 |     def __validate_input_db__(self):
111 |         assert isinstance(self.query_db, QaQueryTokLmdb)
112 |         assert isinstance(self.video_db, SubOnlyDataset)
113 | 
114 | 
115 | class VlepSubOnlyEvalDataset(VlepSubOnlyDataset):
116 |     def __getitem__(self, i):
117 |         vid, qids = self.getids(i)
118 |         outs = super().__getitem__(i)
119 |         return qids, outs
120 | 


--------------------------------------------------------------------------------
/eval/pycocoevalcap/README.md:
--------------------------------------------------------------------------------
1 | # coco-caption
2 | 
3 | Original README can be found at [tylin/coco-caption](https://github.com/tylin/coco-caption/blob/3f0fe9b819c0ea881a56441e4de1146924a394eb/README.md).
4 | 
5 | ## License
6 | 
7 | All files in the pycocoevalcap directory are under
8 | [BSD 2-clause "Simplified" License](https://github.com/tylin/coco-caption/blob/3f0fe9b819c0ea881a56441e4de1146924a394eb/license.txt)
9 | 


--------------------------------------------------------------------------------
/eval/pycocoevalcap/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 | 


--------------------------------------------------------------------------------
/eval/pycocoevalcap/bleu/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/eval/pycocoevalcap/bleu/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 | 


--------------------------------------------------------------------------------
/eval/pycocoevalcap/bleu/bleu.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # 
 3 | # File Name : bleu.py
 4 | #
 5 | # Description : Wrapper for BLEU scorer.
 6 | #
 7 | # Creation Date : 06-01-2015
 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT
 9 | # Authors : Hao Fang <hfang@uw.edu> and Tsung-Yi Lin <tl483@cornell.edu>
10 | 
11 | from .bleu_scorer import BleuScorer
12 | 
13 | 
14 | class Bleu:
15 |     def __init__(self, n=4):
16 |         # default compute Blue score up to 4
17 |         self._n = n
18 |         self._hypo_for_image = {}
19 |         self.ref_for_image = {}
20 | 
21 |     def compute_score(self, gts, res):
22 | 
23 |         assert(gts.keys() == res.keys())
24 |         imgIds = gts.keys()
25 | 
26 |         bleu_scorer = BleuScorer(n=self._n)
27 |         for id in imgIds:
28 |             hypo = res[id]
29 |             ref = gts[id]
30 | 
31 |             # Sanity check.
32 |             assert(type(hypo) is list)
33 |             assert(len(hypo) == 1)
34 |             assert(type(ref) is list)
35 |             assert(len(ref) >= 1)
36 | 
37 |             bleu_scorer += (hypo[0], ref)
38 | 
39 |         #score, scores = bleu_scorer.compute_score(option='shortest')
40 |         score, scores = bleu_scorer.compute_score(option='closest', verbose=0)
41 |         #score, scores = bleu_scorer.compute_score(option='average', verbose=1)
42 | 
43 |         # return (bleu, bleu_info)
44 |         return score, scores
45 | 
46 |     def method(self):
47 |         return "Bleu"
48 | 


--------------------------------------------------------------------------------
/eval/pycocoevalcap/cider/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 | 


--------------------------------------------------------------------------------
/eval/pycocoevalcap/cider/cider.py:
--------------------------------------------------------------------------------
 1 | # Filename: cider.py
 2 | #
 3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 
 4 | #               by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
 5 | #
 6 | # Creation Date: Sun Feb  8 14:16:54 2015
 7 | #
 8 | # Authors: Ramakrishna Vedantam <vrama91@vt.edu> and Tsung-Yi Lin <tl483@cornell.edu>
 9 | 
10 | from .cider_scorer import CiderScorer
11 | import pdb
12 | 
13 | class Cider:
14 |     """
15 |     Main Class to compute the CIDEr metric 
16 | 
17 |     """
18 |     def __init__(self, test=None, refs=None, n=4, sigma=6.0):
19 |         # set cider to sum over 1 to 4-grams
20 |         self._n = n
21 |         # set the standard deviation parameter for gaussian penalty
22 |         self._sigma = sigma
23 | 
24 |     def compute_score(self, gts, res):
25 |         """
26 |         Main function to compute CIDEr score
27 |         :param  hypo_for_image (dict) : dictionary with key <image> and value <tokenized hypothesis / candidate sentence>
28 |                 ref_for_image (dict)  : dictionary with key <image> and value <tokenized reference sentence>
29 |         :return: cider (float) : computed CIDEr score for the corpus 
30 |         """
31 | 
32 |         assert(gts.keys() == res.keys())
33 |         imgIds = gts.keys()
34 | 
35 |         cider_scorer = CiderScorer(n=self._n, sigma=self._sigma)
36 | 
37 |         for id in imgIds:
38 |             hypo = res[id]
39 |             ref = gts[id]
40 | 
41 |             # Sanity check.
42 |             assert(type(hypo) is list)
43 |             assert(len(hypo) == 1)
44 |             assert(type(ref) is list)
45 |             assert(len(ref) > 0)
46 | 
47 |             cider_scorer += (hypo[0], ref)
48 | 
49 |         (score, scores) = cider_scorer.compute_score()
50 | 
51 |         return score, scores
52 | 
53 |     def method(self):
54 |         return "CIDEr"
55 | 


--------------------------------------------------------------------------------
/eval/pycocoevalcap/license.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2015, Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | 1. Redistributions of source code must retain the above copyright notice, this
 8 |    list of conditions and the following disclaimer.
 9 | 2. Redistributions in binary form must reproduce the above copyright notice,
10 |    this list of conditions and the following disclaimer in the documentation
11 |    and/or other materials provided with the distribution.
12 | 
13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 | 
24 | The views and conclusions contained in the software and documentation are those
25 | of the authors and should not be interpreted as representing official policies,
26 | either expressed or implied, of the FreeBSD Project.
27 | 


--------------------------------------------------------------------------------
/eval/pycocoevalcap/meteor/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 | 


--------------------------------------------------------------------------------
/eval/pycocoevalcap/meteor/tests/test_meteor.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from __future__ import unicode_literals
 3 | 
 4 | import unittest
 5 | 
 6 | from nlgeval.pycocoevalcap.meteor.meteor import Meteor
 7 | 
 8 | 
 9 | class TestMeteor(unittest.TestCase):
10 |     def test_compute_score(self):
11 |         m = Meteor()
12 | 
13 |         s = m.compute_score({0: ["test"]}, {0: ["test"]})
14 |         self.assertEqual(s, (1.0, [1.0]))
15 | 
16 |         s = m.compute_score({0: ["テスト"]}, {0: ["テスト"]})
17 |         self.assertEqual(s, (1.0, [1.0]))
18 | 


--------------------------------------------------------------------------------
/eval/pycocoevalcap/rouge/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'vrama91'
2 | 


--------------------------------------------------------------------------------
/eval/pycocoevalcap/rouge/rouge.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # 
  3 | # File Name : rouge.py
  4 | #
  5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004)
  6 | #
  7 | # Creation Date : 2015-01-07 06:03
  8 | # Author : Ramakrishna Vedantam <vrama91@vt.edu>
  9 | 
 10 | import numpy as np
 11 | import pdb
 12 | 
 13 | def my_lcs(string, sub):
 14 |     """
 15 |     Calculates longest common subsequence for a pair of tokenized strings
 16 |     :param string : list of str : tokens from a string split using whitespace
 17 |     :param sub : list of str : shorter string, also split using whitespace
 18 |     :returns: length (list of int): length of the longest common subsequence between the two strings
 19 | 
 20 |     Note: my_lcs only gives length of the longest common subsequence, not the actual LCS
 21 |     """
 22 |     if(len(string)< len(sub)):
 23 |         sub, string = string, sub
 24 | 
 25 |     lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)]
 26 | 
 27 |     for j in range(1,len(sub)+1):
 28 |         for i in range(1,len(string)+1):
 29 |             if(string[i-1] == sub[j-1]):
 30 |                 lengths[i][j] = lengths[i-1][j-1] + 1
 31 |             else:
 32 |                 lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1])
 33 | 
 34 |     return lengths[len(string)][len(sub)]
 35 | 
 36 | class Rouge():
 37 |     '''
 38 |     Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
 39 | 
 40 |     '''
 41 |     def __init__(self):
 42 |         # vrama91: updated the value below based on discussion with Hovey
 43 |         self.beta = 1.2
 44 | 
 45 |     def calc_score(self, candidate, refs):
 46 |         """
 47 |         Compute ROUGE-L score given one candidate and references for an image
 48 |         :param candidate: str : candidate sentence to be evaluated
 49 |         :param refs: list of str : COCO reference sentences for the particular image to be evaluated
 50 |         :returns score: int (ROUGE-L score for the candidate evaluated against references)
 51 |         """
 52 |         assert(len(candidate)==1)	
 53 |         assert(len(refs)>0)         
 54 |         prec = []
 55 |         rec = []
 56 | 
 57 |         # split into tokens
 58 |         token_c = candidate[0].split(" ")
 59 |     	
 60 |         for reference in refs:
 61 |             # split into tokens
 62 |             token_r = reference.split(" ")
 63 |             # compute the longest common subsequence
 64 |             lcs = my_lcs(token_r, token_c)
 65 |             prec.append(lcs/float(len(token_c)))
 66 |             rec.append(lcs/float(len(token_r)))
 67 | 
 68 |         prec_max = max(prec)
 69 |         rec_max = max(rec)
 70 | 
 71 |         if(prec_max!=0 and rec_max !=0):
 72 |             score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max)
 73 |         else:
 74 |             score = 0.0
 75 |         return score
 76 | 
 77 |     def compute_score(self, gts, res):
 78 |         """
 79 |         Computes Rouge-L score given a set of reference and candidate sentences for the dataset
 80 |         Invoked by evaluate_captions.py 
 81 |         :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 
 82 |         :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values
 83 |         :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images)
 84 |         """
 85 |         assert(gts.keys() == res.keys())
 86 |         imgIds = gts.keys()
 87 | 
 88 |         score = []
 89 |         for id in imgIds:
 90 |             hypo = res[id]
 91 |             ref  = gts[id]
 92 | 
 93 |             score.append(self.calc_score(hypo, ref))
 94 | 
 95 |             # Sanity check.
 96 |             assert(type(hypo) is list)
 97 |             assert(len(hypo) == 1)
 98 |             assert(type(ref) is list)
 99 |             assert(len(ref) > 0)
100 | 
101 |         average_score = np.mean(np.array(score))
102 |         return average_score, np.array(score)
103 | 
104 |     def method(self):
105 |         return "Rouge"
106 | 


--------------------------------------------------------------------------------
/eval/pycocoevalcap/tokenizer/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'hfang'
2 | 


--------------------------------------------------------------------------------
/eval/pycocoevalcap/tokenizer/ptbtokenizer.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # File Name : ptbtokenizer.py
 4 | #
 5 | # Description : Do the PTB Tokenization and remove punctuations.
 6 | #
 7 | # Creation Date : 29-12-2014
 8 | # Last Modified : Thu Mar 19 09:53:35 2015
 9 | # Authors : Hao Fang <hfang@uw.edu> and Tsung-Yi Lin <tl483@cornell.edu>
10 | 
11 | import os
12 | import subprocess
13 | import tempfile
14 | 
15 | # path to the stanford corenlp jar
16 | STANFORD_CORENLP_3_4_1_JAR = ('/workspace/cococap_bin/'
17 |                               'stanford-corenlp-3.4.1.jar')
18 | 
19 | # punctuations to be removed from the sentences
20 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \
21 |         ".", "?", "!", ",", ":", "-", "--", "...", ";"]
22 | 
23 | class PTBTokenizer:
24 |     """Python wrapper of Stanford PTBTokenizer"""
25 | 
26 |     def tokenize(self, captions_for_image):
27 |         cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \
28 |                 'edu.stanford.nlp.process.PTBTokenizer', \
29 |                 '-preserveLines', '-lowerCase']
30 | 
31 |         # ======================================================
32 |         # prepare data for PTB Tokenizer
33 |         # ======================================================
34 |         final_tokenized_captions_for_image = {}
35 |         image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))]
36 |         sentences = '\n'.join([c.replace('\n', ' ') for k, v in captions_for_image.items() for c in v])
37 | 
38 |         # ======================================================
39 |         # save sentences to temporary file
40 |         # ======================================================
41 |         path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__))
42 |         tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname)
43 |         tmp_file.write(sentences.encode())
44 |         tmp_file.close()
45 | 
46 |         # ======================================================
47 |         # tokenize sentence
48 |         # ======================================================
49 |         cmd.append(os.path.basename(tmp_file.name))
50 |         p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \
51 |                 stdout=subprocess.PIPE)
52 |         token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0]
53 |         token_lines = token_lines.decode()
54 |         lines = token_lines.split('\n')
55 |         # remove temp file
56 |         os.remove(tmp_file.name)
57 | 
58 |         # ======================================================
59 |         # create dictionary for tokenized captions
60 |         # ======================================================
61 |         for k, line in zip(image_id, lines):
62 |             if not k in final_tokenized_captions_for_image:
63 |                 final_tokenized_captions_for_image[k] = []
64 |             tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \
65 |                     if w not in PUNCTUATIONS])
66 |             final_tokenized_captions_for_image[k].append(tokenized_caption)
67 | 
68 |         return final_tokenized_captions_for_image
69 | 


--------------------------------------------------------------------------------
/eval/tvc.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) Microsoft Corporation.
 3 | Licensed under the MIT license.
 4 | 
 5 | reproduce TVC evaluation using pycocoevalcap from Maluuba nlg-eval (Python 3)
 6 | 
 7 | copied/modified from HERO
 8 | (https://github.com/linjieli222/HERO)
 9 | """
10 | import json
11 | 
12 | from .pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
13 | from .pycocoevalcap.bleu.bleu import Bleu
14 | from .pycocoevalcap.cider.cider import Cider
15 | from .pycocoevalcap.meteor.meteor import Meteor
16 | from .pycocoevalcap.rouge.rouge import Rouge
17 | 
18 | 
19 | def _remove_nonascii(text):
20 |     return ''.join([i if ord(i) < 128 else ' ' for i in text])
21 | 
22 | 
23 | class TvcEval(object):
24 |     """ preload evaluation tools and references for repeated evaluation """
25 |     def __init__(self, ref_path):
26 |         self.tokenizer = PTBTokenizer()
27 |         id2refs = {ex['clip_id']: [_remove_nonascii(cap['desc'].strip())
28 |                                    for cap in ex['descs']]
29 |                    for ex in map(json.loads, open(ref_path))}
30 |         self.id2refs = self.tokenizer.tokenize(id2refs)
31 |         self.scorers = []
32 |         self.scorers.append((Bleu(4),
33 |                              ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]))
34 |         self.scorers.append((Meteor(), "METEOR"))
35 |         self.scorers.append((Rouge(), "ROUGE_L"))
36 |         self.scorers.append((Cider(), "CIDEr"))
37 | 
38 |     def __call__(self, json_res):
39 |         """ corpus level metrics, take list of results """
40 |         id2hyps = {
41 |             res['clip_id']: [_remove_nonascii(res['descs'][0]['desc'].strip())]
42 |             for res in json_res
43 |         }
44 |         id2hyps = self.tokenizer.tokenize(id2hyps)
45 |         assert len(id2hyps) == len(self.id2refs)
46 | 
47 |         ret_scores = {}
48 |         for scorer, method in self.scorers:
49 |             print(f"Computing {method} score...")
50 |             score, scores = scorer.compute_score(self.id2refs, id2hyps)
51 |             if isinstance(method, list):
52 |                 for sc, scs, m in zip(score, scores, method):
53 |                     ret_scores[m] = sc * 100
54 |             else:
55 |                 ret_scores[method] = score * 100
56 | 
57 |         return ret_scores
58 | 


--------------------------------------------------------------------------------
/eval/vatex_en_c.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) Microsoft Corporation.
 3 | Licensed under the MIT license.
 4 | 
 5 | VATEX captioning evaluation
 6 | """
 7 | import json
 8 | 
 9 | from .pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
10 | from .pycocoevalcap.bleu.bleu import Bleu
11 | from .pycocoevalcap.cider.cider import Cider
12 | from .pycocoevalcap.meteor.meteor import Meteor
13 | from .pycocoevalcap.rouge.rouge import Rouge
14 | 
15 | 
16 | def _remove_nonascii(text):
17 |     return ''.join([i if ord(i) < 128 else ' ' for i in text])
18 | 
19 | 
20 | class Vatex_en_c_Eval(object):
21 |     """ preload evaluation tools and references for repeated evaluation """
22 |     def __init__(self, ref_path):
23 |         self.tokenizer = PTBTokenizer()
24 |         id2refs = {ex['clip_id']: [_remove_nonascii(cap['desc'].strip())
25 |                                     for cap in ex['descs']]
26 |                    for ex in map(json.loads, open(ref_path))}
27 |         self.id2refs = self.tokenizer.tokenize(id2refs)
28 |         self.scorers = []
29 |         self.scorers.append((Bleu(4),
30 |                              ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]))
31 |         self.scorers.append((Meteor(), "METEOR"))
32 |         self.scorers.append((Rouge(), "ROUGE_L"))
33 |         self.scorers.append((Cider(), "CIDEr"))
34 | 
35 |     def __call__(self, json_res):
36 |         """ corpus level metrics, take list of results """
37 |         id2hyps = {
38 |             res['clip_id']:
39 |                 [_remove_nonascii(res['descs'][0]['desc'].strip())]
40 |             for res in json_res}
41 |         id2hyps = self.tokenizer.tokenize(id2hyps)
42 |         assert len(id2hyps) == len(self.id2refs)
43 | 
44 |         ret_scores = {}
45 |         for scorer, method in self.scorers:
46 |             print(f"Computing {method} score...")
47 |             score, scores = scorer.compute_score(self.id2refs, id2hyps)
48 |             if isinstance(method, list):
49 |                 for sc, scs, m in zip(score, scores, method):
50 |                     ret_scores[m] = sc * 100
51 |             else:
52 |                 ret_scores[method] = score * 100
53 | 
54 |         return ret_scores
55 | 


--------------------------------------------------------------------------------
/inf_yc2c.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) Microsoft Corporation.
 3 | Licensed under the MIT license.
 4 | 
 5 | run evaluation of YC2C or infenrece for submission
 6 | generate prediction from JSON file
 7 | 
 8 | copied/modified from HERO
 9 | (https://github.com/linjieli222/HERO
10 | """
11 | import argparse
12 | import json
13 | 
14 | from horovod import torch as hvd
15 | from transformers import RobertaTokenizer
16 | 
17 | from model.videoCap import VideoCapGenerator
18 | from eval.yc2c import Yc2cEval
19 | from utils.distributed import all_gather_list
20 | from utils.basic_utils import save_jsonl
21 | 
22 | from inf_tvc import load_model
23 | from inf_vatex_en_c import load_inf_data, decode
24 | 
25 | 
26 | def main(opts):
27 |     hvd.init()
28 |     if hvd.rank() == 0:
29 |         toker = RobertaTokenizer.from_pretrained('roberta-base')
30 |         all_gather_list(None)
31 |     else:
32 |         all_gather_list(None)
33 |         toker = RobertaTokenizer.from_pretrained('roberta-base')
34 | 
35 |     bos = toker.convert_tokens_to_ids(['<s>'])[0]
36 |     eos = toker.convert_tokens_to_ids(['</s>'])[0]
37 | 
38 |     model_opts, model = load_model(opts.model_dir, opts.ckpt_step, opts)
39 |     loader = load_inf_data(opts, model_opts, mode="video_sub")
40 |     model.eval()
41 |     generator = VideoCapGenerator(
42 |         model, opts.max_gen_step, bos, eos, not opts.no_fp16)
43 |     results = decode(loader, generator, toker)
44 |     import os
45 |     output_path = os.path.join(opts.model_dir, opts.output)
46 |     save_jsonl(results, output_path)
47 | 
48 |     # evaluate score if possible
49 |     if (hvd.rank() == 0
50 |             and 'descs' in json.loads(next(iter(open(opts.target_clip))))):
51 |         evaluator = Yc2cEval(opts.target_clip)
52 |         score = evaluator(results)
53 |         print(score)
54 | 
55 | 
56 | if __name__ == "__main__":
57 |     parser = argparse.ArgumentParser()
58 |     parser.add_argument("--sub_txt_db",
59 |                         default="/txt/yc2_subtitles.db",
60 |                         type=str,
61 |                         help="The input video subtitle corpus. (LMDB)")
62 |     parser.add_argument("--vfeat_db",
63 |                         default="/video/yc2", type=str,
64 |                         help="The input video frame features.")
65 |     parser.add_argument("--model_dir", required=True, type=str,
66 |                         help="dir root to trained model")
67 |     parser.add_argument("--ckpt_step", required=True, type=int,
68 |                         help="checkpoint step")
69 |     parser.add_argument("--output", type=str, required=True,
70 |                         help="output file name")
71 | 
72 |     parser.add_argument("--batch_size", default=16, type=int,
73 |                         help="validation batch size (per GPU)")
74 |     parser.add_argument("--max_gen_step", default=30, type=int,
75 |                         help="max generation steps")
76 | 
77 |     parser.add_argument('--n_workers', type=int, default=4,
78 |                         help="number of data workers")
79 |     parser.add_argument('--no_pin_mem', action='store_true',
80 |                         help="disable pin memory")
81 |     parser.add_argument("--no_fp16", action='store_true',
82 |                         help="disable fp16")
83 | 
84 |     parser.add_argument("--target_clip", required=True, type=str,
85 |                         help="jsonl annotation")
86 | 
87 |     args = parser.parse_args()
88 | 
89 |     main(args)
90 | 


--------------------------------------------------------------------------------
/launch_container.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation.
 2 | # Licensed under the MIT license.
 3 | # Modified from UNITER
 4 | # (https://github.com/ChenRocks/UNITER)
 5 | 
 6 | TXT_DB=$1
 7 | VID_DIR=$2
 8 | OUTPUT=$3
 9 | PRETRAIN_DIR=$4
10 | 
11 | if [ -z $CUDA_VISIBLE_DEVICES ]; then
12 |     CUDA_VISIBLE_DEVICES='all'
13 | fi
14 | 
15 | if [ "$5" = "--prepro" ]; then
16 |     RO=""
17 | else
18 |     RO=",readonly"
19 | fi
20 | 
21 | docker run --gpus '"'device=$CUDA_VISIBLE_DEVICES'"' --ipc=host --network=host --rm -it \
22 |     --mount src=$(pwd),dst=/src,type=bind \
23 |     --mount src=$OUTPUT,dst=/storage,type=bind \
24 |     --mount src=$PRETRAIN_DIR,dst=/pretrain,type=bind,readonly \
25 |     --mount src=$TXT_DB,dst=/txt,type=bind$RO \
26 |     --mount src=$VID_DIR,dst=/video,type=bind,readonly \
27 |     -e NVIDIA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
28 |     -w /src linjieli222/hero
29 | 


--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VALUE-Leaderboard/StarterCode/fe600a7dd552227a5d0297ab953a52d5ea667c9a/model/__init__.py


--------------------------------------------------------------------------------
/model/vcmr.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) Microsoft Corporation.
 3 | Licensed under the MIT license.
 4 | 
 5 | HERO for Video Corpus Moment Retrieval Tasks, shared by:
 6 | 1. TVR
 7 | 2. How2R
 8 | 
 9 | copied/modified from HERO
10 | (https://github.com/linjieli222/HERO)
11 | """
12 | from .pretrain import HeroForPretraining
13 | import torch
14 | 
15 | 
16 | class HeroForVcmr(HeroForPretraining):
17 |     def __init__(self, config, vfeat_dim, max_frm_seq_len,
18 |                  conv_stride=1, conv_kernel_size=5,
19 |                  ranking_loss_type="hinge", margin=0.1,
20 |                  lw_neg_ctx=0, lw_neg_q=0, lw_st_ed=0.01, drop_svmr_prob=0,
21 |                  use_hard_negative=False, hard_pool_size=20,
22 |                  hard_neg_weight=10, use_all_neg=True):
23 |         super(HeroForVcmr, self).__init__(
24 |             config, vfeat_dim, max_frm_seq_len,
25 |             conv_stride, conv_kernel_size,
26 |             ranking_loss_type, margin,
27 |             lw_neg_ctx, lw_neg_q, lw_st_ed, drop_svmr_prob,
28 |             use_hard_negative, hard_pool_size,
29 |             hard_neg_weight, use_all_neg)
30 |         assert lw_st_ed > 0 or lw_neg_ctx > 0 or lw_neg_q > 0
31 | 
32 |     def forward(self, batch, task='vcmr', compute_loss=True):
33 |         if task == "vcmr":
34 |             return super(HeroForVcmr, self).forward(
35 |                 batch, task='vsm', compute_loss=compute_loss)
36 |         elif task == "vr":
37 |             if compute_loss:
38 |                 _, loss_neg_ctx, loss_neg_q = super(HeroForVcmr, self).forward(
39 |                     batch, task='vsm', compute_loss=True)
40 |                 return torch.zeros_like(loss_neg_ctx), loss_neg_ctx, loss_neg_q
41 |             else:
42 |                 q2video_scores, _, _ = super(HeroForVcmr, self).forward(
43 |                     batch, task='vsm', compute_loss=False)
44 |                 return q2video_scores
45 |         else:
46 |             raise ValueError(f'Unrecognized task {task}')
47 | 
48 |     def get_pred_from_raw_query(self, frame_embeddings, c_attn_masks,
49 |                                 query_input_ids, query_pos_ids,
50 |                                 query_attn_masks, cross=False,
51 |                                 val_gather_gpus=False):
52 |         modularized_query = self.encode_txt_inputs(
53 |                     query_input_ids, query_pos_ids,
54 |                     query_attn_masks, attn_layer=self.q_feat_attn,
55 |                     normalized=False)
56 |         if self.lw_st_ed != 0:
57 |             st_prob, ed_prob = self.get_pred_from_mod_query(
58 |                 frame_embeddings, c_attn_masks,
59 |                 modularized_query, cross=cross)
60 |         else:
61 |             st_prob, ed_prob = None, None
62 | 
63 |         if self.lw_neg_ctx != 0 or self.lw_neg_q != 0:
64 |             q2video_scores = self.get_video_level_scores(
65 |                 modularized_query, frame_embeddings, c_attn_masks,
66 |                 val_gather_gpus)
67 |         else:
68 |             q2video_scores = None
69 |         return q2video_scores, st_prob, ed_prob
70 | 


--------------------------------------------------------------------------------
/model/vr.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) Microsoft Corporation.
 3 | Licensed under the MIT license.
 4 | 
 5 | HERO for Video Retrieval Tasks, shared by:
 6 | 1. VATEX_EN_R
 7 | 2. YC2R
 8 | 
 9 | copied/modified from HERO
10 | (https://github.com/linjieli222/HERO)
11 | """
12 | from .vcmr import HeroForVcmr
13 | 
14 | 
15 | class HeroForVr(HeroForVcmr):
16 |     def __init__(self, config, vfeat_dim, max_frm_seq_len,
17 |                  ranking_loss_type="hinge", margin=0.1,
18 |                  lw_neg_ctx=1, lw_neg_q=1,
19 |                  use_hard_negative=False, hard_pool_size=20,
20 |                  hard_neg_weight=10, use_all_neg=True):
21 |         assert lw_neg_ctx != 0 or lw_neg_q != 0,\
22 |             "Need to set lw_neg_ctx or lw_neg_q for VR training"
23 |         super(HeroForVr, self).__init__(
24 |             config, vfeat_dim, max_frm_seq_len,
25 |             ranking_loss_type=ranking_loss_type, margin=margin,
26 |             lw_neg_ctx=lw_neg_ctx, lw_neg_q=lw_neg_q,
27 |             lw_st_ed=0, drop_svmr_prob=1.0,
28 |             use_hard_negative=use_hard_negative,
29 |             hard_pool_size=hard_pool_size,
30 |             hard_neg_weight=hard_neg_weight,
31 |             use_all_neg=use_all_neg)
32 |         assert self.lw_st_ed == 0, "For VR, lw_st_ed should be 0"
33 | 
34 |     def forward(self, batch, task='msrvtt_video_sub', compute_loss=True):
35 |         if task in [
36 |                 'msrvtt_video_sub', 'msrvtt_video_only',
37 |                 'yc2r_video_sub', 'yc2r_video_only',
38 |                 'vatex_video_only', 'vatex_video_sub']:
39 |             if compute_loss:
40 |                 _, loss_neg_ctx, loss_neg_q = super(HeroForVr, self).forward(
41 |                     batch, task='tvr', compute_loss=True)
42 |                 return loss_neg_ctx, loss_neg_q
43 |             else:
44 |                 q2video_scores, _, _ = super(HeroForVr, self).forward(
45 |                     batch, task='tvr', compute_loss=False)
46 |                 return q2video_scores
47 |         else:
48 |             raise ValueError(f'Unrecognized task {task}')
49 | 
50 |     def get_pred_from_raw_query(self, frame_embeddings, c_attn_masks,
51 |                                 query_input_ids, query_pos_ids,
52 |                                 query_attn_masks, cross=False,
53 |                                 val_gather_gpus=False):
54 |         modularized_query = self.encode_txt_inputs(
55 |                     query_input_ids, query_pos_ids,
56 |                     query_attn_masks, attn_layer=self.q_feat_attn,
57 |                     normalized=False)
58 | 
59 |         q2video_scores = self.get_video_level_scores(
60 |             modularized_query, frame_embeddings, c_attn_masks,
61 |             val_gather_gpus)
62 |         return q2video_scores
63 | 


--------------------------------------------------------------------------------
/optim/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) Microsoft Corporation.
 3 | Licensed under the MIT license.
 4 | 
 5 | Copied from UNITER
 6 | (https://github.com/ChenRocks/UNITER)
 7 | """
 8 | from .sched import noam_schedule, warmup_linear, vqa_schedule, get_lr_sched
 9 | from .adamw import AdamW
10 | 


--------------------------------------------------------------------------------
/optim/adamw.py:
--------------------------------------------------------------------------------
  1 | """
  2 | AdamW optimizer (weight decay fix)
  3 | originally from hugginface (https://github.com/huggingface/transformers).
  4 | 
  5 | Copied from UNITER
  6 | (https://github.com/ChenRocks/UNITER)
  7 | """
  8 | import math
  9 | 
 10 | import torch
 11 | from torch.optim import Optimizer
 12 | 
 13 | 
 14 | class AdamW(Optimizer):
 15 |     """ Implements Adam algorithm with weight decay fix.
 16 |     Parameters:
 17 |         lr (float): learning rate. Default 1e-3.
 18 |         betas (tuple of 2 floats): Adams beta parameters (b1, b2).
 19 |             Default: (0.9, 0.999)
 20 |         eps (float): Adams epsilon. Default: 1e-6
 21 |         weight_decay (float): Weight decay. Default: 0.0
 22 |         correct_bias (bool): can be set to False to avoid correcting bias
 23 |             in Adam (e.g. like in Bert TF repository). Default True.
 24 |     """
 25 |     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6,
 26 |                  weight_decay=0.0, correct_bias=True):
 27 |         if lr < 0.0:
 28 |             raise ValueError(
 29 |                 "Invalid learning rate: {} - should be >= 0.0".format(lr))
 30 |         if not 0.0 <= betas[0] < 1.0:
 31 |             raise ValueError("Invalid beta parameter: {} - "
 32 |                              "should be in [0.0, 1.0[".format(betas[0]))
 33 |         if not 0.0 <= betas[1] < 1.0:
 34 |             raise ValueError("Invalid beta parameter: {} - "
 35 |                              "should be in [0.0, 1.0[".format(betas[1]))
 36 |         if not 0.0 <= eps:
 37 |             raise ValueError("Invalid epsilon value: {} - "
 38 |                              "should be >= 0.0".format(eps))
 39 |         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
 40 |                         correct_bias=correct_bias)
 41 |         super(AdamW, self).__init__(params, defaults)
 42 | 
 43 |     def step(self, closure=None):
 44 |         """Performs a single optimization step.
 45 |         Arguments:
 46 |             closure (callable, optional): A closure that reevaluates the model
 47 |                 and returns the loss.
 48 |         """
 49 |         loss = None
 50 |         if closure is not None:
 51 |             loss = closure()
 52 | 
 53 |         for group in self.param_groups:
 54 |             for p in group['params']:
 55 |                 if p.grad is None:
 56 |                     continue
 57 |                 grad = p.grad.data
 58 |                 if grad.is_sparse:
 59 |                     raise RuntimeError(
 60 |                         'Adam does not support sparse '
 61 |                         'gradients, please consider SparseAdam instead')
 62 | 
 63 |                 state = self.state[p]
 64 | 
 65 |                 # State initialization
 66 |                 if len(state) == 0:
 67 |                     state['step'] = 0
 68 |                     # Exponential moving average of gradient values
 69 |                     state['exp_avg'] = torch.zeros_like(p.data)
 70 |                     # Exponential moving average of squared gradient values
 71 |                     state['exp_avg_sq'] = torch.zeros_like(p.data)
 72 | 
 73 |                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
 74 |                 beta1, beta2 = group['betas']
 75 | 
 76 |                 state['step'] += 1
 77 | 
 78 |                 # Decay the first and second moment running average coefficient
 79 |                 # In-place operations to update the averages at the same time
 80 |                 exp_avg.mul_(beta1).add_(1.0 - beta1, grad)
 81 |                 exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad)
 82 |                 denom = exp_avg_sq.sqrt().add_(group['eps'])
 83 | 
 84 |                 step_size = group['lr']
 85 |                 if group['correct_bias']:  # No bias correction for Bert
 86 |                     bias_correction1 = 1.0 - beta1 ** state['step']
 87 |                     bias_correction2 = 1.0 - beta2 ** state['step']
 88 |                     step_size = (step_size * math.sqrt(bias_correction2)
 89 |                                  / bias_correction1)
 90 | 
 91 |                 p.data.addcdiv_(-step_size, exp_avg, denom)
 92 | 
 93 |                 # Just adding the square of the weights to the loss function is
 94 |                 # *not* the correct way of using L2 regularization/weight decay
 95 |                 # with Adam, since that will interact with the m and v
 96 |                 # parameters in strange ways.
 97 |                 #
 98 |                 # Instead we want to decay the weights in a manner that doesn't
 99 |                 # interact with the m/v parameters. This is equivalent to
100 |                 # adding the square of the weights to the loss with plain
101 |                 # (non-momentum) SGD.
102 |                 # Add weight decay at the end (fixed version)
103 |                 if group['weight_decay'] > 0.0:
104 |                     p.data.add_(-group['lr'] * group['weight_decay'], p.data)
105 | 
106 |         return loss
107 | 


--------------------------------------------------------------------------------
/optim/misc.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) Microsoft Corporation.
 3 | Licensed under the MIT license.
 4 | 
 5 | Copied from UNITER
 6 | (https://github.com/ChenRocks/UNITER)
 7 | 
 8 | Misc lr helper
 9 | """
10 | from torch.optim import Adam, Adamax
11 | from .adamw import AdamW
12 | 
13 | 
14 | def build_optimizer(model, opts):
15 |     # Prepare optimizer
16 |     param_optimizer = [(n, p) for n, p in model.named_parameters()
17 |                        if 'v_encoder' in n and p.requires_grad]
18 |     # top layer has larger learning rate
19 |     param_top = [(n, p) for n, p in model.named_parameters()
20 |                  if 'v_encoder' not in n and p.requires_grad]
21 |     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
22 |     optimizer_grouped_parameters = [
23 |         {'params': [p for n, p in param_top
24 |                     if not any(nd in n for nd in no_decay)],
25 |             'lr': opts.lr_mul*opts.learning_rate,
26 |             'weight_decay': opts.weight_decay},
27 |         {'params': [p for n, p in param_top
28 |                     if any(nd in n for nd in no_decay)],
29 |             'lr': opts.lr_mul*opts.learning_rate,
30 |             'weight_decay': 0.0},
31 |         {'params': [p for n, p in param_optimizer
32 |                     if not any(nd in n for nd in no_decay)],
33 |             'weight_decay': opts.weight_decay},
34 |         {'params': [p for n, p in param_optimizer
35 |                     if any(nd in n for nd in no_decay)],
36 |             'weight_decay': 0.0}
37 |     ]
38 | 
39 |     # currently Adam only
40 |     if opts.optim == 'adam':
41 |         OptimCls = Adam
42 |     elif opts.optim == 'adamax':
43 |         OptimCls = Adamax
44 |     elif opts.optim == 'adamw':
45 |         OptimCls = AdamW
46 |     else:
47 |         raise ValueError('invalid optimizer')
48 |     optimizer = OptimCls(optimizer_grouped_parameters,
49 |                          lr=opts.learning_rate, betas=opts.betas)
50 |     return optimizer
51 | 


--------------------------------------------------------------------------------
/optim/sched.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) Microsoft Corporation.
 3 | Licensed under the MIT license.
 4 | 
 5 | Copied from UNITER
 6 | (https://github.com/ChenRocks/UNITER)
 7 | 
 8 | optimizer learning rate scheduling helpers
 9 | """
10 | from math import ceil
11 | 
12 | 
13 | def noam_schedule(step, warmup_step=4000):
14 |     """ original Transformer schedule"""
15 |     if step <= warmup_step:
16 |         return step / warmup_step
17 |     return (warmup_step ** 0.5) * (step ** -0.5)
18 | 
19 | 
20 | def warmup_linear(step, warmup_step, tot_step):
21 |     """ BERT schedule """
22 |     if step < warmup_step:
23 |         return step / warmup_step
24 |     return max(0, (tot_step-step)/(tot_step-warmup_step))
25 | 
26 | 
27 | def vqa_schedule(step, warmup_interval, decay_interval,
28 |                  decay_start, decay_rate):
29 |     """ VQA schedule from MCAN """
30 |     if step < warmup_interval:
31 |         return 1/4
32 |     elif step < 2 * warmup_interval:
33 |         return 2/4
34 |     elif step < 3 * warmup_interval:
35 |         return 3/4
36 |     elif step >= decay_start:
37 |         num_decay = ceil((step - decay_start) / decay_interval)
38 |         return decay_rate ** num_decay
39 |     else:
40 |         return 1
41 | 
42 | 
43 | def get_lr_sched(global_step, opts):
44 |     # learning rate scheduling
45 |     lr_this_step = opts.learning_rate * warmup_linear(
46 |         global_step, opts.warmup_steps, opts.num_train_steps)
47 |     if lr_this_step <= 0:
48 |         lr_this_step = 1e-8
49 |     return lr_this_step


--------------------------------------------------------------------------------
/scripts/collect_video_feature_paths.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Copyright (c) Microsoft Corporation.
  3 | Licensed under the MIT license.
  4 | 
  5 | gather feature paths
  6 | 
  7 | copied/modified from HERO
  8 | (https://github.com/linjieli222/HERO)
  9 | """
 10 | import os
 11 | import numpy as np
 12 | import pickle as pkl
 13 | import argparse
 14 | from tqdm import tqdm
 15 | from cytoolz import curry
 16 | import multiprocessing as mp
 17 | # released feature .tar filename: 'resnet' 'slowfast' 'mil-nce-s3d' 'clip-vit'
 18 | FEAT_DIR = {
 19 |     "resnet": "resnet",
 20 |     "slowfast": "slowfast",
 21 |     "mil-nce": "mil-nce-s3d",
 22 |     "clip-vit": "clip-vit"}
 23 | 
 24 | 
 25 | @curry
 26 | def load_npz(dir_3d, dir_2d, f_3d):
 27 |     vid = f_3d.split("/")[-1].split(".npz")[0]
 28 |     folder_name = f_3d.split("/")[-2]
 29 |     f_2d = f_3d.replace(dir_3d, dir_2d)
 30 |     try:
 31 |         feature_3d = np.load(f_3d, allow_pickle=True)
 32 |         feat_len_3d = max(0, len(feature_3d["features"]))
 33 |     except Exception:
 34 |         feat_len_3d = 0
 35 |     feat_len_2d = 0
 36 |     if feat_len_3d == 0:
 37 |         f_3d = ""
 38 |         print(f"Corrupted {dir_3d.split('/')[-1]} feature for {vid}")
 39 |     # print(f_2d)
 40 |     if not os.path.exists(f_2d):
 41 |         f_2d = ""
 42 |         print(f"{dir_2d.split('/')[-1]} files for {vid} does not exists")
 43 |     else:
 44 |         try:
 45 |             feature_2d = np.load(f_2d, allow_pickle=True)
 46 |             feat_len_2d = len(feature_2d["features"])
 47 |         except Exception:
 48 |             feat_len_2d = 0
 49 |             f_2d = ""
 50 |             print(f"Corrupted {dir_2d.split('/')[-1]} files for {vid}")
 51 |     frame_len = min(feat_len_3d, feat_len_2d)
 52 |     return vid, frame_len, f_3d, f_2d, folder_name
 53 | 
 54 | 
 55 | def main(opts):
 56 |     name_2d, name_3d = opts.feat_version.split("_")
 57 |     dir_3d = os.path.join(opts.feature_dir, FEAT_DIR[name_3d])
 58 |     dir_2d = os.path.join(opts.feature_dir, FEAT_DIR[name_3d])
 59 |     failed_2d_files = []
 60 |     failed_3d_files = []
 61 |     loaded_file = []
 62 |     for root, dirs, curr_files in os.walk(f'{dir_3d}/'):
 63 |         for f in curr_files:
 64 |             if f.endswith('.npz'):
 65 |                 f_3d = os.path.join(root, f)
 66 |                 loaded_file.append(f_3d)
 67 |     print(f"Found {len(loaded_file)} {name_3d} files....")
 68 |     print(f"sample loaded_file: {loaded_file[:3]}")
 69 |     failed_2d_files, failed_3d_files = [], []
 70 |     files = {}
 71 |     load = load_npz(dir_3d, dir_2d)
 72 |     with mp.Pool(opts.nproc) as pool, tqdm(total=len(loaded_file)) as pbar:
 73 |         for i, (vid, frame_len, f_3d,
 74 |                 f_2d, folder_name) in enumerate(
 75 |                 pool.imap_unordered(load, loaded_file, chunksize=128)):
 76 |             files[vid] = (frame_len, f_3d, f_2d, folder_name)
 77 |             if f_2d == "":
 78 |                 video_file = os.path.join(folder_name, vid)
 79 |                 failed_2d_files.append(video_file)
 80 |             if f_3d == "":
 81 |                 video_file = os.path.join(folder_name, vid)
 82 |                 failed_3d_files.append(video_file)
 83 |             pbar.update(1)
 84 |     output_dir = os.path.join(opts.output, opts.dataset)
 85 |     if not os.path.exists(output_dir):
 86 |         os.makedirs(output_dir, exist_ok=True)
 87 |     pkl.dump(files, open(os.path.join(
 88 |         output_dir, f"{opts.feat_version}_info.pkl"), "wb"))
 89 |     if len(failed_3d_files):
 90 |         pkl.dump(failed_3d_files, open(os.path.join(
 91 |             output_dir, f"failed_{name_3d}_files.pkl"), "wb"))
 92 |     if len(failed_2d_files):
 93 |         pkl.dump(failed_2d_files, open(os.path.join(
 94 |             output_dir, f"failed_{name_2d}_files.pkl"), "wb"))
 95 | 
 96 | 
 97 | if __name__ == '__main__':
 98 |     parser = argparse.ArgumentParser()
 99 |     parser.add_argument("--feature_dir",
100 |                         default="",
101 |                         type=str, help="The input video feature dir.")
102 |     parser.add_argument("--output", default=None, type=str,
103 |                         help="output dir")
104 |     parser.add_argument('--dataset', type=str,
105 |                         default="")
106 |     parser.add_argument('--feat_version', type=str,
107 |                         choices=[
108 |                             "resnet_slowfast", "resnet_mil-nce",
109 |                             "clip-vit_slowfast", "clip-vit_mil-nce"],
110 |                         default="resnet_slowfast")
111 |     parser.add_argument('--nproc', type=int, default=10,
112 |                         help='number of cores used')
113 |     args = parser.parse_args()
114 |     main(args)
115 | 


--------------------------------------------------------------------------------
/scripts/create_txtdb.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation.
 2 | # Licensed under the MIT license.
 3 | 
 4 | 
 5 | TXT_DB=$1
 6 | ANN_DIR=$2
 7 | VIDEO_DB=$3
 8 | 
 9 | set -e
10 | 
11 | # annotations
12 | DataBLOB='https://datarelease.blob.core.windows.net/value-leaderboard/tv_tasks'
13 | TVR='https://raw.githubusercontent.com/jayleicn/TVRetrieval/master/data/'
14 | 
15 | if [ ! -d $TXT_DB ]; then
16 |     mkdir -p $TXT_DB
17 | fi
18 | if [ ! -d $ANN_DIR ]; then
19 |     mkdir -p $ANN_DIR
20 | fi
21 | 
22 | 
23 | for SPLIT in 'train' 'val'; do
24 |     if [ ! -f $ANN_DIR/tvr_${SPLIT}_release.jsonl ]; then
25 |         echo "downloading ${SPLIT} annotations..."
26 |         wget $TVR/tvr_${SPLIT}_release.jsonl -O $ANN_DIR/tvr_${SPLIT}_release.jsonl
27 |     fi
28 | done
29 | if [ ! -f $ANN_DIR/tvr_test_release.jsonl ]; then
30 |     echo "downloading test annotations..."
31 |     wget $DataBLOB/tvr_test_release.jsonl -O $ANN_DIR/tvr_test_release.jsonl
32 | fi
33 | 
34 | for SPLIT in 'train' 'val' 'test'; do
35 |     if [ ! -d $TXT_DB/tvr_${SPLIT}.db ]; then
36 |         echo "preprocessing tvr ${SPLIT} annotations..."
37 |         docker run --ipc=host --rm -it \
38 |             --mount src=$(pwd),dst=/src,type=bind \
39 |             --mount src=$TXT_DB,dst=/txt_db,type=bind \
40 |             --mount src=$ANN_DIR,dst=/ann,type=bind,readonly \
41 |             -w /src linjieli222/hero \
42 |             python scripts/prepro_query.py --annotation /ann/tvr_${SPLIT}_release.jsonl \
43 |                             --output /txt_db/tvr_${SPLIT}.db \
44 |                             --task tvr
45 |     fi
46 | done
47 | 
48 | 
49 | 
50 | if [ ! -d $VIDEO_DB ]; then
51 |     echo "Make sure you have constructed/downloaded the video dbs before processing the subtitles..."
52 | else
53 |     if [ ! -f $ANN_DIR/tv_subtitles.jsonl ]; then
54 |         echo "downloading raw subtitle and additional annotations..."
55 |         wget  $DataBLOB/tvr_video2dur_idx.json -O $ANN_DIR/vid2dur_idx.json
56 | 
57 |         wget $TVR/tvqa_preprocessed_subtitles.jsonl -O $ANN_DIR/tv_subtitles.jsonl
58 |     fi
59 | 
60 |     if [ ! -d $TXT_DB/tv_subtitles.db ]; then
61 |         echo "preprocessing tv subtitles..."
62 |         docker run --ipc=host --rm -it \
63 |             --mount src=$(pwd),dst=/src,type=bind \
64 |             --mount src=$TXT_DB,dst=/txt_db,type=bind \
65 |             --mount src=$ANN_DIR,dst=/ann,type=bind,readonly \
66 |             --mount src=$VIDEO_DB,dst=/video_db,type=bind,readonly \
67 |             -w /src linjieli222/hero \
68 |             /bin/bash -c "python scripts/prepro_sub.py --annotation /ann/tv_subtitles.jsonl --output /txt_db/tv_subtitles.db --vid2nframe /video_db/tv/id2nframe_1.5.json --frame_length 1.5; cp /ann/vid2dur_idx.json /txt_db/tv_subtitles.db/"
69 |         echo "done"
70 |     fi
71 | fi


--------------------------------------------------------------------------------
/scripts/download_all.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation.
 2 | # Licensed under the MIT license.
 3 | 
 4 | DOWNLOAD=$1
 5 | 
 6 | # checkpoint
 7 | bash ./scripts/download_pretrained.sh $DOWNLOAD
 8 | 
 9 | # data
10 | bash ./scripts/download_tvr.sh $DOWNLOAD
11 | bash ./scripts/download_tvqa.sh $DOWNLOAD
12 | bash ./scripts/download_tvc.sh $DOWNLOAD
13 | bash ./scripts/download_how2.sh $DOWNLOAD
14 | bash ./scripts/download_violin.sh $DOWNLOAD
15 | bash ./scripts/download_vlep.sh $DOWNLOAD
16 | bash ./scripts/download_yc2.sh $DOWNLOAD
17 | bash ./scripts/download_vatex.sh $DOWNLOAD


--------------------------------------------------------------------------------
/scripts/download_how2.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation.
 2 | # Licensed under the MIT license.
 3 | 
 4 | DOWNLOAD=$1
 5 | 
 6 | for FOLDER in 'video_db' 'txt_db' 'pretrained' 'finetune'; do
 7 |     if [ ! -d $DOWNLOAD/$FOLDER ] ; then
 8 |         mkdir -p $DOWNLOAD/$FOLDER
 9 |     fi
10 | done
11 | 
12 | BLOB='https://datarelease.blob.core.windows.net/value-leaderboard/starter_code_data'
13 | 
14 | # Use azcopy for video db downloading
15 | if [ -f ~/azcopy/azcopy ]; then
16 |     echo "azcopy exists, skip downloading"
17 | else 
18 |     echo "azcopy does not exist, start downloading"
19 |     wget -P ~/azcopy/ https://convaisharables.blob.core.windows.net/azcopy/azcopy
20 | fi
21 | chmod +x ~/azcopy/azcopy
22 | 
23 | # video dbs
24 | if [ ! -d $DOWNLOAD/video_db/how2/ ] ; then
25 |     ~/azcopy/azcopy cp $BLOB/video_db/how2.tar $DOWNLOAD/video_db/how2.tar
26 |     tar -xvf $DOWNLOAD/video_db/how2.tar -C $DOWNLOAD/video_db 
27 |     rm $DOWNLOAD/video_db/how2.tar
28 | fi
29 | 
30 | # text dbs
31 | if [ ! -d $DOWNLOAD/txt_db/how2_subtitles.db/ ] ; then
32 |     wget $BLOB/txt_db/how2_subtitles.db.tar -P $DOWNLOAD/txt_db/
33 |     tar -xvf $DOWNLOAD/txt_db/how2_subtitles.db.tar -C $DOWNLOAD/txt_db
34 |     rm $DOWNLOAD/txt_db/how2_subtitles.db.tar
35 | fi
36 | # how2r
37 | for SPLIT in 'train' 'val_1k' 'test_public_1k' ; do
38 |     if [ ! -d $DOWNLOAD/txt_db/how2r_$SPLIT.db/ ] ; then
39 |         wget $BLOB/txt_db/how2r_$SPLIT.db.tar -P $DOWNLOAD/txt_db/
40 |         tar -xvf $DOWNLOAD/txt_db/how2r_$SPLIT.db.tar -C $DOWNLOAD/txt_db
41 |         rm $DOWNLOAD/txt_db/how2r_$SPLIT.db.tar
42 |     fi
43 | done
44 | # how2qa
45 | for SPLIT in 'train' 'val' 'test_public' ; do
46 |     if [ ! -d $DOWNLOAD/txt_db/how2qa_$SPLIT.db/ ] ; then
47 |         wget $BLOB/txt_db/how2qa_$SPLIT.db.tar -P $DOWNLOAD/txt_db/
48 |         tar -xvf $DOWNLOAD/txt_db/how2qa_$SPLIT.db.tar -C $DOWNLOAD/txt_db
49 |         rm $DOWNLOAD/txt_db/how2qa_$SPLIT.db.tar
50 |     fi
51 | done
52 | 
53 | HEROBLOB='https://convaisharables.blob.core.windows.net/hero'
54 | # pretrained
55 | if [ ! -f $DOWNLOAD/pretrained/hero-tv-ht100.pt ] ; then
56 |     wget $HEROBLOB/pretrained/hero-tv-ht100.pt -P $DOWNLOAD/pretrained/
57 | fi
58 | 


--------------------------------------------------------------------------------
/scripts/download_pretrained.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation.
 2 | # Licensed under the MIT license.
 3 | 
 4 | 
 5 | DOWNLOAD=$1
 6 | 
 7 | if [ ! -d $DOWNLOAD/pretrained ] ; then
 8 |     mkdir -p $DOWNLOAD/pretrained
 9 | fi
10 | 
11 | HEROBLOB='https://convaisharables.blob.core.windows.net/hero'
12 | 
13 | # This will overwrite models
14 | wget $HEROBLOB/pretrained/hero-tv-ht100.pt -O $DOWNLOAD/pretrained/hero-tv-ht100.pt
15 | 
16 | # converted RoBERTa
17 | if [ ! -f $DOWNLOAD/pretrained/pretrain-tv-init.bin ] ; then
18 |     wget $HEROBLOB/pretrained/pretrain-tv-init.bin -O $DOWNLOAD/pretrained/pretrain-tv-init.bin
19 | fi


--------------------------------------------------------------------------------
/scripts/download_tvc.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation.
 2 | # Licensed under the MIT license.
 3 | 
 4 | 
 5 | DOWNLOAD=$1
 6 | 
 7 | for FOLDER in 'video_db' 'txt_db' 'pretrained' 'finetune'; do
 8 |     if [ ! -d $DOWNLOAD/$FOLDER ] ; then
 9 |         mkdir -p $DOWNLOAD/$FOLDER
10 |     fi
11 | done
12 | 
13 | BLOB='https://datarelease.blob.core.windows.net/value-leaderboard/starter_code_data'
14 | 
15 | # Use azcopy for video db downloading
16 | if [ -f ~/azcopy/azcopy ]; then
17 |     echo "azcopy exists, skip downloading"
18 | else 
19 |     echo "azcopy does not exist, start downloading"
20 |     wget -P ~/azcopy/ https://convaisharables.blob.core.windows.net/azcopy/azcopy
21 | fi
22 | chmod +x ~/azcopy/azcopy
23 | 
24 | # video dbs
25 | if [ ! -d $DOWNLOAD/video_db/tv/ ] ; then
26 |     ~/azcopy/azcopy cp $BLOB/video_db/tv.tar $DOWNLOAD/video_db/tv.tar
27 |     tar -xvf $DOWNLOAD/video_db/tv.tar -C $DOWNLOAD/video_db 
28 |     rm $DOWNLOAD/video_db/tv.tar
29 | fi
30 | 
31 | # text dbs
32 | for SPLIT in 'train' 'val' ; do
33 |     if [ ! -d $DOWNLOAD/txt_db/tvc_$SPLIT.db/ ] ; then
34 |         wget $BLOB/txt_db/tvc_$SPLIT.db.tar -P $DOWNLOAD/txt_db/
35 |         tar -xvf $DOWNLOAD/txt_db/tvc_$SPLIT.db.tar -C $DOWNLOAD/txt_db
36 |         rm $DOWNLOAD/txt_db/tvc_$SPLIT.db.tar
37 |     fi
38 | done
39 | if [ ! -d $DOWNLOAD/txt_db/tv_subtitles.db/ ] ; then
40 |     wget $BLOB/txt_db/tv_subtitles.db.tar -P $DOWNLOAD/txt_db/
41 |     tar -xvf $DOWNLOAD/txt_db/tv_subtitles.db.tar -C $DOWNLOAD/txt_db
42 |     rm $DOWNLOAD/txt_db/tv_subtitles.db.tar
43 | fi
44 | 
45 | HEROBLOB='https://convaisharables.blob.core.windows.net/hero'
46 | # pretrained
47 | if [ ! -f $DOWNLOAD/pretrained/hero-tv-ht100.pt ] ; then
48 |     wget $HEROBLOB/pretrained/hero-tv-ht100.pt -P $DOWNLOAD/pretrained/
49 | fi
50 | 
51 | # raw_data (for evaluation and inference)
52 | TVBLOB='https://datarelease.blob.core.windows.net/value-leaderboard/tv_tasks'
53 | TVC='https://raw.githubusercontent.com/jayleicn/TVCaption/master/data/'
54 | 
55 | wget -nc $TVC/tvc_val_release.jsonl -P $DOWNLOAD/txt_db
56 | wget -nc $TVBLOB/tvc_test_release.jsonl -P $DOWNLOAD/txt_db
57 | 


--------------------------------------------------------------------------------
/scripts/download_tvqa.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation.
 2 | # Licensed under the MIT license.
 3 | 
 4 | 
 5 | DOWNLOAD=$1
 6 | 
 7 | for FOLDER in 'video_db' 'txt_db' 'pretrained' 'finetune'; do
 8 |     if [ ! -d $DOWNLOAD/$FOLDER ] ; then
 9 |         mkdir -p $DOWNLOAD/$FOLDER
10 |     fi
11 | done
12 | 
13 | BLOB='https://datarelease.blob.core.windows.net/value-leaderboard/starter_code_data'
14 | 
15 | # Use azcopy for video db downloading
16 | if [ -f ~/azcopy/azcopy ]; then
17 |     echo "azcopy exists, skip downloading"
18 | else 
19 |     echo "azcopy does not exist, start downloading"
20 |     wget -P ~/azcopy/ https://convaisharables.blob.core.windows.net/azcopy/azcopy
21 | fi
22 | chmod +x ~/azcopy/azcopy
23 | 
24 | # video dbs
25 | if [ ! -d $DOWNLOAD/video_db/tv/ ] ; then
26 |     ~/azcopy/azcopy cp $BLOB/video_db/tv.tar $DOWNLOAD/video_db/tv.tar
27 |     tar -xvf $DOWNLOAD/video_db/tv.tar -C $DOWNLOAD/video_db 
28 |     rm $DOWNLOAD/video_db/tv.tar
29 | fi
30 | 
31 | # text dbs
32 | for SPLIT in 'train' 'val' 'test'; do
33 |     if [ ! -d $DOWNLOAD/txt_db/tvqa_$SPLIT.db/ ] ; then
34 |         wget $BLOB/txt_db/tvqa_$SPLIT.db.tar -P $DOWNLOAD/txt_db/
35 |         tar -xvf $DOWNLOAD/txt_db/tvqa_$SPLIT.db.tar -C $DOWNLOAD/txt_db
36 |         rm $DOWNLOAD/txt_db/tvqa_$SPLIT.db.tar
37 |     fi
38 | done
39 | if [ ! -d $DOWNLOAD/txt_db/tv_subtitles.db/ ] ; then
40 |     wget $BLOB/txt_db/tv_subtitles.db.tar -P $DOWNLOAD/txt_db/
41 |     tar -xvf $DOWNLOAD/txt_db/tv_subtitles.db.tar -C $DOWNLOAD/txt_db
42 |     rm $DOWNLOAD/txt_db/tv_subtitles.db.tar
43 | fi
44 | 
45 | HEROBLOB='https://convaisharables.blob.core.windows.net/hero'
46 | # pretrained
47 | if [ ! -f $DOWNLOAD/pretrained/hero-tv-ht100.pt ] ; then
48 |     wget $HEROBLOB/pretrained/hero-tv-ht100.pt -P $DOWNLOAD/pretrained/
49 | fi
50 | 


--------------------------------------------------------------------------------
/scripts/download_tvr.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation.
 2 | # Licensed under the MIT license.
 3 | 
 4 | 
 5 | DOWNLOAD=$1
 6 | 
 7 | for FOLDER in 'video_db' 'txt_db' 'pretrained' 'finetune'; do
 8 |     if [ ! -d $DOWNLOAD/$FOLDER ] ; then
 9 |         mkdir -p $DOWNLOAD/$FOLDER
10 |     fi
11 | done
12 | 
13 | BLOB='https://datarelease.blob.core.windows.net/value-leaderboard/starter_code_data'
14 | 
15 | # Use azcopy for video db downloading
16 | if [ -f ~/azcopy/azcopy ]; then
17 |     echo "azcopy exists, skip downloading"
18 | else 
19 |     echo "azcopy does not exist, start downloading"
20 |     wget -P ~/azcopy/ https://convaisharables.blob.core.windows.net/azcopy/azcopy
21 | fi
22 | chmod +x ~/azcopy/azcopy
23 | 
24 | # video dbs
25 | if [ ! -d $DOWNLOAD/video_db/tv/ ] ; then
26 |     ~/azcopy/azcopy cp $BLOB/video_db/tv.tar $DOWNLOAD/video_db/tv.tar
27 |     tar -xvf $DOWNLOAD/video_db/tv.tar -C $DOWNLOAD/video_db 
28 |     rm $DOWNLOAD/video_db/tv.tar
29 | fi
30 | 
31 | # text dbs
32 | for SPLIT in 'train' 'val' 'test'; do
33 |     if [ ! -d $DOWNLOAD/txt_db/tvr_$SPLIT.db/ ] ; then
34 |         wget $BLOB/txt_db/tvr_$SPLIT.db.tar -P $DOWNLOAD/txt_db/
35 |         tar -xvf $DOWNLOAD/txt_db/tvr_$SPLIT.db.tar -C $DOWNLOAD/txt_db
36 |         rm $DOWNLOAD/txt_db/tvr_$SPLIT.db.tar
37 |     fi
38 | done
39 | if [ ! -d $DOWNLOAD/txt_db/tv_subtitles.db/ ] ; then
40 |     wget $BLOB/txt_db/tv_subtitles.db.tar -P $DOWNLOAD/txt_db/
41 |     tar -xvf $DOWNLOAD/txt_db/tv_subtitles.db.tar -C $DOWNLOAD/txt_db
42 |     rm $DOWNLOAD/txt_db/tv_subtitles.db.tar
43 | fi
44 | 
45 | HEROBLOB='https://convaisharables.blob.core.windows.net/hero'
46 | # pretrained
47 | if [ ! -f $DOWNLOAD/pretrained/hero-tv-ht100.pt ] ; then
48 |     wget $HEROBLOB/pretrained/hero-tv-ht100.pt -P $DOWNLOAD/pretrained/
49 | fi
50 | 


--------------------------------------------------------------------------------
/scripts/download_vatex_en.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation.
 2 | # Licensed under the MIT license.
 3 | 
 4 | 
 5 | DOWNLOAD=$1
 6 | 
 7 | for FOLDER in 'video_db' 'txt_db' 'pretrained' 'finetune'; do
 8 |     if [ ! -d $DOWNLOAD/$FOLDER ] ; then
 9 |         mkdir -p $DOWNLOAD/$FOLDER
10 |     fi
11 | done
12 | 
13 | BLOB='https://datarelease.blob.core.windows.net/value-leaderboard/starter_code_data'
14 | 
15 | # Use azcopy for video db downloading
16 | if [ -f ~/azcopy/azcopy ]; then
17 |     echo "azcopy exists, skip downloading"
18 | else 
19 |     echo "azcopy does not exist, start downloading"
20 |     wget -P ~/azcopy/ https://convaisharables.blob.core.windows.net/azcopy/azcopy
21 | fi
22 | chmod +x ~/azcopy/azcopy
23 | 
24 | # video dbs
25 | if [ ! -d $DOWNLOAD/video_db/vatex/ ] ; then
26 |     ~/azcopy/azcopy cp $BLOB/video_db/vatex.tar $DOWNLOAD/video_db/vatex.tar
27 |     tar -xvf $DOWNLOAD/video_db/vatex.tar -C $DOWNLOAD/video_db
28 |     rm $DOWNLOAD/video_db/vatex.tar
29 | fi
30 | 
31 | # text dbs
32 | if [ ! -d $DOWNLOAD/txt_db/vatex_subtitles.db/ ] ; then
33 |     wget $BLOB/txt_db/vatex_subtitles.db.tar -P $DOWNLOAD/txt_db/
34 |     tar -xvf $DOWNLOAD/txt_db/vatex_subtitles.db.tar -C $DOWNLOAD/txt_db
35 |     rm $DOWNLOAD/txt_db/vatex_subtitles.db.tar
36 | fi
37 | # vatex_en_r
38 | for SPLIT in 'train' 'val' 'test_public' ; do
39 |     if [ ! -d $DOWNLOAD/txt_db/vatex_en_r_$SPLIT.db/ ] ; then
40 |         wget $BLOB/txt_db/vatex_en_r_$SPLIT.db.tar -P $DOWNLOAD/txt_db/
41 |         tar -xvf $DOWNLOAD/txt_db/vatex_en_r_$SPLIT.db.tar -C $DOWNLOAD/txt_db
42 |         rm $DOWNLOAD/txt_db/vatex_en_r_$SPLIT.db.tar
43 |     fi
44 | done
45 | 
46 | HEROBLOB='https://convaisharables.blob.core.windows.net/hero'
47 | # pretrained
48 | if [ ! -f $DOWNLOAD/pretrained/hero-tv-ht100.pt ] ; then
49 |     wget $HEROBLOB/pretrained/hero-tv-ht100.pt -P $DOWNLOAD/pretrained/
50 | fi
51 | 
52 | VATEXCBLOB='https://datarelease.blob.core.windows.net/value-leaderboard/vatex_en_c'
53 | # vatex_en_c raw data (evaluation and inference)
54 | for SPLIT in 'test_public' 'test_private'; do
55 |     wget -nc $VATEXCBLOB/vatex_en_c_${SPLIT}_release.jsonl -O $DOWNLOAD/txt_db/vatex_en_c_${SPLIT}_release.jsonl
56 | done
57 | 


--------------------------------------------------------------------------------
/scripts/download_violin.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation.
 2 | # Licensed under the MIT license.
 3 | 
 4 | 
 5 | DOWNLOAD=$1
 6 | 
 7 | for FOLDER in 'video_db' 'txt_db' 'pretrained' 'finetune'; do
 8 |     if [ ! -d $DOWNLOAD/$FOLDER ] ; then
 9 |         mkdir -p $DOWNLOAD/$FOLDER
10 |     fi
11 | done
12 | 
13 | BLOB='https://datarelease.blob.core.windows.net/value-leaderboard/starter_code_data'
14 | 
15 | # Use azcopy for video db downloading
16 | if [ -f ~/azcopy/azcopy ]; then
17 |     echo "azcopy exists, skip downloading"
18 | else 
19 |     echo "azcopy does not exist, start downloading"
20 |     wget -P ~/azcopy/ https://convaisharables.blob.core.windows.net/azcopy/azcopy
21 | fi
22 | chmod +x ~/azcopy/azcopy
23 | 
24 | # video dbs
25 | if [ ! -d $DOWNLOAD/video_db/violin/ ] ; then
26 |     ~/azcopy/azcopy cp $BLOB/video_db/violin.tar $DOWNLOAD/video_db/violin.tar
27 |     tar -xvf $DOWNLOAD/video_db/violin.tar -C $DOWNLOAD/video_db 
28 |     rm $DOWNLOAD/video_db/violin.tar
29 | fi
30 | 
31 | # text dbs
32 | for SPLIT in 'train' 'val' 'test' 'test_private'; do
33 |     if [ ! -d $DOWNLOAD/txt_db/violin_$SPLIT.db/ ] ; then
34 |         wget $BLOB/txt_db/violin_$SPLIT.db.tar -P $DOWNLOAD/txt_db/
35 |         tar -xvf $DOWNLOAD/txt_db/violin_$SPLIT.db.tar -C $DOWNLOAD/txt_db
36 |         rm $DOWNLOAD/txt_db/violin_$SPLIT.db.tar
37 |     fi
38 | done
39 | if [ ! -d $DOWNLOAD/txt_db/violin_subtitles.db/ ] ; then
40 |     wget $BLOB/txt_db/violin_subtitles.db.tar -P $DOWNLOAD/txt_db/
41 |     tar -xvf $DOWNLOAD/txt_db/violin_subtitles.db.tar -C $DOWNLOAD/txt_db
42 |     rm $DOWNLOAD/txt_db/violin_subtitles.db.tar
43 | fi
44 | 
45 | HEROBLOB='https://convaisharables.blob.core.windows.net/hero'
46 | # pretrained
47 | if [ ! -f $DOWNLOAD/pretrained/hero-tv-ht100.pt ] ; then
48 |     wget $HEROBLOB/pretrained/hero-tv-ht100.pt -P $DOWNLOAD/pretrained/
49 | fi
50 | 


--------------------------------------------------------------------------------
/scripts/download_vlep.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation.
 2 | # Licensed under the MIT license.
 3 | 
 4 | 
 5 | DOWNLOAD=$1
 6 | 
 7 | for FOLDER in 'video_db' 'txt_db' 'pretrained' 'finetune'; do
 8 |     if [ ! -d $DOWNLOAD/$FOLDER ] ; then
 9 |         mkdir -p $DOWNLOAD/$FOLDER
10 |     fi
11 | done
12 | 
13 | BLOB='https://datarelease.blob.core.windows.net/value-leaderboard/starter_code_data'
14 | 
15 | # Use azcopy for video db downloading
16 | if [ -f ~/azcopy/azcopy ]; then
17 |     echo "azcopy exists, skip downloading"
18 | else 
19 |     echo "azcopy does not exist, start downloading"
20 |     wget -P ~/azcopy/ https://convaisharables.blob.core.windows.net/azcopy/azcopy
21 | fi
22 | chmod +x ~/azcopy/azcopy
23 | 
24 | # video dbs
25 | if [ ! -d $DOWNLOAD/video_db/vlep/ ] ; then
26 |     ~/azcopy/azcopy cp $BLOB/video_db/vlep.tar $DOWNLOAD/video_db/vlep.tar
27 |     tar -xvf $DOWNLOAD/video_db/vlep.tar -C $DOWNLOAD/video_db 
28 |     rm $DOWNLOAD/video_db/vlep.tar
29 | fi
30 | 
31 | # text dbs
32 | for SPLIT in 'train' 'dev' 'test' ; do
33 |     if [ ! -d $DOWNLOAD/txt_db/vlep_$SPLIT.db/ ] ; then
34 |         wget $BLOB/txt_db/vlep_$SPLIT.db.tar -P $DOWNLOAD/txt_db/
35 |         tar -xvf $DOWNLOAD/txt_db/vlep_$SPLIT.db.tar -C $DOWNLOAD/txt_db
36 |         rm $DOWNLOAD/txt_db/vlep_$SPLIT.db.tar
37 |     fi
38 | done
39 | if [ ! -d $DOWNLOAD/txt_db/vlep_subtitles.db/ ] ; then
40 |     wget $BLOB/txt_db/vlep_subtitles.db.tar -P $DOWNLOAD/txt_db/
41 |     tar -xvf $DOWNLOAD/txt_db/vlep_subtitles.db.tar -C $DOWNLOAD/txt_db
42 |     rm $DOWNLOAD/txt_db/vlep_subtitles.db.tar
43 | fi
44 | 
45 | HEROBLOB='https://convaisharables.blob.core.windows.net/hero'
46 | # pretrained
47 | if [ ! -f $DOWNLOAD/pretrained/hero-tv-ht100.pt ] ; then
48 |     wget $HEROBLOB/pretrained/hero-tv-ht100.pt -P $DOWNLOAD/pretrained/
49 | fi
50 | 


--------------------------------------------------------------------------------
/scripts/download_yc2.sh:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Microsoft Corporation.
 2 | # Licensed under the MIT license.
 3 | 
 4 | 
 5 | DOWNLOAD=$1
 6 | 
 7 | for FOLDER in 'video_db' 'txt_db' 'pretrained' 'finetune'; do
 8 |     if [ ! -d $DOWNLOAD/$FOLDER ] ; then
 9 |         mkdir -p $DOWNLOAD/$FOLDER
10 |     fi
11 | done
12 | 
13 | BLOB='https://datarelease.blob.core.windows.net/value-leaderboard/starter_code_data'
14 | 
15 | # Use azcopy for video db downloading
16 | if [ -f ~/azcopy/azcopy ]; then
17 |     echo "azcopy exists, skip downloading"
18 | else 
19 |     echo "azcopy does not exist, start downloading"
20 |     wget -P ~/azcopy/ https://convaisharables.blob.core.windows.net/azcopy/azcopy
21 | fi
22 | chmod +x ~/azcopy/azcopy
23 | 
24 | # video dbs
25 | if [ ! -d $DOWNLOAD/video_db/yc2/ ] ; then
26 |     ~/azcopy/azcopy cp $BLOB/video_db/yc2.tar $DOWNLOAD/video_db/yc2.tar
27 |     tar -xvf $DOWNLOAD/video_db/yc2.tar -C $DOWNLOAD/video_db 
28 |     rm $DOWNLOAD/video_db/yc2.tar
29 | fi
30 | 
31 | # text dbs
32 | if [ ! -d $DOWNLOAD/txt_db/yc2_subtitles.db/ ] ; then
33 |     wget $BLOB/txt_db/yc2_subtitles.db.tar -P $DOWNLOAD/txt_db/
34 |     tar -xvf $DOWNLOAD/txt_db/yc2_subtitles.db.tar -C $DOWNLOAD/txt_db
35 |     rm $DOWNLOAD/txt_db/yc2_subtitles.db.tar
36 | fi
37 | # yc2r
38 | for SPLIT in 'train' 'val' 'test' ; do
39 |     if [ ! -d $DOWNLOAD/txt_db/yc2r_$SPLIT.db/ ] ; then
40 |         wget $BLOB/txt_db/yc2r_$SPLIT.db.tar -P $DOWNLOAD/txt_db/
41 |         tar -xvf $DOWNLOAD/txt_db/yc2r_$SPLIT.db.tar -C $DOWNLOAD/txt_db
42 |         rm $DOWNLOAD/txt_db/yc2r_$SPLIT.db.tar
43 |     fi
44 | done
45 | 
46 | # pretrained
47 | if [ ! -f $DOWNLOAD/pretrained/hero-tv-ht100.pt ] ; then
48 |     wget $BLOB/pretrained/hero-tv-ht100.pt -P $DOWNLOAD/pretrained/
49 | fi
50 | 
51 | BLOB='https://datarelease.blob.core.windows.net/value-leaderboard'
52 | YC2C=$BLOB/'yc2c'
53 | # yc2c raw data (evaluation and inference)
54 | for SPLIT in 'val' 'test' ; do
55 |     wget -nc $YC2C/yc2c_${SPLIT}_release.jsonl -O $DOWNLOAD/txt_db/yc2c_${SPLIT}_release.jsonl
56 | done
57 | 


--------------------------------------------------------------------------------
/scripts/prepro_tvc.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Copyright (c) Microsoft Corporation.
 3 | # Licensed under the MIT license.
 4 | 
 5 | DATA=$1  # txt_db
 6 | 
 7 | for SPLIT in 'val' 'train'; do
 8 |     CMD="python scripts/prepro_tvc.py \
 9 |          --annotation /txt/tvc_${SPLIT}_release.jsonl \
10 |          --subtitles /txt/tvqa_preprocessed_subtitles.jsonl \
11 |          --output /txt/tvc_${SPLIT}_new.db"
12 | 
13 |     docker run --ipc=host --rm \
14 |         --mount src=$(pwd),dst=/src,type=bind \
15 |         --mount src=$DATA,dst=/txt,type=bind \
16 |         -w /src linjieli222/hero \
17 |         bash -c "$CMD"
18 | done
19 | 


--------------------------------------------------------------------------------
/two_stream_eval/inf_yc2c.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Copyright (c) Microsoft Corporation.
  3 | Licensed under the MIT license.
  4 | 
  5 | Two-stream evaluation for YC2C
  6 | 
  7 | copied/modified from HERO
  8 | (https://github.com/linjieli222/HERO)
  9 | """
 10 | import sys
 11 | import os
 12 | currentdir = os.path.dirname(os.path.realpath(__file__))
 13 | parentdir = os.path.dirname(currentdir)
 14 | sys.path.insert(0, parentdir)
 15 | 
 16 | import argparse
 17 | import json
 18 | 
 19 | from horovod import torch as hvd
 20 | from transformers import RobertaTokenizer
 21 | 
 22 | from eval.yc2c import Yc2cEval
 23 | from utils.distributed import all_gather_list
 24 | from utils.basic_utils import save_jsonl
 25 | 
 26 | from os.path import exists
 27 | from pred_agg_eval.videocap_generator import VideoCapGenerator
 28 | from pred_agg_eval.inf_vatex_en_c import decode
 29 | from inf_tvc import load_model
 30 | from inf_vatex_en_c import load_inf_data
 31 | 
 32 | 
 33 | def main(opts):
 34 |     hvd.init()
 35 |     if hvd.rank() == 0:
 36 |         toker = RobertaTokenizer.from_pretrained('roberta-base')
 37 |         all_gather_list(None)
 38 |     else:
 39 |         all_gather_list(None)
 40 |         toker = RobertaTokenizer.from_pretrained('roberta-base')
 41 |     bos = toker.convert_tokens_to_ids(['<s>'])[0]
 42 |     eos = toker.convert_tokens_to_ids(['</s>'])[0]
 43 | 
 44 |     video_only_model_opts, video_only_model = load_model(
 45 |         opts.video_only_model_dir, opts.video_only_ckpt_step,
 46 |         opts)
 47 |     video_only_dataloader = load_inf_data(
 48 |         opts, video_only_model_opts, mode="video_only")
 49 | 
 50 |     if exists(opts.sub_only_model_dir):
 51 |         sub_only_model_opts, sub_only_model = load_model(
 52 |             opts.sub_only_model_dir,
 53 |             opts.sub_only_ckpt_step, opts)
 54 |         sub_only_dataloader = load_inf_data(
 55 |             opts, sub_only_model_opts, mode="sub_only")
 56 |     else:
 57 |         sub_only_model, sub_only_dataloader = None, None
 58 | 
 59 |     generator = VideoCapGenerator(
 60 |         video_only_model, opts.max_gen_step,
 61 |         bos, eos, not opts.no_fp16,
 62 |         model2=sub_only_model)
 63 | 
 64 |     results = decode(
 65 |         video_only_dataloader, sub_only_dataloader,
 66 |         generator, toker)
 67 |     output_path = os.path.join(
 68 |         opts.video_only_model_dir, opts.output)
 69 |     save_jsonl(results, output_path)
 70 | 
 71 |     # evaluate score if possible
 72 |     if (hvd.rank() == 0
 73 |             and 'descs' in json.loads(next(iter(open(opts.target_clip))))):
 74 |         evaluator = Yc2cEval(opts.target_clip)
 75 |         score = evaluator(results)
 76 |         print(score)
 77 | 
 78 | 
 79 | if __name__ == "__main__":
 80 |     parser = argparse.ArgumentParser()
 81 |     parser.add_argument("--sub_txt_db",
 82 |                         default="/txt/yc2_subtitles.db",
 83 |                         type=str,
 84 |                         help="The input video subtitle corpus. (LMDB)")
 85 |     parser.add_argument("--vfeat_db",
 86 |                         default="/video/yc2", type=str,
 87 |                         help="The input video frame features.")
 88 |     parser.add_argument("--video_only_model_dir", required=True, type=str,
 89 |                         help="dir root to trained model")
 90 |     parser.add_argument("--video_only_ckpt_step", required=True, type=int,
 91 |                         help="checkpoint step")
 92 |     parser.add_argument("--sub_only_model_dir", default="", type=str,
 93 |                         help="dir root to trained model")
 94 |     parser.add_argument("--sub_only_ckpt_step", default=-1, type=int,
 95 |                         help="checkpoint step")
 96 |     parser.add_argument("--output", type=str, required=True,
 97 |                         help="output file name")
 98 | 
 99 |     parser.add_argument("--batch_size", default=16, type=int,
100 |                         help="validation batch size (per GPU)")
101 |     parser.add_argument("--max_gen_step", default=30, type=int,
102 |                         help="max generation steps")
103 | 
104 |     parser.add_argument('--n_workers', type=int, default=4,
105 |                         help="number of data workers")
106 |     parser.add_argument('--no_pin_mem', action='store_true',
107 |                         help="disable pin memory")
108 |     parser.add_argument("--no_fp16", action='store_true',
109 |                         help="disable fp16")
110 | 
111 |     parser.add_argument("--target_clip", required=True, type=str,
112 |                         help="jsonl annotation")
113 | 
114 |     args = parser.parse_args()
115 | 
116 |     main(args)


--------------------------------------------------------------------------------
/two_stream_eval/videocap_generator.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) Microsoft Corporation.
 3 | Licensed under the MIT license.
 4 | 
 5 | copied/modified from HERO
 6 | (https://github.com/linjieli222/HERO)
 7 | """
 8 | import torch
 9 | from model.videoCap import _to_fp16
10 | 
11 | 
12 | class VideoCapGenerator(object):
13 |     def __init__(self, model1, max_step, bos, eos, fp16, model2=None):
14 |         self.model1 = model1
15 |         self.model2 = model2
16 |         self.max_step = max_step
17 |         self.bos = bos
18 |         self.eos = eos
19 |         self.fp16 = fp16
20 | 
21 |     def greedy_decode(self, batch1, batch2=None):
22 |         """
23 |         run greedy decoding
24 |         NOTE: Speed can potentially be improved by keeping past
25 |               decoder hidden states and only run `step-wise` forward.
26 |               Also, maybe can add early stop when all sequences reaches eos
27 |               instead of running until max_step.
28 |         """
29 |         if self.fp16:
30 |             batch1 = _to_fp16(batch1)
31 |             if batch2 is not None:
32 |                 batch2 = _to_fp16(batch2)
33 |         encoder_outputs1, enc_mask1 = self.model1.encode(batch1)  # (N, Lv, D)
34 |         if self.fp16:
35 |             encoder_outputs1 = encoder_outputs1.half()
36 |         batch_size = enc_mask1.size(0)
37 |         bos = torch.tensor([self.bos]).expand(batch_size).cuda()
38 |         input_ids = torch.zeros(batch_size, self.max_step).to(bos)
39 |         pos_ids = torch.arange(0, self.max_step+1).unsqueeze(0).cuda()
40 |         last_out = bos
41 | 
42 |         if batch2 is not None:
43 |             encoder_outputs2, enc_mask2 = self.model2.encode(
44 |                 batch2)  # (N, Lv, D)
45 |             if self.fp16:
46 |                 encoder_outputs2 = encoder_outputs2.half()
47 |         for step in range(self.max_step):
48 |             input_ids[:, step] = last_out
49 |             score = self.model1.decode(encoder_outputs1, enc_mask1,
50 |                                        input_ids[:, :step+1],
51 |                                        pos_ids[:, :step+1],
52 |                                        None, compute_loss=False)
53 |             if batch2 is not None:
54 |                 score2 = self.model2.decode(
55 |                     encoder_outputs2, enc_mask2,
56 |                     input_ids[:, :step+1],
57 |                     pos_ids[:, :step+1],
58 |                     None, compute_loss=False)
59 |                 score = score/2. + score2/2.
60 |             output_ids = score.max(dim=-1)[1]
61 |             last_out = output_ids[:, -1]
62 | 
63 |         outputs = [self.cut_eos(ids) for ids in output_ids.tolist()]
64 |         return outputs
65 | 
66 |     def cut_eos(self, ids):
67 |         out_ids = []
68 |         for i in ids:
69 |             if i == self.eos:
70 |                 break
71 |             out_ids.append(i)
72 |         return out_ids
73 | 


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VALUE-Leaderboard/StarterCode/fe600a7dd552227a5d0297ab953a52d5ea667c9a/utils/__init__.py


--------------------------------------------------------------------------------
/utils/const.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) Microsoft Corporation.
 3 | Licensed under the MIT license.
 4 | 
 5 | constants
 6 | 
 7 | copied/modified from HERO
 8 | (https://github.com/linjieli222/HERO)
 9 | """
10 | VFEAT_DIM = {"resnet_slowfast": 4352, "resnet_mil-nce": 3072,
11 |              "clip-vit_slowfast": 2816,
12 |              "clip-vit_mil-nce": 1536,
13 |              "resnet": 2048,
14 |              "slowfast": 2304,
15 |              "clip-vit": 512,
16 |              "mil-nce": 1024}
17 | # VFEAT_DIM = 4352
18 | MAX_FRM_SEQ_LEN = 100
19 | VCMR_IOU_THDS = (0.5, 0.7)
20 | 


--------------------------------------------------------------------------------
/utils/logger.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Copyright (c) Microsoft Corporation.
  3 | Licensed under the MIT license.
  4 | 
  5 | some functions are modified from UNITER
  6 | (https://github.com/ChenRocks/UNITER)
  7 | 
  8 | helper for logging
  9 | NOTE: loggers are global objects use with caution
 10 | 
 11 | copied/modified from HERO
 12 | (https://github.com/linjieli222/HERO)
 13 | """
 14 | import logging
 15 | 
 16 | import tensorboardX
 17 | 
 18 | 
 19 | _LOG_FMT = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s'
 20 | _DATE_FMT = '%m/%d/%Y %H:%M:%S'
 21 | logging.basicConfig(format=_LOG_FMT, datefmt=_DATE_FMT, level=logging.INFO)
 22 | LOGGER = logging.getLogger('__main__')  # this is the global logger
 23 | 
 24 | 
 25 | def add_log_to_file(log_path):
 26 |     fh = logging.FileHandler(log_path)
 27 |     formatter = logging.Formatter(_LOG_FMT, datefmt=_DATE_FMT)
 28 |     fh.setFormatter(formatter)
 29 |     LOGGER.addHandler(fh)
 30 | 
 31 | 
 32 | class TensorboardLogger(object):
 33 |     def __init__(self):
 34 |         self._logger = None
 35 |         self._global_step = 0
 36 | 
 37 |     def create(self, path):
 38 |         self._logger = tensorboardX.SummaryWriter(path)
 39 | 
 40 |     def noop(self, *args, **kwargs):
 41 |         return
 42 | 
 43 |     def step(self):
 44 |         self._global_step += 1
 45 | 
 46 |     @property
 47 |     def global_step(self):
 48 |         return self._global_step
 49 | 
 50 |     @global_step.setter
 51 |     def global_step(self, step):
 52 |         self._global_step = step
 53 | 
 54 |     def log_scaler_dict(self, log_dict, prefix=''):
 55 |         """ log a dictionary of scalar values"""
 56 |         if self._logger is None:
 57 |             return
 58 |         if prefix:
 59 |             prefix = f'{prefix}_'
 60 |         for name, value in log_dict.items():
 61 |             if isinstance(value, dict):
 62 |                 self.log_scaler_dict(value, self._global_step,
 63 |                                      prefix=f'{prefix}{name}')
 64 |             else:
 65 |                 self._logger.add_scalar(f'{prefix}{name}', value,
 66 |                                         self._global_step)
 67 | 
 68 |     def __getattr__(self, name):
 69 |         if self._logger is None:
 70 |             return self.noop
 71 |         return self._logger.__getattribute__(name)
 72 | 
 73 | 
 74 | TB_LOGGER = TensorboardLogger()
 75 | 
 76 | 
 77 | class RunningMeter(object):
 78 |     """ running meteor of a scalar value
 79 |         (useful for monitoring training loss)
 80 |     """
 81 |     def __init__(self, name, val=None, smooth=0.99):
 82 |         self._name = name
 83 |         self._sm = smooth
 84 |         self._val = val
 85 | 
 86 |     def __call__(self, value):
 87 |         self._val = (value if self._val is None
 88 |                      else value*(1-self._sm) + self._val*self._sm)
 89 | 
 90 |     def __str__(self):
 91 |         return f'{self._name}: {self._val:.4f}'
 92 | 
 93 |     @property
 94 |     def val(self):
 95 |         return self._val
 96 | 
 97 |     @property
 98 |     def name(self):
 99 |         return self._name
100 | 


--------------------------------------------------------------------------------
/utils/misc.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) Microsoft Corporation.
 3 | Licensed under the MIT license.
 4 | 
 5 | Copied from UNITER
 6 | (https://github.com/ChenRocks/UNITER)
 7 | 
 8 | Misc utilities
 9 | 
10 | copied/modified from HERO
11 | (https://github.com/linjieli222/HERO)
12 | """
13 | import random
14 | 
15 | import torch
16 | import numpy as np
17 | 
18 | from utils.logger import LOGGER
19 | 
20 | 
21 | class Struct(object):
22 |     def __init__(self, dict_):
23 |         self.__dict__.update(dict_)
24 | 
25 | 
26 | class NoOp(object):
27 |     """ useful for distributed training No-Ops """
28 |     def __getattr__(self, name):
29 |         return self.noop
30 | 
31 |     def noop(self, *args, **kwargs):
32 |         return
33 | 
34 | 
35 | def set_dropout(model, drop_p):
36 |     for name, module in model.named_modules():
37 |         # we might want to tune dropout for smaller dataset
38 |         if isinstance(module, torch.nn.Dropout):
39 |             if module.p != drop_p:
40 |                 module.p = drop_p
41 |                 LOGGER.info(f'{name} set to {drop_p}')
42 | 
43 | 
44 | def set_random_seed(seed):
45 |     random.seed(seed)
46 |     np.random.seed(seed)
47 |     torch.manual_seed(seed)
48 |     torch.cuda.manual_seed_all(seed)
49 | 


--------------------------------------------------------------------------------