├── allinone
│   ├── __init__.py
│   ├── AllInOne
│   │   ├── __init__.py
│   │   ├── gadgets
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   └── my_metrics.cpython-37.pyc
│   │   │   └── my_metrics.py
│   │   ├── datasets
│   │   │   ├── k400_zero_shot.py
│   │   │   ├── __pycache__
│   │   │   │   ├── vcr.cpython-37.pyc
│   │   │   │   ├── cc3m.cpython-37.pyc
│   │   │   │   ├── didemo.cpython-37.pyc
│   │   │   │   ├── ego4d.cpython-37.pyc
│   │   │   │   ├── hmdb51.cpython-37.pyc
│   │   │   │   ├── k400.cpython-37.pyc
│   │   │   │   ├── msrvtt.cpython-37.pyc
│   │   │   │   ├── msvd.cpython-37.pyc
│   │   │   │   ├── msvdqa.cpython-37.pyc
│   │   │   │   ├── tgif.cpython-37.pyc
│   │   │   │   ├── tgifqa.cpython-37.pyc
│   │   │   │   ├── tvqa.cpython-37.pyc
│   │   │   │   ├── webvid.cpython-37.pyc
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   ├── msrvttqa.cpython-37.pyc
│   │   │   │   ├── activitynet.cpython-37.pyc
│   │   │   │   ├── howto100m.cpython-37.pyc
│   │   │   │   ├── yttemporal.cpython-37.pyc
│   │   │   │   ├── base_dataset.cpython-37.pyc
│   │   │   │   ├── ego4d_choice.cpython-37.pyc
│   │   │   │   ├── lsmdc_choice.cpython-37.pyc
│   │   │   │   ├── lsmdc_dataset.cpython-37.pyc
│   │   │   │   ├── msrvtt_choice.cpython-37.pyc
│   │   │   │   ├── nlvr2_dataset.cpython-37.pyc
│   │   │   │   ├── vqav2_dataset.cpython-37.pyc
│   │   │   │   ├── vg_caption_dataset.cpython-37.pyc
│   │   │   │   ├── video_base_dataset.cpython-37.pyc
│   │   │   │   ├── sbu_caption_dataset.cpython-37.pyc
│   │   │   │   ├── coco_caption_karpathy_dataset.cpython-37.pyc
│   │   │   │   └── f30k_caption_karpathy_dataset.cpython-37.pyc
│   │   │   ├── sbu_caption_dataset.py
│   │   │   ├── vg_caption_dataset.py
│   │   │   ├── f30k_caption_karpathy_dataset.py
│   │   │   ├── coco_caption_karpathy_dataset.py
│   │   │   ├── __init__.py
│   │   │   ├── didemo.py
│   │   │   ├── vqav2_dataset.py
│   │   │   ├── nlvr2_dataset.py
│   │   │   ├── msvd.py
│   │   │   ├── lsmdc_dataset.py
│   │   │   ├── ego4d.py
│   │   │   ├── msrvtt.py
│   │   │   ├── msrvtt_choice.py
│   │   │   ├── hmdb51_zero_shot.py
│   │   │   ├── hmdb51.py
│   │   │   ├── ego4d_choice.py
│   │   │   ├── msrvttqa.py
│   │   │   ├── msvdqa.py
│   │   │   ├── tgif.py
│   │   │   ├── lsmdc_choice.py
│   │   │   ├── k400.py
│   │   │   ├── activitynet.py
│   │   │   ├── webvid.py
│   │   │   └── tvqaplus.py
│   │   ├── modules
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── heads.cpython-37.pyc
│   │   │   │   ├── meltr.cpython-37.pyc
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   ├── dist_utils.cpython-37.pyc
│   │   │   │   ├── objectives.cpython-37.pyc
│   │   │   │   ├── allinone_utils.cpython-37.pyc
│   │   │   │   ├── temporal_roll.cpython-37.pyc
│   │   │   │   ├── allinone_module.cpython-37.pyc
│   │   │   │   ├── retrieval_metrics.cpython-37.pyc
│   │   │   │   └── base_vision_transformer.cpython-37.pyc
│   │   │   ├── forzen_param.py
│   │   │   ├── heads.py
│   │   │   ├── temporal_roll.py
│   │   │   └── meltr.py
│   │   ├── __pycache__
│   │   │   ├── config.cpython-37.pyc
│   │   │   └── __init__.cpython-37.pyc
│   │   ├── transforms
│   │   │   ├── __pycache__
│   │   │   │   ├── mix.cpython-37.pyc
│   │   │   │   ├── utils.cpython-37.pyc
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   ├── pixelbert.cpython-37.pyc
│   │   │   │   ├── randaug.cpython-37.pyc
│   │   │   │   ├── videoaug.cpython-37.pyc
│   │   │   │   ├── functional.cpython-37.pyc
│   │   │   │   └── video_transform.cpython-37.pyc
│   │   │   ├── __init__.py
│   │   │   ├── pixelbert.py
│   │   │   ├── mix.py
│   │   │   ├── utils.py
│   │   │   ├── videoaug.py
│   │   │   └── functional.py
│   │   └── datamodules
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-37.pyc
│   │       │   ├── cc3m_datamodule.cpython-37.pyc
│   │       │   ├── datamodule_base.cpython-37.pyc
│   │       │   ├── k400_datamodule.cpython-37.pyc
│   │       │   ├── msvd_datamodule.cpython-37.pyc
│   │       │   ├── sbu_datamodule.cpython-37.pyc
│   │       │   ├── tgif_datamodule.cpython-37.pyc
│   │       │   ├── tvqa_datamodule.cpython-37.pyc
│   │       │   ├── vcr_datamodule.cpython-37.pyc
│   │       │   ├── didemo_datamodule.cpython-37.pyc
│   │       │   ├── ego4d_datamodule.cpython-37.pyc
│   │       │   ├── hmdb51_datamodule.cpython-37.pyc
│   │       │   ├── lsmdc_datamodule.cpython-37.pyc
│   │       │   ├── msrvtt_datamodule.cpython-37.pyc
│   │       │   ├── msvdqa_datamodule.cpython-37.pyc
│   │       │   ├── nlvr2_datamodule.cpython-37.pyc
│   │       │   ├── tgifqa_datamodule.cpython-37.pyc
│   │       │   ├── vqav2_datamodule.cpython-37.pyc
│   │       │   ├── webvid_datamodule.cpython-37.pyc
│   │       │   ├── howto100m_datamodule.cpython-37.pyc
│   │       │   ├── msrvttqa_datamodule.cpython-37.pyc
│   │       │   ├── multitask_datamodule.cpython-37.pyc
│   │       │   ├── activitynet_datamodule.cpython-37.pyc
│   │       │   ├── ego4d_choice_datamodule.cpython-37.pyc
│   │       │   ├── lsmdc_choice_datamodule.cpython-37.pyc
│   │       │   ├── vg_caption_datamodule.cpython-37.pyc
│   │       │   ├── yttemporal_datamodule.cpython-37.pyc
│   │       │   ├── msrvtt_choice_datamodule.cpython-37.pyc
│   │       │   ├── coco_caption_karpathy_datamodule.cpython-37.pyc
│   │       │   └── f30k_caption_karpathy_datamodule.cpython-37.pyc
│   │       ├── vcr_datamodule.py
│   │       ├── cc3m_datamodule.py
│   │       ├── nlvr2_datamodule.py
│   │       ├── sbu_datamodule.py
│   │       ├── activitynet_datamodule.py
│   │       ├── vg_caption_datamodule.py
│   │       ├── k400_datamodule.py
│   │       ├── msvd_datamodule.py
│   │       ├── tgif_datamodule.py
│   │       ├── tvqa_datamodule.py
│   │       ├── ego4d_datamodule.py
│   │       ├── lsmdc_datamodule.py
│   │       ├── didemo_datamodule.py
│   │       ├── hmdb51_datamodule.py
│   │       ├── msrvtt_datamodule.py
│   │       ├── tgifqa_datamodule.py
│   │       ├── webvid_datamodule.py
│   │       ├── howto100m_datamodule.py
│   │       ├── yttemporal_datamodule.py
│   │       ├── ego4d_choice_datamodule.py
│   │       ├── lsmdc_choice_datamodule.py
│   │       ├── msrvtt_choice_datamodule.py
│   │       ├── coco_caption_karpathy_datamodule.py
│   │       ├── f30k_caption_karpathy_datamodule.py
│   │       ├── msvdqa_datamodule.py
│   │       ├── msrvttqa_datamodule.py
│   │       ├── vqav2_datamodule.py
│   │       ├── __init__.py
│   │       └── multitask_datamodule.py
│   ├── setup.py
│   ├── param_and_flop.py
│   ├── move_pretrained_weight.py
│   ├── README.md
│   ├── run.py
│   └── requirements.txt
├── univl
│   ├── eval
│   │   ├── __init__.py
│   │   └── retrieval.py
│   ├── utils
│   │   ├── __init__.py
│   │   └── metrics.py
│   ├── modules
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── beam.cpython-37.pyc
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   ├── file_utils.cpython-37.pyc
│   │   │   ├── module_bert.cpython-37.pyc
│   │   │   ├── module_cross.cpython-37.pyc
│   │   │   ├── module_decoder.cpython-37.pyc
│   │   │   ├── module_meltr.cpython-37.pyc
│   │   │   ├── module_visual.cpython-37.pyc
│   │   │   ├── tokenization.cpython-37.pyc
│   │   │   ├── until_config.cpython-37.pyc
│   │   │   └── until_module.cpython-37.pyc
│   │   ├── cross-base
│   │   │   └── cross_config.json
│   │   ├── visual-base
│   │   │   └── visual_config.json
│   │   ├── bert-base-uncased
│   │   │   └── bert_config.json
│   │   ├── decoder-base
│   │   │   └── decoder_config.json
│   │   ├── optimization_MELTR.py
│   │   ├── meltr.py
│   │   ├── beam.py
│   │   └── until_config.py
│   ├── requirements.txt
│   ├── dataloaders
│   │   ├── __pycache__
│   │   │   ├── dataloader_meta_msrvtt.cpython-37.pyc
│   │   │   ├── dataloader_meta_youcook.cpython-37.pyc
│   │   │   └── dataloader_msrvtt_caption.cpython-37.pyc
│   │   └── README.md
│   ├── asset
│   │   └── bert_config.json
│   ├── LICENSE
│   └── README.md
├── asset
│   └── main.png
├── violet
│   ├── lib.py
│   ├── args
│   │   ├── args_msvd-qaoe.json
│   │   ├── args_tgif-action.json
│   │   ├── args_tgif-frame.json
│   │   ├── args_tgif-transition.json
│   │   ├── args_msrvtt-retrieval_7k.json
│   │   ├── args_msrvtt-retrieval_9k.json
│   │   └── args_msrvtt-retrieval_eval.json
│   ├── tools
│   │   ├── extract_tsv.py
│   │   ├── extract_video-frame.py
│   │   └── extract_vq.py
│   ├── dataset.py
│   ├── agent.py
│   ├── utils.py
│   ├── README.md
│   ├── meltr.py
│   ├── model.py
│   └── eval_retrieval.py
├── LICENSE
└── README.md

/allinone/__init__.py:
--------------------------------------------------------------------------------

/univl/eval/__init__.py:
--------------------------------------------------------------------------------

/univl/utils/__init__.py:
--------------------------------------------------------------------------------

/univl/modules/__init__.py:
--------------------------------------------------------------------------------

/allinone/AllInOne/__init__.py:
--------------------------------------------------------------------------------

/allinone/AllInOne/gadgets/__init__.py:
--------------------------------------------------------------------------------

/allinone/AllInOne/datasets/k400_zero_shot.py:
--------------------------------------------------------------------------------

/asset/main.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlvlab/MELTR/HEAD/asset/main.png

/univl/requirements.txt:
--------------------------------------------------------------------------------
torch==1.7.0
tqdm
boto3
requests
pandas

/allinone/AllInOne/modules/__init__.py:
--------------------------------------------------------------------------------
from AllInOne.modules.allinone_module import AllinoneTransformerSS
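AllInOne/modules/__init__.py re-exports the model class, so the entry points (run.py, param_and_flop.py) can import it from the subpackage directly. A minimal import sketch, assuming the package has been installed via allinone/setup.py; instantiation is omitted because it needs the experiment config assembled in AllInOne/config.py (the `ex` object used through @ex.automain), which is not reproduced in this dump.

# Sketch only: relies on the re-export in AllInOne/modules/__init__.py.
from AllInOne.modules import AllinoneTransformerSS

# Constructing the model would require the config dict from AllInOne/config.py
# (an assumption based on param_and_flop.py's `from AllInOne.config import ex`).
print(AllinoneTransformerSS.__name__)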
/univl/modules/cross-base/cross_config.json:
--------------------------------------------------------------------------------
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 1024,
  "num_attention_heads": 12,
  "num_hidden_layers": 2,
  "vocab_size": 768
}

/univl/modules/visual-base/visual_config.json:
--------------------------------------------------------------------------------
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 1,
  "vocab_size": 1024
}

/violet/lib.py:
--------------------------------------------------------------------------------
import argparse, sys, os, io, base64, pickle, json, math

from datetime import datetime
from tqdm import tqdm

import numpy as np
import torch as T
import torchvision as TV
import torch.distributed as DIST

import cv2
from PIL import Image

import transformers
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

/allinone/AllInOne/transforms/__init__.py:
--------------------------------------------------------------------------------
from .pixelbert import (
    pixelbert_transform,
    pixelbert_transform_randaug,
)

_transforms = {
    "pixelbert": pixelbert_transform,
    "pixelbert_randaug": pixelbert_transform_randaug,
}


def keys_to_transforms(keys: list, size=224):
    return [_transforms[key](size=size) for key in keys]
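keys_to_transforms is the small factory the AllInOne datasets use to turn config keys into image transforms. A minimal usage sketch based only on the function above; the chosen keys and image size are illustrative:

from AllInOne.transforms import keys_to_transforms

# Each key maps to a callable in _transforms; `size` is forwarded to it.
train_transforms = keys_to_transforms(["pixelbert_randaug"], size=224)
val_transforms = keys_to_transforms(["pixelbert"], size=224)
assert len(train_transforms) == 1 and len(val_transforms) == 1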
"num_attention_heads": 12, 10 | "num_hidden_layers": 12, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } 14 | -------------------------------------------------------------------------------- /univl/modules/decoder-base/decoder_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 768, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 3072, 8 | "num_attention_heads": 12, 9 | "num_hidden_layers": 12, 10 | "type_vocab_size": 2, 11 | "vocab_size": 30522, 12 | "num_decoder_layers": 4, 13 | "max_target_embeddings": 512 14 | } 15 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/vcr_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import VCRDataset 2 | from .datamodule_base import BaseDataModule 3 | 4 | 5 | class VCRDataModule(BaseDataModule): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | 9 | @property 10 | def dataset_cls(self): 11 | return VCRDataset 12 | 13 | @property 14 | def dataset_name(self): 15 | return "vcr" 16 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/cc3m_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import CC3MDataset 2 | from .datamodule_base import BaseDataModule 3 | 4 | 5 | class CC3MDataModule(BaseDataModule): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | 9 | @property 10 | def dataset_cls(self): 11 | return CC3MDataset 12 | 13 | @property 14 | def dataset_name(self): 15 | return "cc3m" 16 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/nlvr2_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import NLVR2Dataset 2 | from .datamodule_base import BaseDataModule 3 | 4 | 5 | class NLVR2DataModule(BaseDataModule): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | 9 | @property 10 | def dataset_cls(self): 11 | return NLVR2Dataset 12 | 13 | @property 14 | def dataset_name(self): 15 | return "nlvr2" 16 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/sbu_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import SBUCaptionDataset 2 | from .datamodule_base import BaseDataModule 3 | 4 | 5 | class SBUCaptionDataModule(BaseDataModule): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | 9 | @property 10 | def dataset_cls(self): 11 | return SBUCaptionDataset 12 | 13 | @property 14 | def dataset_name(self): 15 | return "sbu" 16 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/activitynet_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import ActivityNetDataset 2 | from .datamodule_base import BaseDataModule 3 | 4 | 5 | class ActivityNetDataModule(BaseDataModule): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | 9 | @property 10 | def dataset_cls(self): 11 | 
/allinone/AllInOne/datamodules/vcr_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import VCRDataset
from .datamodule_base import BaseDataModule


class VCRDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return VCRDataset

    @property
    def dataset_name(self):
        return "vcr"

/allinone/AllInOne/datamodules/cc3m_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import CC3MDataset
from .datamodule_base import BaseDataModule


class CC3MDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return CC3MDataset

    @property
    def dataset_name(self):
        return "cc3m"

/allinone/AllInOne/datamodules/nlvr2_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import NLVR2Dataset
from .datamodule_base import BaseDataModule


class NLVR2DataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return NLVR2Dataset

    @property
    def dataset_name(self):
        return "nlvr2"

/allinone/AllInOne/datamodules/sbu_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import SBUCaptionDataset
from .datamodule_base import BaseDataModule


class SBUCaptionDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return SBUCaptionDataset

    @property
    def dataset_name(self):
        return "sbu"

/allinone/AllInOne/datamodules/activitynet_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import ActivityNetDataset
from .datamodule_base import BaseDataModule


class ActivityNetDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return ActivityNetDataset

    @property
    def dataset_name(self):
        return "activitynet"

/allinone/AllInOne/datamodules/vg_caption_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import VisualGenomeCaptionDataset
from .datamodule_base import BaseDataModule


class VisualGenomeCaptionDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return VisualGenomeCaptionDataset

    @property
    def dataset_name(self):
        return "vg"

/allinone/AllInOne/datamodules/k400_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import K400Dataset
from .datamodule_base import BaseDataModule


class K400DataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return K400Dataset

    @property
    def dataset_cls_no_false(self):
        return K400Dataset

    @property
    def dataset_name(self):
        return "k400"

/allinone/AllInOne/datamodules/msvd_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import MSVDDataset
from .datamodule_base import BaseDataModule


class MSVDDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return MSVDDataset

    @property
    def dataset_cls_no_false(self):
        return MSVDDataset

    @property
    def dataset_name(self):
        return "msvd"

/allinone/AllInOne/datamodules/tgif_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import TGIFDataset
from .datamodule_base import BaseDataModule


class TGIFDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return TGIFDataset

    @property
    def dataset_cls_no_false(self):
        return TGIFDataset

    @property
    def dataset_name(self):
        return "tgif"

/allinone/AllInOne/datamodules/tvqa_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import TVQADataset
from .datamodule_base import BaseDataModule


class TVQADataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return TVQADataset

    @property
    def dataset_cls_no_false(self):
        return TVQADataset

    @property
    def dataset_name(self):
        return "tvqa"
/allinone/AllInOne/datamodules/ego4d_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import Ego4DDataset
from .datamodule_base import BaseDataModule


class Ego4DDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return Ego4DDataset

    @property
    def dataset_cls_no_false(self):
        return Ego4DDataset

    @property
    def dataset_name(self):
        return "ego4d"

/allinone/AllInOne/datamodules/lsmdc_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import LSMDCDataset
from .datamodule_base import BaseDataModule


class LSMDCDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return LSMDCDataset

    @property
    def dataset_cls_no_false(self):
        return LSMDCDataset

    @property
    def dataset_name(self):
        return "lsmdc"

/allinone/AllInOne/datamodules/didemo_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import DIDEMODataset
from .datamodule_base import BaseDataModule


class DIDEMODataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return DIDEMODataset

    @property
    def dataset_cls_no_false(self):
        return DIDEMODataset

    @property
    def dataset_name(self):
        return "didemo"

/allinone/AllInOne/datamodules/hmdb51_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import HMDB51Dataset
from .datamodule_base import BaseDataModule


class HMDB51DataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return HMDB51Dataset

    @property
    def dataset_cls_no_false(self):
        return HMDB51Dataset

    @property
    def dataset_name(self):
        return "hmdb51"

/allinone/AllInOne/datamodules/msrvtt_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import MSRVTTDataset
from .datamodule_base import BaseDataModule


class MSRVTTDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return MSRVTTDataset

    @property
    def dataset_cls_no_false(self):
        return MSRVTTDataset

    @property
    def dataset_name(self):
        return "msrvtt"
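Every datamodule in AllInOne/datamodules follows the same template: subclass BaseDataModule and expose the dataset class (plus a dataset_cls_no_false variant for the video datasets) and a short name. A hedged sketch of how a new datamodule would follow that pattern; MyVideoDataModule and "my_video" are hypothetical names, and BaseDataModule's internals live in datamodule_base.py, which is not shown in this dump:

from AllInOne.datasets import MSRVTTDataset  # any existing dataset class could stand in
from AllInOne.datamodules.datamodule_base import BaseDataModule


class MyVideoDataModule(BaseDataModule):
    """Hypothetical example following the repository's datamodule template."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return MSRVTTDataset

    @property
    def dataset_cls_no_false(self):
        # The video datamodules above return the same class here; when each
        # variant is used is decided inside BaseDataModule (assumption).
        return MSRVTTDataset

    @property
    def dataset_name(self):
        return "my_video"

Registering the new module in datamodules/__init__.py and in the experiment config would still be required; those files are not part of this dump.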
/allinone/AllInOne/datamodules/tgifqa_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import TGIFQADataset
from .datamodule_base import BaseDataModule


class TGIFQADataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return TGIFQADataset

    @property
    def dataset_cls_no_false(self):
        return TGIFQADataset

    @property
    def dataset_name(self):
        return "tgifqa"

/allinone/AllInOne/datamodules/webvid_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import WEBVIDDataset
from .datamodule_base import BaseDataModule


class WEBVIDDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return WEBVIDDataset

    @property
    def dataset_cls_no_false(self):
        return WEBVIDDataset

    @property
    def dataset_name(self):
        return "webvid"

/allinone/AllInOne/datamodules/howto100m_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import HT100MDataset
from .datamodule_base import BaseDataModule


class HT100MDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return HT100MDataset

    @property
    def dataset_cls_no_false(self):
        return HT100MDataset

    @property
    def dataset_name(self):
        return "howto100m"

/allinone/AllInOne/datamodules/yttemporal_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import YTTemporalDataset
from .datamodule_base import BaseDataModule


class YTTemporalMDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return YTTemporalDataset

    @property
    def dataset_cls_no_false(self):
        return YTTemporalDataset

    @property
    def dataset_name(self):
        return "yttemporal"

/allinone/AllInOne/datamodules/ego4d_choice_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import EGO4DChoiceDataset
from .datamodule_base import BaseDataModule


class EGO4DChoiceDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return EGO4DChoiceDataset

    @property
    def dataset_cls_no_false(self):
        return EGO4DChoiceDataset

    @property
    def dataset_name(self):
        return "ego4d_choice"

/allinone/AllInOne/datamodules/lsmdc_choice_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import LSMDCChoiceDataset
from .datamodule_base import BaseDataModule


class LSMDCChoiceDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return LSMDCChoiceDataset

    @property
    def dataset_cls_no_false(self):
        return LSMDCChoiceDataset

    @property
    def dataset_name(self):
        return "lsmdc_choice"
| 9 | @property 10 | def dataset_cls(self): 11 | return LSMDCChoiceDataset 12 | 13 | @property 14 | def dataset_cls_no_false(self): 15 | return LSMDCChoiceDataset 16 | 17 | @property 18 | def dataset_name(self): 19 | return "lsmdc_choice" 20 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/msrvtt_choice_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import MSRVTTChoiceDataset 2 | from .datamodule_base import BaseDataModule 3 | 4 | 5 | class MSRVTTChoiceDataModule(BaseDataModule): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | 9 | @property 10 | def dataset_cls(self): 11 | return MSRVTTChoiceDataset 12 | 13 | @property 14 | def dataset_cls_no_false(self): 15 | return MSRVTTChoiceDataset 16 | 17 | @property 18 | def dataset_name(self): 19 | return "msrvtt_choice" 20 | -------------------------------------------------------------------------------- /allinone/AllInOne/modules/forzen_param.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | # def forzen_param(model): 5 | # for name, param in model.named_parameters(): 6 | # if 'mlm_score' in name or 'itm_score' in name or 'mpp_score' in name: 7 | # param.requires_grad = True 8 | # else: 9 | # param.requires_grad = False 10 | # return True 11 | 12 | 13 | def forzen_param(model): 14 | flag = False 15 | for name, param in model.named_parameters(): 16 | if '10' in name: 17 | flag = True 18 | param.requires_grad = flag 19 | return True -------------------------------------------------------------------------------- /violet/args/args_msvd-qaoe.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "qaoe", 3 | 4 | "dataset": "msvd", 5 | "task": "msvd-qa", 6 | "annotation_file": "txt_msvd-qa.json", 7 | 8 | "size_img": 224, 9 | "size_txt": 25, 10 | "size_vocab": 1000, 11 | 12 | "size_epoch": 30, 13 | "size_batch": 5, 14 | 15 | "lr": 1.2e-5, 16 | "decay": 1e-3, 17 | 18 | "meltr_lr": 1e-4, 19 | "meltr_decay": 1e-4, 20 | "max_grad_norm": 12, 21 | "auxgrad_every": 3, 22 | "gamma": 0.1, 23 | 24 | "path_ckpt": "./checkpoint/ckpt_violet_pretrain.pt" 25 | } 26 | -------------------------------------------------------------------------------- /violet/args/args_tgif-action.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "qamc", 3 | 4 | "dataset": "tgif", 5 | "task": "tgif-action", 6 | "annotation_file": "txt_tgif-action.json", 7 | 8 | "size_img": 224, 9 | "size_txt": 40, 10 | "size_option": 5, 11 | 12 | "size_epoch": 20, 13 | "size_batch": 3, 14 | 15 | "lr": 6e-6, 16 | "decay": 1e-3, 17 | 18 | "meltr_lr": 1e-4, 19 | "meltr_decay": 1e-4, 20 | "max_grad_norm": 12, 21 | "auxgrad_every": 3, 22 | "gamma": 0.1, 23 | 24 | "path_ckpt": "./checkpoint/ckpt_violet_pretrain.pt" 25 | } 26 | -------------------------------------------------------------------------------- /violet/args/args_tgif-frame.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "qaoe", 3 | 4 | "dataset": "tgif", 5 | "task": "tgif-frame", 6 | "annotation_file": "txt_tgif-frame.json", 7 | 8 | "size_img": 224, 9 | "size_txt": 25, 10 | "size_vocab": 1540, 11 | 12 | "size_epoch": 20, 13 | "size_batch": 5, 14 | 15 | "lr": 3e-5, 16 | "decay": 1e-4, 17 | 18 | "meltr_lr": 1e-4, 19 | "meltr_decay": 1e-4, 20 | 
"max_grad_norm": 12, 21 | "auxgrad_every": 3, 22 | "gamma": 0.1, 23 | 24 | "path_ckpt": "./checkpoint/ckpt_violet_pretrain.pt" 25 | } 26 | -------------------------------------------------------------------------------- /allinone/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="AllInOne", 5 | packages=find_packages( 6 | exclude=[".dfc", ".vscode", "dataset", "notebooks", "result", "scripts"] 7 | ), 8 | version="1.0.0", 9 | license="MIT", 10 | description="All in One: Exploring Unified Video-Language Pre-training", 11 | author="Alex Jinpeng Wang", 12 | author_email="awinyimgprocess@gmail.com", 13 | url="https://github.com/fingerrec'", 14 | keywords=["video and language pretraining"], 15 | install_requires=["torch", "pytorch_lightning"], 16 | ) 17 | -------------------------------------------------------------------------------- /violet/args/args_tgif-transition.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "qamc", 3 | 4 | "dataset": "tgif", 5 | "task": "tgif-transition", 6 | "annotation_file": "txt_tgif-transition.json", 7 | 8 | "size_img": 224, 9 | "size_txt": 40, 10 | "size_option": 5, 11 | 12 | "size_epoch": 20, 13 | "size_batch": 3, 14 | 15 | "lr": 3e-6, 16 | "decay": 1e-3, 17 | 18 | "meltr_lr": 1e-4, 19 | "meltr_decay": 1e-4, 20 | "max_grad_norm": 12, 21 | "auxgrad_every": 3, 22 | "gamma": 0.1, 23 | 24 | "path_ckpt": "./checkpoint/ckpt_violet_pretrain.pt" 25 | } 26 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/coco_caption_karpathy_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import CocoCaptionKarpathyDataset 2 | from .datamodule_base import BaseDataModule 3 | 4 | 5 | class CocoCaptionKarpathyDataModule(BaseDataModule): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | 9 | @property 10 | def dataset_cls(self): 11 | return CocoCaptionKarpathyDataset 12 | 13 | @property 14 | def dataset_cls_no_false(self): 15 | return CocoCaptionKarpathyDataset 16 | 17 | @property 18 | def dataset_name(self): 19 | return "coco" 20 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/f30k_caption_karpathy_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import F30KCaptionKarpathyDataset 2 | from .datamodule_base import BaseDataModule 3 | 4 | 5 | class F30KCaptionKarpathyDataModule(BaseDataModule): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | 9 | @property 10 | def dataset_cls(self): 11 | return F30KCaptionKarpathyDataset 12 | 13 | @property 14 | def dataset_cls_no_false(self): 15 | return F30KCaptionKarpathyDataset 16 | 17 | @property 18 | def dataset_name(self): 19 | return "f30k" 20 | -------------------------------------------------------------------------------- /allinone/param_and_flop.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import pytorch_lightning as pl 3 | from AllInOne.config import ex 4 | from AllInOne.modules import ViLTransformerSS 5 | from AllInOne.datamodules.multitask_datamodule import MTDataModule 6 | from thop import profile 7 | import torch 8 | 9 | @ex.automain 10 | def main(_config): 11 | _config = 
copy.deepcopy(_config) 12 | pl.seed_everything(_config["seed"]) 13 | 14 | dm = MTDataModule(_config, dist=True) 15 | 16 | model = ViLTransformerSS(_config) 17 | input = torch.randn(1, 3, 3, 224, 224) 18 | macs, params = profile(model, inputs=(input,)) 19 | print(macs, params) 20 | 21 | # 110M -------------------------------------------------------------------------------- /violet/args/args_msrvtt-retrieval_7k.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "retrieval", 3 | 4 | "dataset": "msrvtt", 5 | "task": "msrvtt-retrieval", 6 | "train_annotation": "./_data/msrvtt/train_7k.json", 7 | "test_annotation": "./_data/msrvtt/test.json", 8 | 9 | "size_img": 224, 10 | "size_txt": 25, 11 | 12 | "size_epoch": 20, 13 | "size_batch": 5, 14 | 15 | "lr": 3e-6, 16 | "decay": 1e-3, 17 | 18 | "meltr_lr": 3e-6, 19 | "meltr_decay": 1e-3, 20 | "max_grad_norm": 12, 21 | "auxgrad_every": 3, 22 | "gamma": 0.1, 23 | 24 | "path_ckpt": "./checkpoint/ckpt_violet_pretrain.pt" 25 | } 26 | -------------------------------------------------------------------------------- /violet/args/args_msrvtt-retrieval_9k.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "retrieval", 3 | 4 | "dataset": "msrvtt", 5 | "task": "msrvtt-retrieval", 6 | "train_annotation": "./_data/msrvtt/train_9k.json", 7 | "test_annotation": "./_data/msrvtt/test.json", 8 | 9 | "size_img": 224, 10 | "size_txt": 25, 11 | 12 | "size_epoch": 20, 13 | "size_batch": 5, 14 | 15 | "lr": 3e-6, 16 | "decay": 1e-3, 17 | 18 | "meltr_lr": 3e-6, 19 | "meltr_decay": 1e-3, 20 | "max_grad_norm": 12, 21 | "auxgrad_every": 3, 22 | "gamma": 0.1, 23 | 24 | "path_ckpt": "./checkpoint/ckpt_violet_pretrain.pt" 25 | } 26 | -------------------------------------------------------------------------------- /violet/args/args_msrvtt-retrieval_eval.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "retrieval", 3 | 4 | "dataset": "msrvtt", 5 | "task": "msrvtt-retrieval", 6 | "train_annotation": "./_data/msrvtt/train_9k.json", 7 | "test_annotation": "./_data/msrvtt/test.json", 8 | 9 | "size_img": 224, 10 | "size_txt": 25, 11 | 12 | "size_epoch": 20, 13 | "size_batch": 5, 14 | 15 | "lr": 3e-6, 16 | "decay": 1e-3, 17 | 18 | "meltr_lr": 3e-6, 19 | "meltr_decay": 1e-3, 20 | "max_grad_norm": 12, 21 | "auxgrad_every": 3, 22 | "gamma": 0.1, 23 | 24 | "path_ckpt": "./checkpoint/ckpt_violet_pretrain.pt" 25 | } 26 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/sbu_caption_dataset.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | from .base_dataset import BaseDataset 3 | 4 | 5 | class SBUCaptionDataset(BaseDataset): 6 | def __init__(self, *args, split="", **kwargs): 7 | assert split in ["train", "val", "test"] 8 | if split == "test": 9 | split = "val" 10 | 11 | if split == "train": 12 | names = [f"sbu_{i}" for i in range(9)] 13 | elif split == "val": 14 | names = [] 15 | 16 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 17 | 18 | def __getitem__(self, index): 19 | return self.get_suite(index) 20 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/vg_caption_dataset.py: -------------------------------------------------------------------------------- 1 | from .base_dataset import BaseDataset 2 | 3 | 4 | class 
VisualGenomeCaptionDataset(BaseDataset): 5 | def __init__(self, *args, split="", **kwargs): 6 | assert split in ["train", "val", "test"] 7 | if split == "test": 8 | split = "val" 9 | 10 | if split == "train": 11 | names = ["vg_train"] 12 | elif split == "val": 13 | names = [] 14 | elif split == "test": 15 | names = [] 16 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 17 | 18 | def __getitem__(self, index): 19 | return self.get_suite(index) 20 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/f30k_caption_karpathy_dataset.py: -------------------------------------------------------------------------------- 1 | from .base_dataset import BaseDataset 2 | 3 | 4 | class F30KCaptionKarpathyDataset(BaseDataset): 5 | def __init__(self, *args, split="", **kwargs): 6 | assert split in ["train", "val", "test"] 7 | 8 | if split == "train": 9 | names = ["f30k_caption_karpathy_train", "f30k_caption_karpathy_val"] 10 | elif split == "val": 11 | names = ["f30k_caption_karpathy_test"] 12 | elif split == "test": 13 | names = ["f30k_caption_karpathy_test"] 14 | 15 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 16 | 17 | def __getitem__(self, index): 18 | return self.get_suite(index) 19 | -------------------------------------------------------------------------------- /violet/tools/extract_tsv.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse, pickle 3 | from tqdm import tqdm 4 | 5 | def get_args(): 6 | parser = argparse.ArgumentParser() 7 | 8 | parser.add_argument('--path', required=True, type=str) 9 | 10 | args = parser.parse_args() 11 | 12 | return args 13 | 14 | if __name__=='__main__': 15 | args = get_args() 16 | 17 | pkl = pickle.load(open('%s.pkl'%(args.path), 'rb')) 18 | 19 | file_tsv, file_lineidx = open('%s.tsv'%(args.path), 'w'), open('%s.lineidx'%(args.path), 'w') 20 | for vid in tqdm(pkl, ascii=True): 21 | file_lineidx.write('%d\n'%(file_tsv.tell())) 22 | file_tsv.write(vid) 23 | for b in pkl[vid]: 24 | file_tsv.write('\t%s'%(b)) 25 | file_tsv.write('\n') 26 | 27 | file_tsv.flush(), file_lineidx.flush() 28 | 29 | -------------------------------------------------------------------------------- /allinone/AllInOne/transforms/pixelbert.py: -------------------------------------------------------------------------------- 1 | from .utils import ( 2 | inception_normalize, 3 | MinMaxResize, 4 | ) 5 | from torchvision import transforms 6 | from .randaug import RandAugment 7 | 8 | 9 | def pixelbert_transform(size=800): 10 | longer = int((1333 / 800) * size) 11 | return transforms.Compose( 12 | [ 13 | MinMaxResize(shorter=size, longer=longer), 14 | transforms.ToTensor(), 15 | inception_normalize, 16 | ] 17 | ) 18 | 19 | 20 | def pixelbert_transform_randaug(size=800): 21 | longer = int((1333 / 800) * size) 22 | trs = transforms.Compose( 23 | [ 24 | MinMaxResize(shorter=size, longer=longer), 25 | transforms.ToTensor(), 26 | inception_normalize, 27 | ] 28 | ) 29 | trs.transforms.insert(0, RandAugment(2, 9)) 30 | return trs -------------------------------------------------------------------------------- /allinone/AllInOne/transforms/mix.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | 4 | 5 | class SpatialMixup(object): 6 | def __init__(self, alpha=0.2, trace=True, version=2): 7 | self.alpha = alpha 8 | self.trace = trace 9 | self.version = version 10 | 11 | def 
mixup_data(self, x): 12 | """ 13 | return mixed inputs. pairs of targets 14 | """ 15 | b, t, c, h, w = x.size() 16 | loss_prob = random.random() * self.alpha 17 | if self.trace: 18 | mixed_x = x 19 | else: 20 | mixed_x = torch.zeros_like(x) 21 | for i in range(b): 22 | tmp = (i+1) % b 23 | img_index = random.randint(0, t-1) 24 | for j in range(t): 25 | mixed_x[i, j, :, :, :] = (1-loss_prob) * x[i, j, :, :, :] + loss_prob * x[tmp, img_index, :, :, :] 26 | return mixed_x 27 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/msvdqa_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import MSVDQADataset 2 | from .datamodule_base import BaseDataModule 3 | from collections import defaultdict 4 | 5 | 6 | class MSVDQADataModule(BaseDataModule): 7 | def __init__(self, *args, **kwargs): 8 | super().__init__(*args, **kwargs) 9 | 10 | @property 11 | def dataset_cls(self): 12 | return MSVDQADataset 13 | 14 | @property 15 | def dataset_name(self): 16 | return "msvdqa" 17 | 18 | def setup(self, stage): 19 | super().setup(stage) 20 | self.answer2id = self.train_dataset.ans_lab_dict 21 | sorted_a2i = sorted(self.answer2id.items(), key=lambda x: x[1]) 22 | self.num_class = max(self.answer2id.values()) + 1 23 | self.id2answer = defaultdict(lambda: "unknown") 24 | for k, v in sorted_a2i: 25 | self.id2answer[v] = k 26 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/msrvttqa_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import MSRVTTQADataset 2 | from .datamodule_base import BaseDataModule 3 | from collections import defaultdict 4 | 5 | 6 | class MSRVTTQADataModule(BaseDataModule): 7 | def __init__(self, *args, **kwargs): 8 | super().__init__(*args, **kwargs) 9 | 10 | @property 11 | def dataset_cls(self): 12 | return MSRVTTQADataset 13 | 14 | @property 15 | def dataset_name(self): 16 | return "msrvttqa" 17 | 18 | def setup(self, stage): 19 | super().setup(stage) 20 | self.answer2id = self.train_dataset.ans_lab_dict 21 | sorted_a2i = sorted(self.answer2id.items(), key=lambda x: x[1]) 22 | self.num_class = max(self.answer2id.values()) + 1 23 | self.id2answer = defaultdict(lambda: "unknown") 24 | for k, v in sorted_a2i: 25 | self.id2answer[v] = k 26 | -------------------------------------------------------------------------------- /univl/utils/metrics.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import numpy as np 7 | 8 | def compute_metrics(x): 9 | print("metrics") 10 | 11 | sx = np.sort(-x, axis=1) 12 | d = np.diag(-x) 13 | d = d[:, np.newaxis] 14 | ind = sx - d 15 | ind = np.where(ind == 0) 16 | ind = ind[1] 17 | metrics = {} 18 | metrics['R1'] = float(np.sum(ind == 0)) / len(ind) 19 | metrics['R5'] = float(np.sum(ind < 5)) / len(ind) 20 | metrics['R10'] = float(np.sum(ind < 10)) / len(ind) 21 | metrics['MR'] = np.median(ind) + 1 22 | return metrics 23 | 24 | def print_computed_metrics(metrics): 25 | r1 = metrics['R1'] 26 | r5 = metrics['R5'] 27 | r10 = metrics['R10'] 28 | mr = metrics['MR'] 29 | print('R@1: {:.4f} - R@5: {:.4f} - R@10: {:.4f} - Median R: {}'.format(r1, r5, r10, mr)) 
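A minimal usage sketch for the two helpers above (an editor's illustration, not a file from this repository). `compute_metrics` expects a square similarity matrix whose rows are text queries and whose columns are videos, and, as the `np.diag` call implies, the ground-truth video for query `i` is assumed to sit on the diagonal. The toy matrix and the import path are hypothetical; the import assumes you run from the `univl/` directory.

```python
import numpy as np
from utils.metrics import compute_metrics, print_computed_metrics  # assumed import path

# Hypothetical 3x3 text-to-video similarity matrix; entry [i, j] scores query i against video j.
sim = np.array([
    [0.9, 0.2, 0.1],   # query 0: its own video is ranked 1st (rank 0)
    [0.1, 0.6, 0.8],   # query 1: its own video is ranked 2nd (rank 1)
    [0.0, 0.4, 0.7],   # query 2: its own video is ranked 1st (rank 0)
])

metrics = compute_metrics(sim)   # ranks of the diagonal entries are [0, 1, 0]
print_computed_metrics(metrics)  # R@1: 0.6667 - R@5: 1.0000 - R@10: 1.0000 - Median R: 1.0
```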
-------------------------------------------------------------------------------- /allinone/move_pretrained_weight.py: -------------------------------------------------------------------------------- 1 | import os 2 | print('move pretrained weights...') 3 | try: 4 | # v100 machines 5 | if not os.path.exists('~/.cache/torch/hub/checkpoints/'): 6 | os.makedirs('~/.cache/torch/hub/checkpoints/') 7 | os.system( 8 | 'cp pretrained/*.pth ~/.cache/torch/hub/checkpoints/.') 9 | except Exception as e: 10 | print(e) 11 | try: 12 | # v100 machines 13 | if not os.path.exists('/usr/local/app/.cache/torch/hub/checkpoints/'): 14 | os.makedirs('/usr/local/app/.cache/torch/hub/checkpoints') 15 | os.system( 16 | 'cp pretrained/*.pth /usr/local/app/.cache/torch/hub/checkpoints/.') 17 | except Exception as e: 18 | print(e) 19 | try: 20 | # a100 machines 21 | if not os.path.exists('/root/.cache/torch/hub/checkpoints/'): 22 | os.makedirs('/root/.cache/torch/hub/checkpoints/') 23 | os.system( 24 | 'cp pretrained/*.pth /root/.cache/torch/hub/checkpoints/.') 25 | print('move finished...') 26 | except Exception as e: 27 | print(e) -------------------------------------------------------------------------------- /univl/modules/optimization_MELTR.py: -------------------------------------------------------------------------------- 1 | from torch.nn.utils import clip_grad_norm_ 2 | from modules.modeling_MELTR import MELTRgrad 3 | 4 | class MELTROptimizer: 5 | 6 | def __init__(self, meta_optimizer, max_grad_norm=10): 7 | self.meta_optimizer = meta_optimizer 8 | self.hypergrad = MELTRgrad() 9 | 10 | self.max_grad_norm = max_grad_norm 11 | 12 | def step(self, train_loss, val_loss, parameters, aux_params): 13 | self.zero_grad() 14 | 15 | hyper_gards = self.hypergrad.grad( 16 | loss_val=val_loss, 17 | loss_train=train_loss, 18 | aux_params=aux_params, 19 | params=parameters, 20 | ) 21 | for p, g in zip(aux_params, hyper_gards): 22 | if g is not None: 23 | p.grad = -g 24 | 25 | if self.max_grad_norm is not None: 26 | clip_grad_norm_(aux_params, max_norm=self.max_grad_norm) 27 | 28 | self.meta_optimizer.step() 29 | def zero_grad(self): 30 | self.meta_optimizer.zero_grad() -------------------------------------------------------------------------------- /violet/dataset.py: -------------------------------------------------------------------------------- 1 | 2 | from lib import * 3 | 4 | class Dataset_Base(T.utils.data.Dataset): 5 | def __init__(self, args): 6 | super().__init__() 7 | 8 | self.args = args 9 | self.tokzr = transformers.BertTokenizerFast.from_pretrained('bert-base-uncased') 10 | 11 | def str2img(self, b): 12 | img = Image.open(io.BytesIO(base64.b64decode(b))).convert('RGB') 13 | w, h = img.size 14 | img = TV.transforms.Compose([TV.transforms.Pad([0, (w-h)//2] if w>h else [(h-w)//2, 0]), 15 | TV.transforms.Resize([self.args['size_img'], self.args['size_img']]), 16 | TV.transforms.ToTensor()])(img) 17 | return img 18 | 19 | def str2txt(self, s): 20 | txt = self.tokzr.encode(s, padding='max_length', max_length=self.args['size_txt'], truncation=True) 21 | mask = [1 if w!=0 else w for w in txt] 22 | txt, mask = np.array(txt, dtype=np.int64), np.array(mask, dtype=np.int64) 23 | return txt, mask 24 | -------------------------------------------------------------------------------- /univl/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Microsoft 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this 
software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 MLV Lab (Machine Learning and Vision Lab at Korea University) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/coco_caption_karpathy_dataset.py: -------------------------------------------------------------------------------- 1 | from .base_dataset import BaseDataset 2 | 3 | 4 | class CocoCaptionKarpathyDataset(BaseDataset): 5 | def __init__(self, *args, split="", **kwargs): 6 | assert split in ["train", "val", "test"] 7 | self.split = split 8 | 9 | if split == "train": 10 | names = ["coco_caption_karpathy_train", "coco_caption_karpathy_restval"] 11 | elif split == "val": 12 | names = ["coco_caption_karpathy_val"] 13 | # names = ["coco_caption_karpathy_test"] 14 | # names = [] # for fast train 15 | elif split == "test": 16 | names = ["coco_caption_karpathy_test"] 17 | # names = [] 18 | 19 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 20 | 21 | def __getitem__(self, index): 22 | suite = self.get_suite(index) 23 | 24 | if "test" in self.split: 25 | _index, _question_index = self.index_mapper[index] 26 | iid = self.table["image_id"][_index].as_py() 27 | iid = int(iid.split(".")[0].split("_")[-1]) 28 | suite.update({"iid": iid}) 29 | 30 | return suite 31 | -------------------------------------------------------------------------------- /violet/tools/extract_video-frame.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse, av, base64, io, pickle 3 | 4 | from glob import glob 5 | from tqdm import tqdm 6 | 7 | def get_args(): 8 | parser = argparse.ArgumentParser() 9 | 10 | parser.add_argument('--sample', required=True, type=int) 11 | 12 | args = parser.parse_args() 13 | 14 | return args 15 | 16 | if __name__=='__main__': 17 | args = get_args() 18 | 19 | lst = glob('/hub_data2/dohwan/MSVD/videos/*.avi') 20 | 21 | pkl = {} 22 | for f in tqdm(lst, ascii=True): 23 | vid = f.split('/')[-1].replace('.avi', '') 24 | 25 | imgs = [] 26 | for pack in av.open(f).demux(): 27 | for buf in pack.decode(): 28 | if str(type(buf))=="<class 'av.video.frame.VideoFrame'>": 29 | imgs.append(buf.to_image().convert('RGB')) 30 | N = len(imgs)/(args.sample+1) 31 | 32 | pkl[vid] = [] 33 | for i in range(args.sample): 34 | buf = io.BytesIO() 35 | imgs[int(N*(i+1))].save(buf, format='JPEG') 36 | pkl[vid].append(str(base64.b64encode(buf.getvalue()))[2:-1]) 37 | pickle.dump(pkl, open('msvd.pkl', 'wb')) 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MELTR: Meta Loss Transformer for Learning to Fine-tune Video Foundation Models 2 | 3 | This is the official implementation of MELTR (CVPR 2023). ([arxiv](https://arxiv.org/abs/2303.13009)) 4 | 5 | > Dohwan Ko1*, Joonmyung Choi1*, Hyeong Kyu Choi1, Kyoung-Woon On2, Byungseok Roh2, Hyunwoo J. Kim1. 6 | > 7 | > 1Korea University 2Kakao Brain 8 | 9 | 10 | 11 |
12 | 13 |
14 | 15 | 16 | ## Code Repositories 17 | * [UniVL + MELTR](https://github.com/mlvlab/MELTR/tree/master/univl) 18 | 19 | * [Violet + MELTR](https://github.com/mlvlab/MELTR/tree/master/violet) 20 | 21 | * [All-in-one + MELTR](https://github.com/mlvlab/MELTR/tree/master/allinone) 22 | 23 | 24 | 25 | ## Citation 26 | 27 | ``` 28 | @inproceedings{ko2023meltr, 29 | title={MELTR: Meta Loss Transformer for Learning to Fine-tune Video Foundation Models}, 30 | author={Ko, Dohwan and Choi, Joonmyung and Choi, Hyeong Kyu and On, Kyoung-Woon and Roh, Byungseok and Kim, Hyunwoo J}, 31 | booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, 32 | year={2023} 33 | } 34 | ``` -------------------------------------------------------------------------------- /violet/agent.py: -------------------------------------------------------------------------------- 1 | from meltr import MELTROptimizer 2 | from lib import * 3 | 4 | class Agent_Base: 5 | def __init__(self, args, model): 6 | super().__init__() 7 | 8 | self.args, self.model = args, model 9 | 10 | self.loss_func = T.nn.CrossEntropyLoss(ignore_index=-1).cuda() 11 | self.optzr = T.optim.AdamW(self.model.parameters(), lr=args['lr'], betas=(0.9, 0.98), weight_decay=args['decay']) 12 | self.scaler = T.cuda.amp.GradScaler() 13 | 14 | class Agent_Base_MELTR: 15 | def __init__(self, args, model, aux_model=None): 16 | super().__init__() 17 | 18 | self.args, self.model, self.aux_model = args, model, aux_model 19 | 20 | self.loss_func = T.nn.CrossEntropyLoss(ignore_index=-1).cuda() 21 | self.optzr = T.optim.AdamW(self.model.parameters(), lr=args['lr'], betas=(0.9, 0.98), weight_decay=args['decay']) 22 | self.scaler = T.cuda.amp.GradScaler() 23 | 24 | if aux_model is not None: 25 | self.aux_optzr = T.optim.AdamW(self.aux_model.parameters(), lr=args['meltr_lr'], betas=(0.9, 0.98), weight_decay=args['meltr_decay']) 26 | self.meta_optim = MELTROptimizer(meta_optimizer=self.aux_optzr, max_grad_norm=args['max_grad_norm']) -------------------------------------------------------------------------------- /violet/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class AverageMeter: 4 | ''' Computes and stores the average and current value. 
''' 5 | def __init__(self) -> None: 6 | self.reset() 7 | 8 | def reset(self) -> None: 9 | self.val = 0.0 10 | self.avg = 0.0 11 | self.sum = 0.0 12 | self.count = 0 13 | 14 | def update(self, val: float, n: int = 1) -> None: 15 | if type(val) == torch.Tensor: 16 | val = float(val.detach().cpu().data) 17 | self.val = val 18 | self.sum += val * n 19 | self.count += n 20 | self.avg = self.sum / self.count 21 | def sample(self): 22 | return "\ 23 | end = time.time() \n\ 24 | batch_time = AverageMeter() \n\ 25 | batch_time.update(time.time() - end) \n\ 26 | end = time.time() \n\ 27 | avg_score = AverageMeter()\n\ 28 | accuracy = 0.1\n\ 29 | avg_score.update(accuracy)\n\ 30 | losses = AverageMeter()\n\ 31 | loss = 0\n\ 32 | batch_size = 128\n\ 33 | losses.update(loss.data.item(), batch_size)\n\ 34 | print(f'time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'\n\ 35 | f'loss {losses.val:.4f} ({losses.avg:.4f})\t' \n\ 36 | f'acc {avg_score.val:.4f} ({avg_score.avg:.4f})')" -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # == pretrain data 2 | # = image 3 | from .vg_caption_dataset import VisualGenomeCaptionDataset 4 | from .coco_caption_karpathy_dataset import CocoCaptionKarpathyDataset 5 | from .sbu_caption_dataset import SBUCaptionDataset 6 | from .cc3m import CC3MDataset 7 | # = video 8 | from .webvid import WEBVIDDataset 9 | from .howto100m import HT100MDataset 10 | from .yttemporal import YTTemporalDataset 11 | # == downstream data 12 | # = image 13 | from .f30k_caption_karpathy_dataset import F30KCaptionKarpathyDataset 14 | from .vqav2_dataset import VQAv2Dataset 15 | from .nlvr2_dataset import NLVR2Dataset 16 | # = video 17 | from .msrvtt import MSRVTTDataset 18 | from .msrvttqa import MSRVTTQADataset 19 | from .msrvtt_choice import MSRVTTChoiceDataset 20 | from .msvd import MSVDDataset 21 | from .lsmdc_dataset import LSMDCDataset 22 | from .msvdqa import MSVDQADataset 23 | from .vcr import VCRDataset 24 | from .ego4d import Ego4DDataset 25 | from .tvqa import TVQADataset 26 | from .lsmdc_choice import LSMDCChoiceDataset 27 | from .ego4d_choice import EGO4DChoiceDataset 28 | from .tgif import TGIFDataset 29 | from .tgifqa import TGIFQADataset 30 | from .didemo import DIDEMODataset 31 | from .hmdb51 import HMDB51Dataset 32 | from .k400 import K400Dataset 33 | from .activitynet import ActivityNetDataset -------------------------------------------------------------------------------- /allinone/README.md: -------------------------------------------------------------------------------- 1 | 2 | # All-in-one + MELTR 3 | 4 | 5 | ## Preparation 6 | ### Requirements 7 | 8 | Our code is implemented under [All-in-one](https://github.com/showlab/all-in-one) environment with PyTorch 1.10+. 9 | 10 | ### Datasets 11 | 12 | We use MSRVTT for text-to-video retrieval and All-in-one also provides downstream datasets [here](https://github.com/showlab/all-in-one/blob/main/DATA.md). 13 | 14 | Annotation files of MSRVTT can be found [here](https://drive.google.com/drive/folders/1nXWGRKjm6fwYly4YCgdKu7XtV2IXGUix). 15 | 16 | ### Pretrained checkpoint 17 | 18 | You can download the pretrained checkpoint of All-in-one [here](https://drive.google.com/file/d/1Yd2lKppaduqG_RO1gCA6OpAfB0_IXDoX/view?usp=sharing). 
19 | 20 | Then, place the files as follows: 21 | 22 | ``` 23 | data 24 | |─ msrvtt 25 | │ └─ videos 26 | | | │─ video0.mp4 27 | | | : 28 | │ │─ train_list_9k.txt 29 | | │─ train_list_7k.txt 30 | | │─ val_list_jsfusion.txt 31 | | │─ MSR_VTT.json 32 | | │─ jsfusion_val_caption_idx.pkl 33 | 34 | checkpoint 35 | |─ all-in-one-plus-224.ckpt 36 | ``` 37 | 38 | 39 | 40 | ## Training & Evaluation 41 | 42 | ``` 43 | python run.py with \ 44 | data_root=./data/msrvtt num_gpus=8 num_nodes=1 \ 45 | per_gpu_batchsize=16 msrvtt_retrieval_MELTR \ 46 | num_frames=3 \ 47 | load_path="./checkpoint/all-in-one-plus-224.ckpt" 48 | ``` 49 | 50 | 51 | 52 | 53 | ## Acknowledgement 54 | This repo is built upon [All-in-one](https://github.com/showlab/all-in-one). 55 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/vqav2_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import VQAv2Dataset 2 | from .datamodule_base import BaseDataModule 3 | from collections import defaultdict 4 | 5 | 6 | class VQAv2DataModule(BaseDataModule): 7 | def __init__(self, *args, **kwargs): 8 | super().__init__(*args, **kwargs) 9 | 10 | @property 11 | def dataset_cls(self): 12 | return VQAv2Dataset 13 | 14 | @property 15 | def dataset_name(self): 16 | return "vqa" 17 | 18 | def setup(self, stage): 19 | super().setup(stage) 20 | 21 | train_answers = self.train_dataset.table["answers"].to_pandas().tolist() 22 | val_answers = self.val_dataset.table["answers"].to_pandas().tolist() 23 | train_labels = self.train_dataset.table["answer_labels"].to_pandas().tolist() 24 | val_labels = self.val_dataset.table["answer_labels"].to_pandas().tolist() 25 | 26 | all_answers = [c for c in train_answers + val_answers if c is not None] 27 | all_answers = [l for lll in all_answers for ll in lll for l in ll] 28 | all_labels = [c for c in train_labels + val_labels if c is not None] 29 | all_labels = [l for lll in all_labels for ll in lll for l in ll] 30 | 31 | self.answer2id = {k: v for k, v in zip(all_answers, all_labels)} 32 | sorted_a2i = sorted(self.answer2id.items(), key=lambda x: x[1]) 33 | self.num_class = max(self.answer2id.values()) + 1 34 | self.id2answer = defaultdict(lambda: "unknown") 35 | for k, v in sorted_a2i: 36 | self.id2answer[v] = k 37 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/didemo.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset 2 | import os 3 | import pandas as pd 4 | 5 | # some videos are missed, for better results, do IO exception. 
6 | 7 | 8 | class DIDEMODataset(BaseDataset): 9 | def __init__(self, *args, split="", **kwargs): 10 | assert split in ["train", "val", "test"] 11 | self.split = split 12 | self.metadata = None 13 | if split == "train": 14 | names = ["didemo_train"] 15 | elif split == "val": 16 | names = ["didemo_val"] 17 | elif split == "test": 18 | names = ["didemo_val"] 19 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 20 | self._load_metadata() 21 | 22 | def _load_metadata(self): 23 | metadata_dir = './meta_data/didemo' 24 | split_files = { 25 | 'train': 'DiDeMo_train.tsv', 26 | 'val': 'DiDeMo_val.tsv', # there is no test 27 | 'test': 'DiDeMo_test.tsv' 28 | } 29 | target_split_fp = split_files[self.split] 30 | metadata = pd.read_csv(os.path.join(metadata_dir, target_split_fp), sep='\t') 31 | self.metadata = metadata 32 | print("load split {}, {} samples".format(self.split, len(metadata))) 33 | 34 | def _get_video_path(self, sample): 35 | rel_video_fp = sample[1] 36 | full_video_fp = os.path.join(self.data_dir, 'video', rel_video_fp) 37 | return full_video_fp, rel_video_fp 38 | 39 | def _get_caption(self, sample): 40 | return sample[0] 41 | 42 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/vqav2_dataset.py: -------------------------------------------------------------------------------- 1 | from .base_dataset import BaseDataset 2 | 3 | 4 | class VQAv2Dataset(BaseDataset): 5 | def __init__(self, *args, split="", **kwargs): 6 | assert split in ["train", "val", "test"] 7 | self.split = split 8 | 9 | if split == "train": 10 | names = ["vqav2_train", "vqav2_trainable_val"] 11 | elif split == "val": 12 | names = ["vqav2_rest_val"] 13 | elif split == "test": 14 | names = ["vqav2_test"] # vqav2_test-dev for test-dev 15 | 16 | super().__init__( 17 | *args, 18 | **kwargs, 19 | names=names, 20 | text_column_name="questions", 21 | remove_duplicate=False, 22 | ) 23 | 24 | def __getitem__(self, index): 25 | image_tensor = self.get_image(index)["image"] 26 | text = self.get_text(index)["text"] 27 | 28 | index, question_index = self.index_mapper[index] 29 | qid = self.table["question_id"][index][question_index].as_py() 30 | 31 | if self.split != "test": 32 | answers = self.table["answers"][index][question_index].as_py() 33 | labels = self.table["answer_labels"][index][question_index].as_py() 34 | scores = self.table["answer_scores"][index][question_index].as_py() 35 | else: 36 | answers = list() 37 | labels = list() 38 | scores = list() 39 | 40 | return { 41 | "image": image_tensor, 42 | "text": text, 43 | "vqa_answer": answers, 44 | "vqa_labels": labels, 45 | "vqa_scores": scores, 46 | "qid": qid, 47 | } 48 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/nlvr2_dataset.py: -------------------------------------------------------------------------------- 1 | from .base_dataset import BaseDataset 2 | import sys 3 | import random 4 | 5 | 6 | class NLVR2Dataset(BaseDataset): 7 | def __init__(self, *args, split="", **kwargs): 8 | assert split in ["train", "val", "test"] 9 | self.split = split 10 | 11 | if split == "train": 12 | names = ["nlvr2_train"] 13 | elif split == "val": 14 | names = ["nlvr2_dev", "nlvr2_test1"] 15 | elif split == "test": 16 | names = ["nlvr2_dev", "nlvr2_test1"] 17 | 18 | super().__init__( 19 | *args, 20 | **kwargs, 21 | names=names, 22 | text_column_name="questions", 23 | remove_duplicate=False, 24 | ) 25 | 26 | def __getitem__(self, index): 27 | 
result = None 28 | while result is None: 29 | try: 30 | image_tensor_0 = self.get_image(index, image_key="image_0")["image"] 31 | image_tensor_1 = self.get_image(index, image_key="image_1")["image"] 32 | text = self.get_text(index)["text"] 33 | result = True 34 | except: 35 | print( 36 | f"error while read file idx {index} in {self.names[0]}", 37 | file=sys.stderr, 38 | ) 39 | index = random.randint(0, len(self.index_mapper) - 1) 40 | 41 | index, question_index = self.index_mapper[index] 42 | answers = self.table["answers"][index][question_index].as_py() 43 | answers = answers == "True" 44 | 45 | return { 46 | "image_0": image_tensor_0, 47 | "image_1": image_tensor_1, 48 | "text": text, 49 | "answers": answers, 50 | "table_name": self.table_names[index], 51 | } 52 | -------------------------------------------------------------------------------- /allinone/AllInOne/transforms/utils.py: -------------------------------------------------------------------------------- 1 | from torchvision import transforms 2 | from PIL import Image 3 | 4 | 5 | class MinMaxResize: 6 | def __init__(self, shorter=800, longer=1333): 7 | self.min = shorter 8 | self.max = longer 9 | 10 | def __call__(self, x): 11 | w, h = x.size 12 | scale = self.min / min(w, h) 13 | if h < w: 14 | newh, neww = self.min, scale * w 15 | else: 16 | newh, neww = scale * h, self.min 17 | 18 | if max(newh, neww) > self.max: 19 | scale = self.max / max(newh, neww) 20 | newh = newh * scale 21 | neww = neww * scale 22 | 23 | newh, neww = int(newh + 0.5), int(neww + 0.5) 24 | newh, neww = newh // 32 * 32, neww // 32 * 32 25 | 26 | return x.resize((neww, newh), resample=Image.BICUBIC) 27 | 28 | 29 | class UnNormalize(object): 30 | def __init__(self, mean, std): 31 | self.mean = mean 32 | self.std = std 33 | 34 | def __call__(self, tensor): 35 | """ 36 | Args: 37 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 38 | Returns: 39 | Tensor: Normalized image. 
40 | """ 41 | for t, m, s in zip(tensor, self.mean, self.std): 42 | t.mul_(s).add_(m) 43 | # The normalize code -> t.sub_(m).div_(s) 44 | return tensor 45 | 46 | 47 | # This is simple maximum entropy normalization performed in Inception paper 48 | inception_normalize = transforms.Compose( 49 | [transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])] 50 | ) 51 | 52 | # ViT uses simple non-biased inception normalization 53 | # https://github.com/google-research/vision_transformer/blob/master/vit_jax/input_pipeline.py#L132 54 | inception_unnormalize = transforms.Compose( 55 | [UnNormalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])] 56 | ) 57 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/msvd.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset 2 | import random 3 | import os 4 | import pandas as pd 5 | 6 | 7 | class MSVDDataset(BaseDataset): 8 | def __init__(self, *args, split="", **kwargs): 9 | assert split in ["train", "val", "test"] 10 | self.split = split 11 | self.metadata = None 12 | if split == "train": 13 | names = ["msvd_train"] 14 | elif split == "val": 15 | names = ["msvd_val"] 16 | elif split == "test": 17 | names = ["msvd_test"] 18 | self._load_metadata() 19 | # self.num_frames = kwargs['num_frames'] 20 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 21 | 22 | def _load_metadata(self): 23 | metadata_dir = './meta_data/msvd' 24 | split_files = { 25 | 'train': 'MSVD_train.tsv', 26 | 'val': 'MSVD_test.tsv', # MSVD_val.tsv 27 | 'test': 'MSVD_test.tsv' 28 | } 29 | target_split_fp = split_files[self.split] 30 | metadata = pd.read_csv(os.path.join(metadata_dir, target_split_fp), sep='\t') 31 | self.metadata = metadata 32 | print("load split {}, {} samples".format(self.split, len(metadata))) 33 | 34 | def _get_video_path(self, sample): 35 | rel_video_fp = sample[1] + '.avi' 36 | full_video_fp = os.path.join(self.data_dir, 'YouTubeClips', rel_video_fp) 37 | return full_video_fp, rel_video_fp 38 | 39 | def _get_caption(self, sample): 40 | if self.split == 'train': 41 | words = sample[0].split(',') 42 | num_word = len(words) 43 | index = random.randint(0, num_word - 1) 44 | caption = words[index] 45 | else: 46 | # caption = sample[0] 47 | words = sample[0].split(',') 48 | num_word = len(words) 49 | index = random.randint(0, num_word - 1) 50 | caption = words[index] 51 | return caption 52 | -------------------------------------------------------------------------------- /allinone/AllInOne/transforms/videoaug.py: -------------------------------------------------------------------------------- 1 | # input: (C, T, H, W) output: (C, T, H, W) 2 | def VideoTransform(mode='train', crop_size=224, backend='v100'): 3 | if backend == 'a100': 4 | print("initalize data augmentation for a100 gpus") 5 | import AllInOne.transforms.video_transform as video_transform 6 | from torchvision import transforms 7 | # https://github.com/FingerRec/BE/blob/main/src/Contrastive/augment/video_transformations/volume_transforms.py 8 | if mode == 'train': 9 | data_transforms = transforms.Compose([ 10 | video_transform.TensorToNumpy(), 11 | video_transform.Resize(int(crop_size*1.2)), # 256/224 = 1.14 12 | video_transform.RandomCrop(crop_size), 13 | # video_transform.ColorJitter(0.5, 0.5, 0.25, 0.5), # color operation perimitted, damage attribute 14 | video_transform.ClipToTensor(channel_nb=3), 15 | video_transform.Normalize(mean=[0.485, 0.456, 
0.406], std=[0.229, 0.224, 0.225]) 16 | ]) 17 | else: 18 | data_transforms = transforms.Compose([ 19 | video_transform.TensorToNumpy(), 20 | video_transform.Resize(int(crop_size*1.2)), # 256 21 | video_transform.CenterCrop(crop_size), # 224 22 | video_transform.ClipToTensor(channel_nb=3), 23 | video_transform.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 24 | ]) 25 | return data_transforms 26 | else: 27 | # for pytorch > 1.9.0, V100 28 | import pytorchvideo.transforms as video_transforms 29 | # https://pytorchvideo.readthedocs.io/en/latest/api/transforms/transforms.html 30 | return video_transforms.create_video_transform(mode=mode, min_size=int(crop_size*1.2), 31 | max_size=int(crop_size*1.5), 32 | crop_size=crop_size, 33 | aug_type='randaug', # randaug/augmix 34 | num_samples=None) # not use temporal sub sampling -------------------------------------------------------------------------------- /violet/tools/extract_vq.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse, base64, io, pickle 3 | from glob import glob 4 | 5 | from tqdm import tqdm 6 | 7 | import numpy as np 8 | import torch as T 9 | import torchvision as TV 10 | from dall_e import map_pixels, unmap_pixels, load_model 11 | 12 | from PIL import Image 13 | 14 | def get_args(): 15 | parser = argparse.ArgumentParser() 16 | 17 | parser.add_argument('--path', required=True, type=str) 18 | parser.add_argument('--frame', required=True, type=int) 19 | 20 | args = parser.parse_args() 21 | 22 | return args 23 | 24 | def proc_buf(buf, _F): 25 | img = Image.open(io.BytesIO(base64.b64decode(buf))) 26 | w, h = img.size 27 | img = TV.transforms.Compose([TV.transforms.Pad([0, (w-h)//2] if w>h else [(h-w)//2, 0]), 28 | TV.transforms.Resize([_F, _F]), 29 | TV.transforms.ToTensor()])(img).unsqueeze(0) 30 | img = map_pixels(img) 31 | return img 32 | 33 | if __name__=='__main__': 34 | args = get_args() 35 | 36 | dalle_enc = load_model('encoder.pkl', T.device('cpu')).cuda() # https://cdn.openai.com/dall-e/encoder.pkl 37 | # dalle_dec = load_model('decoder.pkl', T.device('cpu')).cuda() # https://cdn.openai.com/dall-e/decoder.pkl 38 | 39 | 40 | lst = glob(f'{args.path}/pickles/*.pkl') 41 | pickle_list = [] 42 | for file in tqdm(lst): 43 | pickle_list.append(pickle.load(open(f'{file}', 'rb'))) 44 | 45 | for pkl in tqdm(pickle_list): 46 | vq = {} 47 | for vid in pkl: 48 | imgs = [proc_buf(b, int(args.frame//32*8)) for b in pkl[vid]] 49 | imgs = T.cat(imgs, dim=0) 50 | 51 | z = dalle_enc(imgs.cuda()) 52 | z = T.argmax(z, dim=1) 53 | vq[vid] = z.data.cpu().numpy().astype(np.int16) 54 | 55 | '''o = T.nn.functional.one_hot(z, num_classes=dalle_enc.vocab_size).permute(0, 3, 1, 2).float() 56 | o = dalle_dec(o).float() 57 | rec = unmap_pixels(T.sigmoid(o[:, :3])) 58 | rec = [TV.transforms.ToPILImage(mode='RGB')(r) for r in rec]''' 59 | pickle.dump(vq, open(f'{args.path}/vq/{vid}_vq.pkl', 'wb')) 60 | -------------------------------------------------------------------------------- /allinone/AllInOne/modules/heads.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from transformers.models.bert.modeling_bert import BertPredictionHeadTransform 6 | 7 | 8 | class Pooler(nn.Module): 9 | def __init__(self, hidden_size): 10 | super().__init__() 11 | self.dense = nn.Linear(hidden_size, hidden_size) 12 | self.activation = nn.Tanh() 13 | 14 | def forward(self, hidden_states): 15 | 
# print(hidden_states.size()) # 64 x 237 x 768 16 | first_token_tensor = hidden_states[:, 0] 17 | pooled_output = self.dense(first_token_tensor) 18 | pooled_output = self.activation(pooled_output) 19 | return pooled_output 20 | 21 | 22 | class ITMHead(nn.Module): 23 | def __init__(self, hidden_size): 24 | super().__init__() 25 | self.fc = nn.Linear(hidden_size, 2) 26 | 27 | def forward(self, x): 28 | x = self.fc(x) 29 | return x 30 | 31 | 32 | class MLMHead(nn.Module): 33 | def __init__(self, config, weight=None): 34 | super().__init__() 35 | self.transform = BertPredictionHeadTransform(config) 36 | self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 37 | self.bias = nn.Parameter(torch.zeros(config.vocab_size)) 38 | if weight is not None: 39 | self.decoder.weight = weight 40 | 41 | def forward(self, x): 42 | x = self.transform(x) 43 | x = self.decoder(x) + self.bias 44 | return x 45 | 46 | 47 | class MPPHead(nn.Module): 48 | def __init__(self, config): 49 | super().__init__() 50 | self.transform = BertPredictionHeadTransform(config) 51 | self.decoder = nn.Linear(config.hidden_size, 256 * 3) 52 | 53 | def forward(self, x): 54 | x = self.transform(x) 55 | x = self.decoder(x) 56 | return x 57 | 58 | 59 | class MLP(nn.Module): 60 | def __init__(self, hidden_size): 61 | super().__init__() 62 | self.fc = nn.Sequential( 63 | nn.Linear(hidden_size, 128), 64 | nn.GELU(), 65 | nn.Linear(128, 1)) 66 | 67 | # self.fc = nn.Linear(hidden_size, 1) 68 | 69 | def forward(self, x): 70 | 71 | x = self.fc(x.squeeze(-1)) 72 | return x -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/lsmdc_dataset.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset 2 | import random 3 | import os 4 | import pandas as pd 5 | from AllInOne.transforms.videoaug import VideoTransform 6 | 7 | 8 | class LSMDCDataset(BaseDataset): 9 | def __init__(self, *args, split="", **kwargs): 10 | assert split in ["train", "val", "test"] 11 | self.split = split 12 | self.metadata = None 13 | if split == "train": 14 | names = ["lsmdc_train"] 15 | elif split == "val": 16 | names = ["lsmdc_val"] 17 | elif split == "test": 18 | names = ["lsmdc_test"] 19 | self._load_metadata() 20 | # self.num_frames = kwargs['num_frames'] 21 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 22 | 23 | def _load_metadata(self): 24 | metadata_dir = './meta_data/lsmdc' 25 | split_files = { 26 | 'train': 'LSMDC16_annos_training.csv', 27 | 'val': 'LSMDC16_challenge_1000_publictect.csv', # LSMDC16_annos_val.csv 28 | 'test': 'LSMDC16_challenge_1000_publictect.csv' 29 | } 30 | target_split_fp = split_files[self.split] 31 | metadata = pd.read_csv(os.path.join(metadata_dir, target_split_fp), sep='\t', header=None, error_bad_lines=False) 32 | self.metadata = metadata 33 | print("load split {}, {} samples".format(self.split, len(metadata))) 34 | 35 | def _get_video_path(self, sample): 36 | # e.g. 
3009_BATTLE_LOS_ANGELES_00.03.07.170-00.03.09.675 -> 3009_BATTLE_LOS_ANGELES/3009_BATTLE_LOS_ANGELES_00.03.07.170-00.03.09.675 37 | sub_dir = '_'.join(sample[0].split('_')[:-1]) 38 | rel_video_fp = sample[0] + '.avi' 39 | full_video_fp = os.path.join(self.data_dir, sub_dir, rel_video_fp) 40 | return full_video_fp, rel_video_fp 41 | 42 | def _get_caption(self, sample): 43 | if self.split == 'train': 44 | words = sample[0].split(',') 45 | num_word = len(words) 46 | index = random.randint(0, num_word - 1) 47 | caption = words[index] 48 | else: 49 | # caption = sample[0] 50 | words = sample[0].split(',') 51 | num_word = len(words) 52 | index = random.randint(0, num_word - 1) 53 | caption = words[index] 54 | return caption 55 | -------------------------------------------------------------------------------- /violet/README.md: -------------------------------------------------------------------------------- 1 | # Violet + MELTR 2 | 3 | 4 | ## Preparation 5 | 6 | ### Requirements 7 | 8 | Our code is implemented under [Violet](https://github.com/tsujuifu/pytorch_violet) environment with PyTorch 1.10+. 9 | 10 | ### Datasets 11 | 12 | We use three datasets (MSRVTT, TGIF, and MSVD). Violet also provides downstream datasets and annotation files [here](https://drive.google.com/drive/u/2/folders/1BisJHVUOLeHWmnAeMrCHvy1BP9XBXNkQ). 13 | 14 | Annotation files of msrvtt can be found [here](https://drive.google.com/drive/folders/1rVnRBZ45g96TlTnxbFBBP2AVfOB-Tf3J). 15 | 16 | Download them and run the below command to extract VQ tokens for MVM. 17 | 18 | ``` 19 | cd tools 20 | wget https://cdn.openai.com/dall-e/encoder.pkl # download trained dall-e encoder 21 | python extract_vq.py --path=msrvtt --frame=224 # output: msrvtt_vq.pkl 22 | ``` 23 | 24 | ### Pretrained checkpoint 25 | 26 | You can download the pretrained checkpoint of Violet [here](https://drive.google.com/file/d/1RLbthdRIflxCFjRTcVV5jQJGP30_lNfg/view). 27 | 28 | Then, place the files as follows: 29 | 30 | ``` 31 | data 32 | |─ msrvtt 33 | │ |─ img_msrvtt.pkl 34 | │ │─ msrvtt_vq.pkl 35 | | │─ train_9k.json 36 | | │─ train_7k.json 37 | | │─ test.json 38 | | 39 | |─ tgif 40 | | │─ img_tgif.pkl 41 | | │─ tgif_vq.pkl 42 | | |─ txt_tgif-action.json 43 | | |─ txt_tgif-transition.json 44 | | |─ txt_tgif-frame.json 45 | | 46 | |─ msvd 47 | | │─ img_msvd.pkl 48 | | │─ msvd_vq.pkl 49 | | │─ txt_msvd-qa.json 50 | 51 | checkpoint 52 | |─ ckpt_violet_pretrain.pt 53 | ``` 54 | 55 | 56 | 57 | ## Training & Evaluation 58 | 59 | + Multiple-Choice Question Answering 60 | ``` 61 | python main_qamc.py ./args/args_tgif-action.json 62 | python main_qamc.py ./args/args_tgif-transition.json 63 | ``` 64 | + Open-Ended Question Answering 65 | ``` 66 | python main_qaoe.py ./args/args_msvd-qaoe.json 67 | python main_qaoe.py ./args/args_tgif-frame.json 68 | ``` 69 | + Text-to-Video Retrieval 70 | ``` 71 | python main_retrieval.py ./args/args_msrvtt-retrieval_7k.json 72 | python main_retrieval.py ./args/args_msrvtt-retrieval_9k.json 73 | python eval_retrieval.py ./args/args_msrvtt-retrieval_eval.json 74 | ``` 75 | You may modify 'path_ckpt' of './args/args_msrvtt-retrieval_eval.json' for evaluation. 76 | 77 | 78 | 79 | ## Acknowledgement 80 | 81 | This repo is built upon [Violet](https://github.com/tsujuifu/pytorch_violet). 
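## MELTR meta-update (editor's sketch)

The README and `agent.py` (shown earlier in this listing) reference the MELTR meta-update but do not spell out how it is invoked. The toy below is an editor's sketch, not code from `main_*.py`: it assumes `violet/meltr.py` exposes the same `MELTROptimizer.step(train_loss, val_loss, parameters, aux_params)` interface as the UniVL version (`univl/modules/meltr.py`) listed elsewhere in this repository, and it substitutes tiny linear modules for the real video-language model and the MELTR loss-combining network.

```python
import torch as T
from meltr import MELTROptimizer  # assumes the violet/ directory is on the path

model = T.nn.Linear(4, 1)                    # stand-in for the fine-tuned video-language model
aux_model = T.nn.Linear(3, 1, bias=False)    # stand-in for the MELTR loss-combining network

aux_optzr = T.optim.AdamW(aux_model.parameters(), lr=1e-4)
meta_optim = MELTROptimizer(meta_optimizer=aux_optzr, max_grad_norm=12)

x = T.randn(8, 4)
# Three hypothetical auxiliary task losses, each a function of the model parameters.
task_losses = T.stack([((model(x) - t) ** 2).mean() for t in (0.0, 0.5, 1.0)])
train_loss = aux_model(task_losses).squeeze()            # combined loss depends on aux_model params
val_loss = ((model(T.randn(8, 4)) - 1.0) ** 2).mean()    # primary-task loss on held-out data

# Outer update: differentiate val_loss through the train_loss gradient and step only aux_model.
meta_optim.step(train_loss=train_loss, val_loss=val_loss,
                parameters=list(model.parameters()),
                aux_params=list(aux_model.parameters()))
```

In the released args files this outer step is rate-limited by `auxgrad_every` and clipped with `max_grad_norm`; the inner model update itself is performed by the usual `AdamW` optimizer built in `Agent_Base_MELTR`.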
82 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/ego4d.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset, video_reader, read_large_frames_decord 2 | import torch as th 3 | from torch.utils.data import Dataset 4 | import pandas as pd 5 | import os 6 | import numpy as np 7 | import random 8 | import ffmpeg 9 | import time 10 | import re 11 | import json 12 | from AllInOne.transforms.videoaug import VideoTransform 13 | import cv2 14 | import subprocess 15 | 16 | # {'timestamp_sec': 221.29666, 'narration_text': '#C C walks on the ground'} 17 | 18 | 19 | class Ego4DDataset(BaseDataset): 20 | """EGO4D Video-Text loader.""" 21 | 22 | def __init__(self, *args, split="", **kwargs): 23 | assert split in ["train", "val", "test"] 24 | self.split = split 25 | 26 | if split == "train": 27 | names = ["ego4d_train"] 28 | elif split == "val": 29 | names = ["ego4d_val"] 30 | elif split == "test": 31 | names = ["ego4d_test"] 32 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 33 | 34 | self._load_metadata() 35 | 36 | def _load_metadata(self): 37 | metadata_dir = './meta_data/ego4d' 38 | split_files = { 39 | 'train': 'ego4d_train_subset.csv', 40 | 'val': 'ego4d_val_ts_clean.csv', 41 | 'test': 'ego4d_val_ts_clean.csv' # there is no test 42 | } 43 | target_split_fp = split_files[self.split] 44 | self.metadata = pd.read_csv(os.path.join(metadata_dir, target_split_fp), sep='\t', header=None, error_bad_lines=False) 45 | 46 | def _get_video_path(self, sample): 47 | rel_video_fp = sample[0] + '.mp4' 48 | full_video_fp = os.path.join(self.data_dir, 'videos', rel_video_fp) 49 | if not os.path.exists(full_video_fp): 50 | Exception(IOError) 51 | return full_video_fp, rel_video_fp 52 | 53 | def _get_caption(self, sample): 54 | return sample[6] 55 | 56 | def get_raw_video(self, sample): 57 | abs_fp, rel_fp = self._get_video_path(sample) 58 | # if int(sample[2]) > 600: 59 | # raise Exception("Video is longer than 10m!", rel_fp) 60 | frame_end, frame_loc = int(sample[3]), int(sample[5]) 61 | # imgs = video_reader(abs_fp, frame_loc, frame_end, self.num_frames) 62 | imgs = read_large_frames_decord(abs_fp, frame_loc, frame_end, self.num_frames) 63 | if imgs is None: 64 | raise Exception("Invalid video!", rel_fp) 65 | else: 66 | return imgs 67 | 68 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/__init__.py: -------------------------------------------------------------------------------- 1 | # pretrain dataset 2 | ## video 3 | from .webvid_datamodule import WEBVIDDataModule 4 | from .howto100m_datamodule import HT100MDataModule 5 | from .yttemporal_datamodule import YTTemporalMDataModule 6 | ## image 7 | from .cc3m_datamodule import CC3MDataModule 8 | from .vg_caption_datamodule import VisualGenomeCaptionDataModule 9 | from .coco_caption_karpathy_datamodule import CocoCaptionKarpathyDataModule 10 | from .sbu_datamodule import SBUCaptionDataModule 11 | # finetune dataset 12 | ## image 13 | from .f30k_caption_karpathy_datamodule import F30KCaptionKarpathyDataModule 14 | from .vqav2_datamodule import VQAv2DataModule 15 | from .nlvr2_datamodule import NLVR2DataModule 16 | from .msrvtt_datamodule import MSRVTTDataModule 17 | from .msrvttqa_datamodule import MSRVTTQADataModule 18 | from .msrvtt_choice_datamodule import MSRVTTChoiceDataModule 19 | from .msvd_datamodule import MSVDDataModule 20 | 
from .msvdqa_datamodule import MSVDQADataModule 21 | from .vcr_datamodule import VCRDataModule 22 | ## video 23 | from .ego4d_datamodule import Ego4DDataModule 24 | from .tvqa_datamodule import TVQADataModule 25 | from .lsmdc_choice_datamodule import LSMDCChoiceDataModule 26 | from .ego4d_choice_datamodule import EGO4DChoiceDataModule 27 | from .tgif_datamodule import TGIFDataModule 28 | from .tgifqa_datamodule import TGIFQADataModule 29 | from .didemo_datamodule import DIDEMODataModule 30 | from .hmdb51_datamodule import HMDB51DataModule 31 | from .k400_datamodule import K400DataModule 32 | from .lsmdc_datamodule import LSMDCDataModule 33 | from .activitynet_datamodule import ActivityNetDataModule 34 | 35 | _datamodules = { 36 | "vg": VisualGenomeCaptionDataModule, 37 | "f30k": F30KCaptionKarpathyDataModule, 38 | "coco": CocoCaptionKarpathyDataModule, 39 | "sbu": SBUCaptionDataModule, 40 | "vqa": VQAv2DataModule, 41 | "nlvr2": NLVR2DataModule, 42 | "cc3m": CC3MDataModule, 43 | 'howto100m': HT100MDataModule, 44 | 'webvid': WEBVIDDataModule, 45 | 'msrvtt': MSRVTTDataModule, 46 | 'msrvttqa': MSRVTTQADataModule, 47 | 'msrvtt_choice': MSRVTTChoiceDataModule, 48 | 'msvd': MSVDDataModule, 49 | 'msvdqa': MSVDQADataModule, 50 | 'vcr': VCRDataModule, 51 | 'ego4d': Ego4DDataModule, 52 | 'tvqa': TVQADataModule, 53 | 'lsmdc_choice': LSMDCChoiceDataModule, 54 | 'ego4d_choice': EGO4DChoiceDataModule, 55 | 'yttemporal': YTTemporalMDataModule, 56 | 'tgif': TGIFDataModule, 57 | "tgifqa": TGIFQADataModule, 58 | 'didemo': DIDEMODataModule, 59 | 'hmdb51': HMDB51DataModule, 60 | 'k400': K400DataModule, 61 | 'lsmdc': LSMDCDataModule, 62 | 'activitynet': ActivityNetDataModule 63 | } 64 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/msrvtt.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset 2 | import random 3 | import os 4 | import pandas as pd 5 | import json 6 | import numpy as np 7 | 8 | 9 | class MSRVTTDataset(BaseDataset): 10 | def __init__(self, *args, split="", **kwargs): 11 | assert split in ["train", "val", "test"] 12 | self.split = split 13 | self.metadata = None 14 | self.cut = "7k" 15 | if split == "train": 16 | names = ["msrvtt_train"] 17 | elif split == "val": 18 | names = ["msrvtt_val"] 19 | elif split == "test": 20 | names = ["msrvtt_val"] 21 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 22 | 23 | self._load_metadata() 24 | 25 | def _load_metadata(self): 26 | json_fp = os.path.join(self.data_dir, 'MSR_VTT.json') 27 | with open(json_fp, 'r') as fid: 28 | data = json.load(fid) 29 | df = pd.DataFrame(data['annotations']) 30 | 31 | js_test_cap_idx_path = None 32 | if self.cut == "7k": 33 | train_list_path = "train_list_7k.txt" 34 | elif self.cut == "9k": 35 | train_list_path = "train_list_9k.txt" 36 | test_list_path = "val_list_jsfusion.txt" 37 | js_test_cap_idx_path = "jsfusion_val_caption_idx.pkl" 38 | 39 | 40 | train_df = pd.read_csv(os.path.join(self.data_dir, train_list_path), names=['videoid']) 41 | test_df = pd.read_csv(os.path.join(self.data_dir, test_list_path), names=['videoid']) 42 | self.split_sizes = {'train': len(train_df), 'val': len(test_df), 'test': len(test_df)} 43 | 44 | if self.split == 'train': 45 | df = df[df['image_id'].isin(train_df['videoid'])] 46 | else: 47 | df = df[df['image_id'].isin(test_df['videoid'])] 48 | 49 | self.metadata = df.groupby(['image_id'])['caption'].apply(list) 50 | if 
js_test_cap_idx_path is not None and self.split != 'train': 51 | caps = pd.Series(np.load(os.path.join(self.data_dir, js_test_cap_idx_path), allow_pickle=True)) 52 | new_res = pd.DataFrame({'caps': self.metadata, 'cap_idx': caps}) 53 | new_res['test_caps'] = new_res.apply(lambda x: [x['caps'][x['cap_idx']]], axis=1) 54 | self.metadata = new_res['test_caps'] 55 | 56 | self.metadata = pd.DataFrame({'captions': self.metadata}) 57 | print("load split {}, {} samples".format(self.split, len(self.metadata))) 58 | 59 | # random choice or fixed? 60 | def _get_caption(self, sample): 61 | caption_sample = "rand" 62 | if self.split in ['train', 'val'] and caption_sample == "rand": 63 | caption = random.choice(sample['captions']) 64 | else: 65 | caption = sample['captions'][0] 66 | return caption 67 | 68 | -------------------------------------------------------------------------------- /univl/modules/meltr.py: -------------------------------------------------------------------------------- 1 | from torch.nn import functional as F 2 | from typing import Optional 3 | from torch import Tensor 4 | from torch import nn 5 | import torch 6 | from torch.nn.utils import clip_grad_norm_ 7 | 8 | class MELTRgrad: 9 | def __init__(self): 10 | pass 11 | 12 | def grad(self, loss_val, loss_train, aux_params, params): 13 | 14 | dwdA = torch.autograd.grad( 15 | loss_val, 16 | params, 17 | retain_graph=True, 18 | allow_unused=True 19 | ) 20 | 21 | dwdT = torch.autograd.grad( 22 | loss_train, 23 | params, 24 | create_graph=True, 25 | allow_unused=True 26 | ) 27 | 28 | temp_t, temp_a = [], [] 29 | for t, a in zip(dwdT, dwdA): 30 | if a is None: 31 | continue 32 | temp_t.append(t) 33 | temp_a.append(a) 34 | 35 | v4 = torch.autograd.grad( 36 | tuple(temp_t), 37 | aux_params, 38 | grad_outputs=tuple(temp_a), 39 | allow_unused=True, 40 | ) 41 | return v4 42 | 43 | class MELTROptimizer: 44 | 45 | def __init__(self, meta_optimizer, max_grad_norm=10): 46 | self.meta_optimizer = meta_optimizer 47 | self.hypergrad = MELTRgrad() 48 | 49 | self.max_grad_norm = max_grad_norm 50 | 51 | def step(self, train_loss, val_loss, parameters, aux_params): 52 | self.zero_grad() 53 | 54 | hyper_gards = self.hypergrad.grad( 55 | loss_val=val_loss, 56 | loss_train=train_loss, 57 | aux_params=aux_params, 58 | params=parameters, 59 | ) 60 | for p, g in zip(aux_params, hyper_gards): 61 | if g is not None: 62 | p.grad = -g 63 | 64 | if self.max_grad_norm is not None: 65 | clip_grad_norm_(aux_params, max_norm=self.max_grad_norm) 66 | 67 | self.meta_optimizer.step() 68 | 69 | def zero_grad(self): 70 | self.meta_optimizer.zero_grad() 71 | 72 | class MELTR(nn.Module): 73 | def __init__(self, t_dim, f_dim, i_dim, h1_dim, h2_dim, o_dim): 74 | super(MELTR, self).__init__() 75 | self.task_embedding = nn.Embedding(t_dim, h2_dim) 76 | self.loss_fc1 = nn.Linear(i_dim, h1_dim) 77 | self.activation = nn.ReLU() 78 | self.loss_fc2 = nn.Linear(h1_dim, h2_dim) 79 | 80 | self.encoder = nn.TransformerEncoderLayer(d_model=h2_dim, nhead=8, batch_first=True, dim_feedforward=f_dim) 81 | self.fc1 = nn.Linear(h2_dim, o_dim, bias=False) 82 | 83 | def forward(self, x): 84 | scale_embedding = self.loss_fc2(self.activation(self.loss_fc1(x))) 85 | input = scale_embedding + self.task_embedding.weight 86 | output = self.encoder(input.unsqueeze(0)) 87 | output = self.fc1(output.mean(1)) 88 | return output 89 | -------------------------------------------------------------------------------- /allinone/AllInOne/modules/temporal_roll.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import random 4 | 5 | 6 | class TemporalRoll(nn.Module): 7 | def __init__(self, n_segment=3, n_div=8, v=0): 8 | super(TemporalRoll, self).__init__() 9 | self.n_segment = n_segment 10 | self.fold_div = n_div 11 | self.v = v 12 | 13 | def forward(self, x, layer=1): 14 | # return x 15 | nt, l, c = x.size() 16 | n_batch = nt // self.n_segment 17 | x = x.view(n_batch, self.n_segment, l, c) 18 | if self.v == 0: 19 | # 16, 3, 197, 768 20 | fold = l // self.fold_div 21 | out = torch.zeros_like(x) 22 | # keep cls token 23 | out[:, :, 0] = x[:, :, 0] 24 | # roll left step 1 along time dimension (1) 25 | out[:, :, 1:fold+1] = torch.roll(x[:, :, 1:fold+1], 1, 1) 26 | # roll right step 1 along time dimension (1) 27 | out[:, :, -fold:] = torch.roll(x[:, :, -fold:], -1, 1) 28 | # not roll 29 | out[:, :, 1+fold:-fold] = x[:, :, 1+fold: -fold] 30 | # # 16, 3, 197, 768 31 | # fold = l // self.fold_div 32 | # out = torch.zeros_like(x) 33 | # # roll left step 1 along time dimension (1) 34 | # out[:, :, :fold] = torch.roll(x[:, :, :fold], 1, 1) 35 | # # roll right step 1 along time dimension (1) 36 | # out[:, :, -fold:] = torch.roll(x[:, :, -fold:], -1, 1) 37 | # # not roll 38 | # out[:, :, fold:-fold] = x[:, :, fold: -fold] 39 | # random sampling 40 | elif self.v == 1: 41 | out = torch.zeros_like(x) 42 | roll_token_idexs = random.sample(range(1, l), l//2) 43 | # print(roll_token_idexs) 44 | out = x 45 | out[:, :, roll_token_idexs] = torch.roll(x[:, :, roll_token_idexs], 1, 1) 46 | # roll different tokens for different blocks 47 | elif self.v == 2: 48 | rolled_token_len = l // self.fold_div 49 | fold = rolled_token_len * (layer % self.fold_div) 50 | begin_index = 1 + fold 51 | end_index = min(1 + fold + rolled_token_len, l) 52 | out = torch.zeros_like(x) 53 | out[:, :, 0] = x[:, :, 0] # cls token unchanged 54 | out[:, :, begin_index:] = x[:, :, begin_index:] 55 | out[:, :, begin_index:end_index] = torch.roll(x[:, :, begin_index:end_index], 1, 1) 56 | out[:, :, end_index:] = x[:, :, end_index:] 57 | else: # not roll 58 | fold = c // self.fold_div 59 | out = torch.zeros_like(x) 60 | out[:, :-1, :fold] = x[:, 1:, :fold] # shift left tokens 61 | out[:, 1:, fold: 2 * fold] = x[:, :-1, fold: 2 * fold] # shift right tokens 62 | out[:, :, 2 * fold:] = x[:, :, 2 * fold:] # not shift 63 | return out.view(nt, l, c) -------------------------------------------------------------------------------- /violet/meltr.py: -------------------------------------------------------------------------------- 1 | from lib import * 2 | from torch.nn.utils import clip_grad_norm_ 3 | from torch.nn import functional as F 4 | from typing import Optional, Any 5 | from torch import Tensor 6 | from torch import nn 7 | import torch 8 | 9 | 10 | class MELTRgrad: 11 | def __init__(self): 12 | pass 13 | 14 | def grad(self, loss_val, loss_train, aux_params, params): 15 | dwdA = T.autograd.grad( 16 | loss_val, 17 | params, 18 | retain_graph=True, 19 | allow_unused=True 20 | ) 21 | 22 | dwdT = T.autograd.grad( 23 | loss_train, 24 | params, 25 | create_graph=True, 26 | allow_unused=True 27 | ) 28 | 29 | temp_t, temp_a = [], [] 30 | for t, a in zip(dwdT, dwdA): 31 | if a is None: 32 | continue 33 | temp_t.append(t) 34 | temp_a.append(a) 35 | 36 | v4 = T.autograd.grad( 37 | tuple(temp_t), 38 | aux_params, 39 | grad_outputs=tuple(temp_a), 40 | allow_unused=True 41 | ) 42 | 43 | return v4 44 | 45 | 46 | class MELTROptimizer: 47 | 
def __init__(self, meta_optimizer, hpo_lr, max_grad_norm=10): 48 | self.meta_optimizer = meta_optimizer 49 | self.hypergrad = MELTRgrad() 50 | 51 | self.max_grad_norm = max_grad_norm 52 | 53 | def step(self, train_loss, val_loss, parameters, aux_params): 54 | self.zero_grad() 55 | hyper_grads = self.hypergrad.grad( 56 | loss_val=val_loss, 57 | loss_train=train_loss, 58 | aux_params=aux_params, 59 | params=parameters, 60 | ) 61 | for p, g in zip(aux_params, hyper_grads): 62 | if g is not None: 63 | p.grad = -g 64 | 65 | if self.max_grad_norm is not None: 66 | clip_grad_norm_(aux_params, max_norm=self.max_grad_norm) 67 | 68 | self.meta_optimizer.step() 69 | 70 | 71 | def zero_grad(self): 72 | self.meta_optimizer.zero_grad() 73 | 74 | 75 | class MELTR(nn.Module): 76 | def __init__(self, t_dim, f_dim, i_dim, h1_dim, h2_dim, o_dim): 77 | super(MELTR, self).__init__() 78 | self.task_embedding = nn.Embedding(t_dim, h2_dim) 79 | self.loss_fc1 = nn.Linear(i_dim, h1_dim) 80 | self.activation1 = nn.ReLU() 81 | self.loss_fc2 = nn.Linear(h1_dim, h2_dim) 82 | 83 | self.encoder = nn.TransformerEncoderLayer(d_model=h2_dim, nhead=8, batch_first=True, dim_feedforward=f_dim) 84 | self.fc1 = nn.Linear(h2_dim, o_dim, bias=False) 85 | 86 | def forward(self, x): 87 | scale_embedding = self.loss_fc2(self.activation1(self.loss_fc1(x))) 88 | input = scale_embedding + self.task_embedding.weight 89 | output = self.encoder(input) 90 | output = self.fc1(output.mean(1)) 91 | return output -------------------------------------------------------------------------------- /allinone/AllInOne/modules/meltr.py: -------------------------------------------------------------------------------- 1 | from torch.nn.utils import clip_grad_norm_ 2 | import torch 3 | from torch.nn import functional as F 4 | from typing import Optional, Any 5 | from torch import Tensor 6 | from torch import nn 7 | import torch 8 | 9 | class MELTRgrad: 10 | def __init__(self): 11 | pass 12 | 13 | def grad(self, loss_train, loss_val, params, aux_params): 14 | dwdA = torch.autograd.grad( 15 | loss_val, 16 | params, 17 | retain_graph=True, 18 | allow_unused=True 19 | ) 20 | 21 | dwdT = torch.autograd.grad( 22 | loss_train, 23 | params, 24 | create_graph=True, 25 | allow_unused=True 26 | ) 27 | 28 | temp_t, temp_a = [], [] 29 | for t, a in zip(dwdT, dwdA): 30 | if a is None: 31 | continue 32 | temp_t.append(t) 33 | temp_a.append(a) 34 | 35 | 36 | v4 = torch.autograd.grad( 37 | tuple(temp_t), 38 | aux_params, 39 | grad_outputs=tuple(temp_a), 40 | allow_unused=True 41 | ) 42 | 43 | return v4 44 | 45 | 46 | class MELTROptimizer: 47 | def __init__(self, meta_optimizer, max_grad_norm=10): 48 | self.meta_optimizer = meta_optimizer 49 | self.hypergrad = MELTRgrad() 50 | 51 | self.max_grad_norm = max_grad_norm 52 | 53 | def step(self, train_loss, val_loss, parameters, aux_params): 54 | self.zero_grad() 55 | 56 | hyper_grads = self.hypergrad.grad( 57 | loss_train=train_loss, 58 | loss_val=val_loss, 59 | params=parameters, 60 | aux_params=aux_params, 61 | ) 62 | 63 | for p, g in zip(aux_params, hyper_grads): 64 | if g is not None: 65 | p.grad = -g 66 | 67 | if self.max_grad_norm is not None: 68 | clip_grad_norm_(aux_params, max_norm=self.max_grad_norm) 69 | 70 | self.meta_optimizer.step() 71 | 72 | def zero_grad(self): 73 | self.meta_optimizer.zero_grad() 74 | 75 | 76 | class MELTR(nn.Module): 77 | def __init__(self, t_dim, f_dim, i_dim, h1_dim, h2_dim, o_dim): 78 | super(MELTR, self).__init__() 79 | self.task_embedding = nn.Embedding(t_dim, h2_dim) 80 | self.loss_fc1 
= nn.Linear(i_dim, h1_dim) 81 | self.activation1 = nn.ReLU() 82 | self.loss_fc2 = nn.Linear(h1_dim, h2_dim) 83 | 84 | self.encoder = nn.TransformerEncoderLayer(d_model=h2_dim, nhead=8, batch_first=True, dim_feedforward=f_dim) 85 | self.fc1 = nn.Linear(h2_dim, o_dim, bias=False) 86 | 87 | 88 | 89 | def forward(self, x): 90 | scale_embedding = self.loss_fc2(self.activation1(self.loss_fc1(x))) 91 | input = scale_embedding + self.task_embedding.weight 92 | output = self.encoder(input) 93 | output = self.fc1(output.mean(1)) 94 | return output -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/multitask_datamodule.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | from pytorch_lightning import LightningDataModule 4 | from torch.utils.data import DataLoader 5 | from torch.utils.data.dataset import ConcatDataset 6 | from torch.utils.data.distributed import DistributedSampler 7 | 8 | from . import _datamodules 9 | 10 | 11 | class MTDataModule(LightningDataModule): 12 | def __init__(self, _config, dist=False): 13 | datamodule_keys = _config["datasets"] 14 | assert len(datamodule_keys) > 0 15 | 16 | super().__init__() 17 | 18 | self.dm_keys = datamodule_keys 19 | self.dm_dicts = {key: _datamodules[key](_config) for key in datamodule_keys} 20 | self.dms = [v for k, v in self.dm_dicts.items()] 21 | 22 | self.batch_size = self.dms[0].batch_size 23 | self.vocab_size = self.dms[0].vocab_size 24 | self.num_workers = self.dms[0].num_workers 25 | 26 | self.dist = dist 27 | 28 | def prepare_data(self): 29 | for dm in self.dms: 30 | dm.prepare_data() 31 | 32 | def setup(self, stage): 33 | for dm in self.dms: 34 | dm.setup(stage) 35 | 36 | self.train_dataset = ConcatDataset([dm.train_dataset for dm in self.dms]) 37 | self.val_dataset = ConcatDataset([dm.val_dataset for dm in self.dms]) 38 | self.test_dataset = ConcatDataset([dm.test_dataset for dm in self.dms]) 39 | self.tokenizer = self.dms[0].tokenizer 40 | 41 | self.collate = functools.partial( 42 | self.dms[0].train_dataset.collate, mlm_collator=self.dms[0].mlm_collator, 43 | ) 44 | 45 | if self.dist: 46 | self.train_sampler = DistributedSampler(self.train_dataset, shuffle=True) 47 | self.val_sampler = DistributedSampler(self.val_dataset, shuffle=True) 48 | self.test_sampler = DistributedSampler(self.test_dataset, shuffle=False) 49 | else: 50 | self.train_sampler = None 51 | self.val_sampler = None 52 | self.test_sampler = None 53 | 54 | def train_dataloader(self): 55 | loader = DataLoader( 56 | self.train_dataset, 57 | batch_size=self.batch_size, 58 | sampler=self.train_sampler, 59 | num_workers=self.num_workers, 60 | collate_fn=self.collate, 61 | ) 62 | return loader 63 | 64 | def val_dataloader(self, batch_size=None): 65 | loader = DataLoader( 66 | self.val_dataset, 67 | batch_size=batch_size if batch_size is not None else self.batch_size, 68 | sampler=self.val_sampler, 69 | num_workers=self.num_workers, 70 | collate_fn=self.collate, 71 | ) 72 | return loader 73 | 74 | def test_dataloader(self): 75 | loader = DataLoader( 76 | self.test_dataset, 77 | batch_size=self.batch_size, 78 | sampler=self.test_sampler, 79 | num_workers=self.num_workers, 80 | collate_fn=self.collate, 81 | ) 82 | return loader 83 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/msrvtt_choice.py: -------------------------------------------------------------------------------- 1 | from 
.video_base_dataset import BaseDataset 2 | import os 3 | import pandas as pd 4 | 5 | 6 | class MSRVTTChoiceDataset(BaseDataset): 7 | def __init__(self, *args, split="", **kwargs): 8 | assert split in ["train", "val", "test"] 9 | self.split = split 10 | if self.split == "train": 11 | pass # no dedicated train split is provided; the multiple-choice test annotations are reused for zero-shot evaluation 12 | self.metadata = None 13 | self.ans_lab_dict = None 14 | if split == "train": 15 | names = ["msrvtt_choice_train"] 16 | elif split == "val": 17 | names = ["msrvtt_choice_val"] 18 | elif split == "test": 19 | names = ["msrvtt_choice_test"] # vqav2_test-dev for test-dev 20 | 21 | super().__init__( 22 | *args, 23 | **kwargs, 24 | names=names, 25 | text_column_name="unknown", 26 | remove_duplicate=False, 27 | ) 28 | self._load_metadata() 29 | 30 | def _load_metadata(self): 31 | metadata_dir = './meta_data/msrvtt' 32 | split_files = { 33 | 'train': 'msrvtt_mc_test.jsonl', # no train and test available, only for zero-shot 34 | 'val': 'msrvtt_mc_test.jsonl', 35 | 'test': 'msrvtt_mc_test.jsonl' 36 | } 37 | target_split_fp = split_files[self.split] 38 | metadata = pd.read_json(os.path.join(metadata_dir, target_split_fp), lines=True) 39 | self.metadata = metadata 40 | 41 | def _get_video_path(self, sample): 42 | return os.path.join(self.data_dir, 'videos', 'all', sample['clip_name'] + '.mp4'), sample['clip_name'] + '.mp4' 43 | 44 | def get_text(self, sample): 45 | texts = [] 46 | for text in sample['options']: 47 | encoding = self.tokenizer( 48 | text, 49 | padding="max_length", 50 | truncation=True, 51 | max_length=self.max_text_len, 52 | return_special_tokens_mask=True, 53 | ) 54 | texts.append((text, encoding)) 55 | return texts 56 | 57 | def get_answer_label(self, sample): 58 | answer = sample['answer'] 59 | return answer 60 | 61 | def __getitem__(self, index): 62 | sample = self.metadata.iloc[index] 63 | image_tensor = self.get_video(sample) 64 | # index, question_index = self.index_mapper[index] 65 | qid = index 66 | answer = self.get_answer_label(sample) 67 | ret = { 68 | "image": image_tensor, 69 | "img_index": index, 70 | "cap_index": index, 71 | "raw_index": index, 72 | 'answer': answer 73 | } 74 | texts = self.get_text(sample) 75 | ret["text"] = texts[0] 76 | # print(len(texts)) 77 | for i in range(self.draw_false_text - 1): 78 | ret.update({f"false_text_{i}": texts[i+1]}) 79 | # for i in range(self.draw_false_text-1): 80 | # ret[f"false_text_{i}"] = texts[i+1] 81 | # print(ret.keys()) 82 | return ret 83 | 84 | def __len__(self): 85 | return len(self.metadata) -------------------------------------------------------------------------------- /allinone/AllInOne/gadgets/my_metrics.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import torch 3 | from pytorch_lightning.metrics import Metric # from torchmetrics import Metric 4 | 5 | 6 | def order_class_index(order): 7 | """Return the index of the order in its full permutation. 8 | 9 | Args: 10 | order (tensor): e.g.
[0,1,2] 11 | """ 12 | classes = list(itertools.permutations(list(range(len(order))))) 13 | return classes.index(tuple(order.tolist())) 14 | 15 | 16 | class Accuracy(Metric): 17 | def __init__(self, dist_sync_on_step=False): 18 | super().__init__(dist_sync_on_step=dist_sync_on_step) 19 | self.add_state("correct", default=torch.tensor(0.0), dist_reduce_fx="sum") 20 | self.add_state("total", default=torch.tensor(0.0), dist_reduce_fx="sum") 21 | 22 | def update(self, logits, target, unfilterd=False): 23 | logits, target = ( 24 | logits.detach().to(self.correct.device), 25 | target.detach().to(self.correct.device), 26 | ) 27 | preds = logits.argmax(dim=-1) 28 | preds = preds[target != -100] 29 | unfilter_num = target.numel() 30 | target = target[target != -100] 31 | if target.numel() == 0: 32 | return 1 33 | 34 | assert preds.shape == target.shape 35 | 36 | self.correct += torch.sum(preds == target) 37 | if unfilterd: 38 | # print("no filter") 39 | self.total += unfilter_num 40 | else: 41 | self.total += target.numel() 42 | 43 | def compute(self): 44 | return self.correct / self.total 45 | 46 | 47 | class Scalar(Metric): 48 | def __init__(self, dist_sync_on_step=False): 49 | super().__init__(dist_sync_on_step=dist_sync_on_step) 50 | self.add_state("scalar", default=torch.tensor(0.0), dist_reduce_fx="sum") 51 | self.add_state("total", default=torch.tensor(0.0), dist_reduce_fx="sum") 52 | 53 | def update(self, scalar): 54 | if isinstance(scalar, torch.Tensor): 55 | scalar = scalar.detach().to(self.scalar.device) 56 | else: 57 | scalar = torch.tensor(scalar).float().to(self.scalar.device) 58 | self.scalar += scalar 59 | self.total += 1 60 | 61 | def compute(self): 62 | return self.scalar / self.total 63 | 64 | 65 | class VQAScore(Metric): 66 | def __init__(self, dist_sync_on_step=False): 67 | super().__init__(dist_sync_on_step=dist_sync_on_step) 68 | self.add_state("score", default=torch.tensor(0.0), dist_reduce_fx="sum") 69 | self.add_state("total", default=torch.tensor(0.0), dist_reduce_fx="sum") 70 | 71 | def update(self, logits, target): 72 | logits, target = ( 73 | logits.detach().float().to(self.score.device), 74 | target.detach().float().to(self.score.device), 75 | ) 76 | logits = torch.max(logits, 1)[1] 77 | one_hots = torch.zeros(*target.size()).to(target) 78 | one_hots.scatter_(1, logits.view(-1, 1), 1) 79 | scores = one_hots * target 80 | 81 | self.score += scores.sum() 82 | self.total += len(logits) 83 | 84 | def compute(self): 85 | return self.score / self.total 86 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/hmdb51_zero_shot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .video_base_dataset import BaseDataset 3 | import os 4 | 5 | 6 | class HMDB51Dataset(BaseDataset): 7 | def __init__(self, *args, split="", **kwargs): 8 | assert split in ["train", "val", "test"] 9 | self.split = split 10 | self.metadata = None 11 | self.ans_lab_dict = dict() 12 | if split == "train": 13 | names = ["hmdb51_train"] 14 | elif split == "val": 15 | names = ["hmdb51_val"] 16 | elif split == "test": 17 | names = ["hmdb51_test"] 18 | super().__init__( 19 | *args, 20 | **kwargs, 21 | names=names, 22 | text_column_name="questions", 23 | remove_duplicate=False, 24 | ) 25 | self._load_metadata() 26 | 27 | def _load_metadata(self): 28 | metadata_dir = './meta_data/hmdb51' 29 | split_files = { 30 | 'train': 'hmdb51_rgb_train_split_1.txt', 31 | 'val': 'hmdb51_rgb_val_split_1.txt', 
32 | 'test': 'hmdb51_rgb_val_split_1.txt' 33 | } 34 | target_split_fp = split_files[self.split] 35 | self.metadata = [x.strip().split(' ') for x in open(os.path.join(metadata_dir, target_split_fp))] 36 | answer_fp = os.path.join(metadata_dir, 'hmdb51_classInd.txt') 37 | with open(answer_fp, 'r') as f: 38 | lines = f.readlines() 39 | for line in lines: 40 | self.ans_lab_dict[str(int(line.strip().split(' ')[0]) - 1)] = line.strip().split(' ')[1] 41 | 42 | def _get_video_path(self, sample): 43 | # self.ans_lab_dict[sample[2]], 44 | return os.path.join(self.data_dir, sample[0].split('/')[-1]) + '.avi', sample[0].split('/')[-1] + '.avi' 45 | 46 | def get_text(self, sample): 47 | text = "A" 48 | encoding = self.tokenizer( 49 | text, 50 | padding="max_length", 51 | truncation=True, 52 | max_length=self.max_text_len, 53 | return_special_tokens_mask=True, 54 | ) 55 | return (text, encoding) 56 | 57 | def get_answer_label(self, sample): 58 | text = "None" 59 | ans_total_len = len(self.ans_lab_dict) + 1 # one additional class 60 | ans_label = int(sample[2]) 61 | scores = np.zeros(ans_total_len).astype(int) 62 | scores[ans_label] = 1 63 | return text, ans_label, scores 64 | # return text, ans_label_vector, scores 65 | 66 | def __getitem__(self, index): 67 | sample = self.metadata[index] # .split(' ') 68 | image_tensor = self.get_video(sample) 69 | text = self.get_text(sample) 70 | qid = index 71 | if self.split != "test": 72 | answers, labels, scores = self.get_answer_label(sample) 73 | else: 74 | answers = list() 75 | labels = list() 76 | scores = list() 77 | 78 | return { 79 | "image": image_tensor, 80 | "text": text, 81 | "vqa_answer": answers, 82 | "vqa_labels": labels, 83 | "vqa_scores": scores, 84 | "qid": qid, 85 | } 86 | 87 | def __len__(self): 88 | return len(self.metadata) -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/hmdb51.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .video_base_dataset import BaseDataset 3 | import os 4 | 5 | 6 | class HMDB51Dataset(BaseDataset): 7 | def __init__(self, *args, split="", **kwargs): 8 | assert split in ["train", "val", "test"] 9 | self.split = split 10 | self.metadata = None 11 | self.ans_lab_dict = dict() 12 | if split == "train": 13 | names = ["hmdb51_train"] 14 | elif split == "val": 15 | names = ["hmdb51_val"] 16 | elif split == "test": 17 | names = ["hmdb51_test"] 18 | super().__init__( 19 | *args, 20 | **kwargs, 21 | names=names, 22 | text_column_name="questions", 23 | remove_duplicate=False, 24 | ) 25 | self._load_metadata() 26 | 27 | def _load_metadata(self): 28 | metadata_dir = './meta_data/hmdb51' 29 | split_files = { 30 | 'train': 'hmdb51_rgb_train_split_1.txt', 31 | 'val': 'hmdb51_rgb_val_split_1.txt', 32 | 'test': 'hmdb51_rgb_val_split_1.txt' 33 | } 34 | target_split_fp = split_files[self.split] 35 | self.metadata = [x.strip().split(' ') for x in open(os.path.join(metadata_dir, target_split_fp))] 36 | answer_fp = os.path.join(metadata_dir, 'hmdb51_classInd.txt') 37 | with open(answer_fp, 'r') as f: 38 | lines = f.readlines() 39 | for line in lines: 40 | self.ans_lab_dict[str(int(line.strip().split(' ')[0]) - 1)] = line.strip().split(' ')[1] 41 | 42 | def _get_video_path(self, sample): 43 | # self.ans_lab_dict[sample[2]], 44 | return os.path.join(self.data_dir, sample[0].split('/')[-1]) + '.avi', sample[0].split('/')[-1] + '.avi' 45 | 46 | def get_text(self, sample): 47 | text = "A person is doing [MASK]" 48 | 
encoding = self.tokenizer( 49 | text, 50 | padding="max_length", 51 | truncation=True, 52 | max_length=self.max_text_len, 53 | return_special_tokens_mask=True, 54 | ) 55 | return (text, encoding) 56 | 57 | def get_answer_label(self, sample): 58 | text = "None" 59 | ans_total_len = len(self.ans_lab_dict) + 1 # one additional class 60 | ans_label = int(sample[2]) 61 | scores = np.zeros(ans_total_len).astype(int) 62 | scores[ans_label] = 1 63 | return text, ans_label, scores 64 | # return text, ans_label_vector, scores 65 | 66 | def __getitem__(self, index): 67 | sample = self.metadata[index] # .split(' ') 68 | image_tensor = self.get_video(sample) 69 | text = self.get_text(sample) 70 | qid = index 71 | if self.split != "test": 72 | answers, labels, scores = self.get_answer_label(sample) 73 | else: 74 | answers = list() 75 | labels = list() 76 | scores = list() 77 | 78 | return { 79 | "image": image_tensor, 80 | "text": text, 81 | "vqa_answer": answers, 82 | "vqa_labels": labels, 83 | "vqa_scores": scores, 84 | "qid": qid, 85 | } 86 | 87 | def __len__(self): 88 | return len(self.metadata) -------------------------------------------------------------------------------- /allinone/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import pytorch_lightning as pl 4 | import torch 5 | from AllInOne.config import ex 6 | from AllInOne.modules import AllinoneTransformerSS 7 | from AllInOne.datamodules.multitask_datamodule import MTDataModule 8 | import datetime 9 | import time 10 | 11 | @ex.automain 12 | def main(_config): 13 | _config = copy.deepcopy(_config) 14 | pl.seed_everything(_config["seed"]) 15 | 16 | dm = MTDataModule(_config, dist=True) 17 | model = AllinoneTransformerSS(_config) 18 | 19 | exp_name = f'{_config["exp_name"]}' 20 | 21 | os.makedirs(_config["log_dir"], exist_ok=True) 22 | checkpoint_callback = pl.callbacks.ModelCheckpoint( 23 | save_top_k=1, 24 | # every_n_epochs=_config["save_checkpoints_interval"], 25 | verbose=True, 26 | monitor="val/the_metric", 27 | mode="max", 28 | save_last=True, 29 | ) 30 | now = datetime.datetime.now() 31 | instance_name = f'{exp_name}_seed{_config["seed"]}_from_{_config["load_path"].split("/")[-1][:-5]}{now.year}_{now.month}_{now.day}' 32 | logger = pl.loggers.TensorBoardLogger( 33 | _config["log_dir"], 34 | name=instance_name, 35 | ) 36 | 37 | lr_callback = pl.callbacks.LearningRateMonitor(logging_interval="step") 38 | callbacks = [checkpoint_callback, lr_callback] 39 | 40 | num_gpus = ( 41 | _config["num_gpus"] 42 | if isinstance(_config["num_gpus"], int) 43 | else len(_config["num_gpus"]) 44 | ) 45 | # print all config at the begin 46 | print('='*70+'Config: '+'='*70) 47 | print(instance_name) 48 | print(_config) 49 | print('='*150) 50 | 51 | # notice _config["batch_size"] should be max length for all machines, eg. 
at least 1024 52 | grad_steps = _config["batch_size"] // ( 53 | _config["per_gpu_batchsize"] * num_gpus * _config["num_nodes"] 54 | ) 55 | 56 | max_steps = _config["max_steps"] if _config["max_steps"] is not None else None 57 | 58 | 59 | trainer = pl.Trainer( 60 | gpus=_config["num_gpus"], 61 | num_nodes=_config["num_nodes"], 62 | # precision=_config["precision"], 63 | accelerator="ddp", 64 | benchmark=True, 65 | deterministic=True, 66 | max_epochs=_config["max_epoch"] if max_steps is None else 1000, 67 | max_steps=max_steps, 68 | callbacks=callbacks, 69 | logger=logger, 70 | # prepare_data_per_node=False, 71 | replace_sampler_ddp=False, 72 | accumulate_grad_batches=grad_steps, 73 | log_every_n_steps=10, 74 | flush_logs_every_n_steps=10, 75 | resume_from_checkpoint=_config["resume_from"], 76 | weights_summary="top", 77 | fast_dev_run=_config["fast_dev_run"], 78 | val_check_interval=_config["val_check_interval"], 79 | automatic_optimization=False 80 | # num_sanity_val_steps=0, # 처음 sanity check 81 | 82 | # gradient_clip_val = 0.1 83 | 84 | 85 | # plugins=[DDPPlugin(find_unused_parameters=True)] 86 | # show_progress_bar=False, 87 | # progress_bar_refresh_rate=0 88 | ) 89 | 90 | print("accumulate grad batches is: ", trainer.accumulate_grad_batches) 91 | 92 | if not _config["test_only"]: 93 | trainer.fit(model, datamodule=dm) 94 | else: 95 | trainer.test(model, datamodule=dm) 96 | -------------------------------------------------------------------------------- /violet/model.py: -------------------------------------------------------------------------------- 1 | 2 | from lib import * 3 | from video_swin import SwinTransformer3D 4 | 5 | class EncImg(T.nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | self.swin = SwinTransformer3D() 10 | self.swin.load_state_dict(T.load('./_snapshot/ckpt_video-swin.pt', map_location='cpu')) 11 | 12 | self.emb_cls = T.nn.Parameter(0.02*T.randn(1, 1, 1, 768)) 13 | self.emb_pos = T.nn.Parameter(0.02*T.randn(1, 1, 1+14**2, 768)) 14 | self.emb_len = T.nn.Parameter(0.02*T.randn(1, 6, 1, 768)) 15 | self.norm = T.nn.LayerNorm(768) 16 | 17 | def forward(self, img): 18 | _B, _T, _C, _H, _W = img.shape 19 | _h, _w = _H//32, _W//32 20 | 21 | img = TV.transforms.Normalize([0.485, 0.456, 0.406], 22 | [0.229, 0.224, 0.225])(img) 23 | 24 | f_img = self.swin(img.transpose(1, 2)).transpose(1, 2) 25 | 26 | f_img = f_img.permute(0, 1, 3, 4, 2).view([_B, _T, _h*_w, 768]) 27 | f_img = T.cat([self.emb_cls.expand([_B, _T, -1, -1]), f_img], dim=2) 28 | f_img += self.emb_pos.expand([_B, _T, -1, -1])[:, :, :1+_h*_w, :]+self.emb_len.expand([_B, -1, 1+_h*_w, -1])[:, :_T, :, :] 29 | f_img = self.norm(f_img).view([_B, _T*(1+_h*_w), -1]) 30 | 31 | m_img = T.ones(1+_h*_w).long().cuda().unsqueeze(0).unsqueeze(0) 32 | m_img = m_img.expand([_B, _T, -1]).contiguous().view([_B, _T*(1+_h*_w)]) 33 | 34 | return f_img, m_img 35 | 36 | class EncTxt(T.nn.Module): 37 | def __init__(self): 38 | super().__init__() 39 | 40 | bert = transformers.BertModel.from_pretrained('bert-base-uncased') 41 | self.emb_txt = bert.embeddings 42 | 43 | def forward(self, txt): 44 | f_txt = self.emb_txt(txt) 45 | 46 | return f_txt 47 | 48 | class VIOLET_Base(T.nn.Module): 49 | def __init__(self): 50 | super().__init__() 51 | 52 | self.enc_img, self.enc_txt = EncImg(), EncTxt() 53 | bert = transformers.BertForMaskedLM.from_pretrained('bert-base-uncased') 54 | self.mask_ext, self.trsfr = bert.get_extended_attention_mask, bert.bert.encoder 55 | 56 | def go_feat(self, img, txt, mask): 57 | feat_img, mask_img = 
self.enc_img(img) 58 | feat_txt, mask_txt = self.enc_txt(txt), mask 59 | return feat_img, mask_img, feat_txt, mask_txt 60 | 61 | def go_cross(self, feat_img, mask_img, feat_txt, mask_txt): 62 | feat, mask = T.cat([feat_img, feat_txt], dim=1), T.cat([mask_img, mask_txt], dim=1) 63 | mask = self.mask_ext(mask, mask.shape, mask.device) 64 | out = self.trsfr(feat, mask, output_attentions=True) 65 | return out['last_hidden_state'], out['attentions'] 66 | 67 | 68 | def load_ckpt(self, ckpt): 69 | if ckpt=='': 70 | print('===== Init VIOLET =====') 71 | return 72 | 73 | ckpt_new, ckpt_old = T.load(ckpt, map_location='cpu'), self.state_dict() 74 | key_old = set(ckpt_old.keys()) 75 | for k in ckpt_new: 76 | if k in ckpt_old and ckpt_new[k].shape==ckpt_old[k].shape: 77 | ckpt_old[k] = ckpt_new[k] 78 | key_old.remove(k) 79 | self.load_state_dict(ckpt_old) 80 | print('===== Not Load:', key_old, '=====') 81 | 82 | -------------------------------------------------------------------------------- /allinone/AllInOne/transforms/functional.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import torch 3 | import cv2 4 | import numpy as np 5 | import PIL 6 | 7 | 8 | def _is_tensor_clip(clip): 9 | return torch.is_tensor(clip) and clip.ndimension() == 4 10 | 11 | 12 | def crop_clip(clip, min_h, min_w, h, w): 13 | if isinstance(clip[0], np.ndarray): 14 | cropped = [img[min_h:min_h + h, min_w:min_w + w, :] for img in clip] 15 | 16 | elif isinstance(clip[0], PIL.Image.Image): 17 | cropped = [ 18 | img.crop((min_w, min_h, min_w + w, min_h + h)) for img in clip 19 | ] 20 | else: 21 | raise TypeError('Expected numpy.ndarray or PIL.Image ' + 22 | 'but got list of {0}'.format(type(clip[0]))) 23 | return cropped 24 | 25 | 26 | def resize_clip(clip, size, interpolation='bilinear'): 27 | if isinstance(clip[0], np.ndarray): 28 | if isinstance(size, numbers.Number): 29 | im_h, im_w, im_c = clip[0].shape 30 | # Min spatial dim already matches minimal size 31 | if (im_w <= im_h and im_w == size) or (im_h <= im_w 32 | and im_h == size): 33 | return clip 34 | new_h, new_w = get_resize_sizes(im_h, im_w, size) 35 | size = (new_w, new_h) 36 | else: 37 | size = size[1], size[0] 38 | if interpolation == 'bilinear': 39 | np_inter = cv2.INTER_LINEAR 40 | else: 41 | np_inter = cv2.INTER_NEAREST 42 | scaled = [ 43 | cv2.resize(img, size, interpolation=np_inter) for img in clip 44 | ] 45 | elif isinstance(clip[0], PIL.Image.Image): 46 | if isinstance(size, numbers.Number): 47 | im_w, im_h = clip[0].size 48 | # Min spatial dim already matches minimal size 49 | if (im_w <= im_h and im_w == size) or (im_h <= im_w 50 | and im_h == size): 51 | return clip 52 | new_h, new_w = get_resize_sizes(im_h, im_w, size) 53 | size = (new_w, new_h) 54 | else: 55 | size = size[1], size[0] 56 | if interpolation == 'bilinear': 57 | pil_inter = PIL.Image.BILINEAR 58 | else: 59 | pil_inter = PIL.Image.NEAREST 60 | scaled = [img.resize(size, pil_inter) for img in clip] 61 | else: 62 | raise TypeError('Expected numpy.ndarray or PIL.Image ' + 63 | 'but got list of {0}'.format(type(clip[0]))) 64 | return scaled 65 | 66 | 67 | def get_resize_sizes(im_h, im_w, size): 68 | if im_w < im_h: 69 | ow = size 70 | oh = int(size * im_h / im_w) 71 | else: 72 | oh = size 73 | ow = int(size * im_w / im_h) 74 | return oh, ow 75 | 76 | 77 | def normalize(clip, mean, std, inplace=False): 78 | if not _is_tensor_clip(clip): 79 | raise TypeError('tensor is not a torch clip.') 80 | 81 | if not inplace: 82 | clip =
clip.clone() 83 | 84 | dtype = clip.dtype 85 | dim = len(mean) 86 | mean = torch.as_tensor(mean, dtype=dtype, device=clip.device) 87 | std = torch.as_tensor(std, dtype=dtype, device=clip.device) 88 | # print(clip_test.size()) 89 | # if dim == 3: 90 | clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None]) 91 | # else: 92 | # clip_test.sub_(mean[:, None, None]).div_(std[:, None, None]) 93 | return clip -------------------------------------------------------------------------------- /univl/dataloaders/README.md: -------------------------------------------------------------------------------- 1 | Data loaders for pretrain and downstream tasks (retrieval and caption). 2 | 3 | ## Preprocess on HowTo100M 4 | 5 | For pretraining, you need to prepare three parts: 6 | 7 | ### 1. s3d features pretrained on HowTo100M 8 | 9 | Download raw videos from the [HowTo100M webpage](https://www.di.ens.fr/willow/research/howto100m/) and extract [s3d (howto100m)](https://github.com/antoine77340/S3D_HowTo100M) features. You can refer to [VideoFeatureExtractor](https://github.com/ArrowLuo/VideoFeatureExtractor). 10 | 11 | ### 2. HowTo100M.csv 12 | Note: this file is different from the HowTo100M_v1.csv described in [README.txt](https://www.rocq.inria.fr/cluster-willow/amiech/howto100m/README.txt). 13 | 14 | The csv contains two columns. The first column is the video id, and the second is the feature file (the sub-path of the npy file, which is appended to `--features_path` (refer to the pretrain part of [README](../README.md)) to locate the npy file when reading). 15 | 16 | ``` 17 | video_id,feature_file 18 | Z8xhli297v8,Z8xhli297v8.npy 19 | ... 20 | ``` 21 | video_id: used to match the caption or transcript 22 | feature_file: used to find the feature file after joining with `--features_path` 23 | 24 | ### 3. caption.pickle 25 | This pickle file is generated from raw_caption.json in raw_caption.zip, introduced in [README.txt](https://www.rocq.inria.fr/cluster-willow/amiech/howto100m/README.txt). 26 | 27 | The format of this file is: 28 | ``` 29 | { 30 | 'video_id 1':{ 31 | 'start': array([0.08, 7.37, 15.05, ...], dtype=object), 32 | 'end': array([9.96, 16.98, 27.9, ...], dtype=object), 33 | 'text': array(['sentence 1 placeholder', 34 | 'sentence 2 placeholder', 35 | 'sentence 3 placeholder', ...], dtype=object) 36 | }, 37 | ... 38 | } 39 | ``` 40 | Keep `start` as a sorted array. 41 | 42 | 43 | ## Preprocess on YouCookII 44 | The s3d feature extraction is the same as for HowTo100M introduced above. 45 | 46 | ## Generate youcookii_data.pickle 47 | This file is generated from `youcookii_annotations_trainval.json`, which can be downloaded from the [official webpage](http://youcook2.eecs.umich.edu/download). 48 | 49 | The format of this file is similar to `caption.pickle` introduced above, but with one more key, `transcript`. The `transcript` needs to be generated from speech by an external ASR tool: 50 | ``` 51 | { 52 | 'video_id 1':{ 53 | 'start': array([0.08, 7.37, 15.05, ...], dtype=object), 54 | 'end': array([9.96, 16.98, 27.9, ...], dtype=object), 55 | 'text': array(['sentence 1 placeholder', 56 | 'sentence 2 placeholder', 57 | 'sentence 3 placeholder', ...], dtype=object), 58 | 'transcript': array(['transcript 1 placeholder', 59 | 'transcript 2 placeholder', 60 | 'transcript 3 placeholder', ...], dtype=object) 61 | }, 62 | ...
63 | } 64 | ``` 65 | If you want to test on retrieval or caption w/o transcript tasks, you can set `transcript` with `array(['NONE', 'NONE', 'NONE', ...], dtype=object)`. 66 | 67 | ## Format of csv 68 | ``` 69 | video_id,feature_file 70 | Z8xhli297v8,Z8xhli297v8 71 | ... 72 | ``` 73 | Note: The video_id and feature_file are the same for the consistency and our historical compatibility. We use feature_file to get the feature from feature pickle. 74 | 75 | ## Preprocess on MSRVTT 76 | The s3d feature extraction is the same as HowTo100M introduced above. 77 | The data can be downloaded in: https://github.com/microsoft/UniVL/releases/download/v0/msrvtt.zip 78 | -------------------------------------------------------------------------------- /allinone/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.13.0 2 | addict==2.4.0 3 | aiohttp==3.8.1 4 | aiosignal==1.2.0 5 | apex==0.1 6 | appdirs==1.4.4 7 | async-timeout==4.0.2 8 | attrdict==2.0.1 9 | attrs==21.2.0 10 | av==8.0.3 11 | backcall==0.2.0 12 | black==19.3b0 13 | bravado==11.0.3 14 | bravado-core==5.17.0 15 | cachetools==4.2.2 16 | certifi==2021.5.30 17 | chardet==3.0.4 18 | charset-normalizer==2.0.4 19 | click==8.0.1 20 | colorama==0.4.4 21 | cycler==0.10.0 22 | Cython==0.29.28 23 | decorator==5.0.9 24 | decord==0.6.0 25 | demoji==1.1.0 26 | dlib==19.22.1 27 | docopt==0.6.2 28 | dominate==2.6.0 29 | editdistance==0.6.0 30 | einops==0.3.0 31 | ffmpeg-python==0.2.0 32 | filelock==3.0.12 33 | flake8==3.9.2 34 | frozenlist==1.3.0 35 | fsspec==2022.2.0 36 | ftfy==6.1.1 37 | future==0.18.2 38 | fvcore==0.1.5.post20220305 39 | gensim==3.4.0 40 | gitdb==4.0.7 41 | GitPython==3.1.20 42 | google-auth==2.0.2 43 | google-auth-oauthlib==0.4.6 44 | googletrans==3.0.0 45 | grpcio==1.39.0 46 | h11==0.9.0 47 | h2==3.2.0 48 | hpack==3.0.0 49 | hstspreload==2021.9.1 50 | httpcore==0.9.1 51 | httpx==0.13.3 52 | huggingface-hub==0.0.16 53 | humanize==3.11.0 54 | hyperframe==5.2.0 55 | idna==2.10 56 | imageio==2.9.0 57 | iopath==0.1.9 58 | ipdb==0.13.4 59 | ipython==7.27.0 60 | isort==5.9.3 61 | jedi==0.18.0 62 | joblib==1.0.1 63 | jsonpickle==1.5.2 64 | jsonpointer==2.1 65 | jsonref==0.2 66 | jsonschema==3.2.0 67 | kiwisolver==1.3.2 68 | llvmlite==0.38.0 69 | lmdb==1.2.1 70 | Markdown==3.3.4 71 | matplotlib==3.4.3 72 | matplotlib-inline==0.1.2 73 | mccabe==0.6.1 74 | monotonic==1.6 75 | msgpack==1.0.2 76 | multidict==6.0.2 77 | munch==2.5.0 78 | neptune-client==0.10.8 79 | neptune-contrib==0.27.3 80 | networkx==2.6.2 81 | nltk==3.6.2 82 | numba==0.55.1 83 | numpy==1.19.5 84 | oauthlib==3.1.1 85 | opencv-python==4.4.0.46 86 | packaging==21.0 87 | pandas==1.1.5 88 | parameterized==0.8.1 89 | parso==0.8.2 90 | pexpect==4.8.0 91 | pickleshare==0.7.5 92 | Pillow==8.2.0 93 | pipreqs==0.4.11 94 | portalocker==2.4.0 95 | prompt-toolkit==3.0.20 96 | protobuf==3.17.3 97 | psutil==5.8.0 98 | ptyprocess==0.7.0 99 | py-cpuinfo==8.0.0 100 | pyarrow==2.0.0 101 | pyasn1==0.4.8 102 | pyasn1-modules==0.2.8 103 | pycodestyle==2.7.0 104 | pyflakes==2.3.1 105 | Pygments==2.10.0 106 | PyJWT==2.1.0 107 | pyparsing==2.4.7 108 | pyrsistent==0.18.0 109 | python-dateutil==2.8.2 110 | pytorch-lightning==1.1.4 111 | pytorchvideo==0.1.5 112 | pytz==2021.1 113 | PyWavelets==1.1.1 114 | PyYAML==5.4.1 115 | regex==2021.8.28 116 | requests==2.26.0 117 | requests-oauthlib==1.3.0 118 | rfc3986==1.5.0 119 | rfc3987==1.3.8 120 | rsa==4.7.2 121 | sacred==0.8.2 122 | sacremoses==0.0.45 123 | scikit-image==0.18.3 124 | scikit-learn==0.24.2 
125 | scipy==1.7.1 126 | simplejson==3.17.5 127 | six==1.16.0 128 | sklearn==0.0 129 | smart-open==5.2.1 130 | smmap==4.0.0 131 | sniffio==1.2.0 132 | strict-rfc3339==0.7 133 | swagger-spec-validator==2.7.3 134 | tabulate==0.8.9 135 | tb-nightly==2.7.0a20210905 136 | tensorboard==2.8.0 137 | tensorboard-data-server==0.6.1 138 | tensorboard-plugin-wit==1.8.0 139 | termcolor==1.1.0 140 | textaugment==1.3.4 141 | textblob==0.15.3 142 | threadpoolctl==2.2.0 143 | tifffile==2021.8.30 144 | timm==0.4.5 145 | tokenizers==0.9.4 146 | toml==0.10.2 147 | torch==1.8.0 148 | torchaudio==0.8.0 149 | torchvision==0.9.0 150 | tqdm==4.56.0 151 | traitlets==5.1.0 152 | transformers==4.2.1 153 | tslearn==0.5.2 154 | typing-extensions==3.10.0.2 155 | urllib3==1.26.6 156 | wcwidth==0.2.5 157 | webcolors==1.11.1 158 | websocket-client==1.2.1 159 | Werkzeug==2.0.1 160 | wrapt==1.12.1 161 | yacs==0.1.8 162 | yapf==0.31.0 163 | yarg==0.1.9 164 | yarl==1.7.2 165 | Footer 166 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/ego4d_choice.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset, read_large_frames_decord, get_video_len 2 | import os 3 | import pandas as pd 4 | 5 | 6 | class EGO4DChoiceDataset(BaseDataset): 7 | def __init__(self, *args, split="", **kwargs): 8 | assert split in ["train", "val", "test"] 9 | self.split = split 10 | if self.split == "train": 11 | Exception("no train data provided") 12 | self.metadata = None 13 | self.ans_lab_dict = None 14 | if split == "train": 15 | names = ["ego4d_choice_train"] 16 | elif split == "val": 17 | names = ["ego4d_choice_val"] 18 | elif split == "test": 19 | names = ["ego4d_choice_test"] # vqav2_test-dev for test-dev 20 | 21 | super().__init__( 22 | *args, 23 | **kwargs, 24 | names=names, 25 | text_column_name="unknown", 26 | remove_duplicate=False, 27 | ) 28 | self._load_metadata() 29 | 30 | def _load_metadata(self): 31 | metadata_dir = './meta_data/ego4d' 32 | split_files = { 33 | 'train': 'mc_val.csv', # no train and test available, only for zero-shot testing 34 | 'val': 'mc_val.csv', 35 | 'test': 'mc_val.csv' 36 | } 37 | target_split_fp = split_files[self.split] 38 | self.metadata = pd.read_csv(os.path.join(metadata_dir, target_split_fp), sep=',', header=0, error_bad_lines=False) 39 | 40 | def _get_video_path(self, sample): 41 | rel_video_fp = eval(sample["question"])[0] + '.mp4' 42 | full_video_fp = os.path.join(self.data_dir, 'videos', rel_video_fp) 43 | if not os.path.exists(full_video_fp): 44 | Exception(IOError) 45 | return full_video_fp, rel_video_fp 46 | 47 | def get_raw_video(self, sample): 48 | abs_fp, rel_fp = self._get_video_path(sample) 49 | frame_loc = eval(sample["question"])[1] 50 | frame_end = get_video_len(abs_fp) 51 | imgs = read_large_frames_decord(abs_fp, frame_loc, frame_end, self.num_frames, mode=self.split) 52 | if imgs is None: 53 | raise Exception("Invalid video!", rel_fp) 54 | else: 55 | return imgs 56 | 57 | def get_text(self, sample): 58 | texts = [] 59 | for answer in eval(sample["answers"]): 60 | text = answer[-1] 61 | encoding = self.tokenizer( 62 | text, 63 | padding="max_length", 64 | truncation=True, 65 | max_length=self.max_text_len, 66 | return_special_tokens_mask=True, 67 | ) 68 | texts.append((text, encoding)) 69 | return texts 70 | 71 | def get_answer_label(self, sample): 72 | gt_text = eval(sample["question"])[-1] 73 | answer_label = 0 74 | for index, answer in 
enumerate(eval(sample["answers"])): 75 | if answer[-1] == gt_text: 76 | answer_label = index 77 | return answer_label 78 | 79 | def __getitem__(self, index): 80 | sample = self.metadata.iloc[index] 81 | # print(sample) 82 | image_tensor = self.get_video(sample) 83 | # index, question_index = self.index_mapper[index] 84 | qid = index 85 | answer = self.get_answer_label(sample) 86 | ret = { 87 | "image": image_tensor, 88 | "img_index": index, 89 | "cap_index": index, 90 | "raw_index": index, 91 | 'answer': answer 92 | } 93 | texts = self.get_text(sample) 94 | ret["text"] = texts[0] 95 | # print(len(texts)) 96 | for i in range(self.draw_false_text - 1): 97 | ret.update({f"false_text_{i}": texts[i+1]}) 98 | return ret 99 | 100 | def __len__(self): 101 | return len(self.metadata) -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/msrvttqa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .video_base_dataset import BaseDataset 3 | import os 4 | import json 5 | import pandas as pd 6 | 7 | 8 | class MSRVTTQADataset(BaseDataset): 9 | def __init__(self, *args, split="", **kwargs): 10 | assert split in ["train", "val", "test"] 11 | # if split == "test": 12 | # split = "val" 13 | self.split = split 14 | self.metadata = None 15 | self.ans_lab_dict = None 16 | if split == "train": 17 | names = ["msrvtt_qa_train"] 18 | # names = ["msrvtt_qa_train", "msrvtt_qa_val"] 19 | elif split == "val": 20 | names = ["msrvtt_qa_test"] # ["msrvtt_qa_val"] 21 | elif split == "test": 22 | names = ["msrvtt_qa_test"] # vqav2_test-dev for test-dev 23 | 24 | super().__init__( 25 | *args, 26 | **kwargs, 27 | names=names, 28 | text_column_name="questions", 29 | remove_duplicate=False, 30 | ) 31 | self.names = names 32 | # self.num_frames = 4 33 | self._load_metadata() 34 | 35 | def _load_metadata(self): 36 | metadata_dir = './meta_data/msrvtt' 37 | split_files = { 38 | 'train': 'msrvtt_qa_train.jsonl', 39 | 'val': 'msrvtt_qa_val.jsonl', 40 | 'test': 'msrvtt_qa_test.jsonl' 41 | } 42 | answer_fp = os.path.join(metadata_dir, 'msrvtt_train_ans2label.json') # 1500 in total (all classes in train) 43 | # answer_fp = os.path.join(metadata_dir, 'msrvtt_qa_ans2label.json') # 4539 in total (all classes in train+val+test) 44 | with open(answer_fp, 'r') as JSON: 45 | self.ans_lab_dict = json.load(JSON) 46 | for name in self.names: 47 | split = name.split('_')[-1] 48 | target_split_fp = split_files[split] 49 | # path_or_buf=os.path.join(metadata_dir, target_split_fp) 50 | metadata = pd.read_json(os.path.join(metadata_dir, target_split_fp), lines=True) 51 | if self.metadata is None: 52 | self.metadata = metadata 53 | else: 54 | self.metadata.update(metadata) 55 | print("total {} samples for {}".format(len(self.metadata), self.names)) 56 | # data1.update(data2) 57 | 58 | def get_text(self, sample): 59 | text = sample['question'] 60 | encoding = self.tokenizer( 61 | text, 62 | padding="max_length", 63 | truncation=True, 64 | max_length=self.max_text_len, 65 | return_special_tokens_mask=True, 66 | ) 67 | return (text, encoding) 68 | 69 | def get_answer_label(self, sample): 70 | text = sample['answer'] 71 | ans_total_len = len(self.ans_lab_dict) + 1 # one additional class 72 | try: 73 | ans_label = self.ans_lab_dict[text] # 74 | except KeyError: 75 | ans_label = -100 # ignore classes 76 | # ans_label = 1500 # other classes 77 | scores = np.zeros(ans_total_len).astype(int) 78 | scores[ans_label] = 1 79 | return text, 
ans_label, scores 80 | # return text, ans_label_vector, scores 81 | 82 | def __getitem__(self, index): 83 | sample = self.metadata.iloc[index] 84 | image_tensor = self.get_video(sample) 85 | text = self.get_text(sample) 86 | # index, question_index = self.index_mapper[index] 87 | qid = index 88 | if self.split != "test": 89 | answers, labels, scores = self.get_answer_label(sample) 90 | else: 91 | answers = list() 92 | labels = list() 93 | scores = list() 94 | 95 | return { 96 | "image": image_tensor, 97 | "text": text, 98 | "vqa_answer": answers, 99 | "vqa_labels": labels, 100 | "vqa_scores": scores, 101 | "qid": qid, 102 | } 103 | 104 | def __len__(self): 105 | return len(self.metadata) -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/msvdqa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .video_base_dataset import BaseDataset 3 | import os 4 | import pandas as pd 5 | import json 6 | 7 | 8 | class MSVDQADataset(BaseDataset): 9 | def __init__(self, *args, split="", **kwargs): 10 | assert split in ["train", "val", "test"] 11 | self.split = split 12 | self.metadata = None 13 | self.ans_lab_dict = None 14 | if split == "train": 15 | names = ["msvd_qa_train"] 16 | elif split == "val": 17 | names = ["msvd_qa_test"] 18 | elif split == "test": 19 | names = ["msvd_qa_test"] 20 | 21 | super().__init__( 22 | *args, 23 | **kwargs, 24 | names=names, 25 | text_column_name="questions", 26 | remove_duplicate=False, 27 | ) 28 | self._load_metadata() 29 | 30 | def _load_metadata(self): 31 | metadata_dir = '../../ICLR2023/VideoQA//meta_data/msvd' 32 | split_files = {'train': 'msvd_train.jsonl', 'val': 'msvd_val.jsonl', 'test': 'msvd_test.jsonl'} 33 | # split_files = {'train': 'what_train.jsonl', 'val': 'what_test.jsonl', 'test': 'what_test.jsonl'} 34 | 35 | self.ans_lab_dict = {} 36 | answer_fp = os.path.join(metadata_dir, 'msvd_answer_set.txt') 37 | self.youtube_mapping_dict = dict() 38 | with open(os.path.join(metadata_dir, 'msvd_youtube_mapping.txt')) as f: 39 | lines = f.readlines() 40 | for line in lines: 41 | info = line.strip().split(' ') 42 | self.youtube_mapping_dict[info[1]] = info[0] 43 | with open(answer_fp, 'r') as f: 44 | lines = f.readlines() 45 | count = 0 46 | for line in lines: 47 | self.ans_lab_dict[str(line.strip())] = count 48 | count += 1 49 | 50 | split = self.names[0].split('_')[-1] 51 | target_split_fp = split_files[split] 52 | self.metadata = pd.read_json(os.path.join(metadata_dir, target_split_fp), lines=True) 53 | # for i, k in enumerate(list(self.ans_lab_dict.keys())): 54 | # if i == 250: 55 | # break 56 | # self.metadata = self.metadata[self.metadata['answer'] != k] 57 | 58 | 59 | print("total {} samples for {}".format(len(self.metadata), self.names)) 60 | 61 | def _get_video_path(self, sample): 62 | rel_video_fp = self.youtube_mapping_dict['vid' + str(sample["video_id"])] + '.avi' 63 | full_video_fp = os.path.join(self.data_dir, 'videos', rel_video_fp) 64 | return full_video_fp, rel_video_fp 65 | 66 | def get_text(self, sample): 67 | text = sample['question'] 68 | encoding = self.tokenizer( 69 | text, 70 | padding="max_length", 71 | truncation=True, 72 | max_length=self.max_text_len, 73 | return_special_tokens_mask=True, 74 | ) 75 | return (text, encoding) 76 | 77 | def get_answer_label(self, sample): 78 | text = sample['answer'] 79 | ans_total_len = len(self.ans_lab_dict) + 1 80 | # ans_total_len = len(self.ans_lab_dict) 81 | try: 82 | ans_label = 
self.ans_lab_dict[text] 83 | except KeyError: 84 | ans_label = -100 85 | scores = np.zeros(ans_total_len).astype(int) 86 | scores[ans_label] = 1 87 | return text, ans_label, scores 88 | 89 | def __getitem__(self, index): 90 | sample = self.metadata.iloc[index] 91 | image_tensor = self.get_video(sample) 92 | text = self.get_text(sample) 93 | qid = index 94 | 95 | answers, labels, scores = self.get_answer_label(sample) 96 | 97 | 98 | return { 99 | "image": image_tensor, 100 | "text": text, 101 | "vqa_answer": answers, 102 | "vqa_labels": labels, 103 | "vqa_scores": scores, 104 | "qid": qid, 105 | } 106 | 107 | def __len__(self): 108 | return len(self.metadata) -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/tgif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .video_base_dataset import BaseDataset, read_frames_gif 3 | import os 4 | import json 5 | import pandas as pd 6 | 7 | # 2022.1.28 read gif is too slow, may be need to speedup by convert gif -> video 8 | # https://stackify.dev/833655-python-convert-gif-to-videomp4 9 | 10 | 11 | class TGIFDataset(BaseDataset): 12 | def __init__(self, *args, split="", **kwargs): 13 | assert split in ["train", "val", "test"] 14 | self.split = split 15 | self.metadata = None 16 | self.ans_lab_dict = None 17 | if split == "train": 18 | names = ["tgif_train"] 19 | elif split == "val": 20 | names = ["tgif_val"] 21 | elif split == "test": 22 | names = ["tgif_test"] 23 | 24 | super().__init__( 25 | *args, 26 | **kwargs, 27 | names=names, 28 | text_column_name="questions", 29 | remove_duplicate=False, 30 | ) 31 | # self.num_frames = 4 32 | self._load_metadata() 33 | 34 | def _load_metadata(self): 35 | metadata_dir = './meta_data/tgif' 36 | split_files = { 37 | 'train': 'frameqa_train.jsonl', 38 | 'val': 'frameqa_test.jsonl', # frameqa_val.jsonl 39 | 'test': 'frameqa_test.jsonl' 40 | } 41 | target_split_fp = split_files[self.split] 42 | answer_fp = os.path.join(metadata_dir, 'frameqa_trainval_ans2label.json') 43 | # answer_fp = os.path.join(metadata_dir, 'msrvtt_qa_ans2label.json') 44 | with open(answer_fp, 'r') as JSON: 45 | self.ans_lab_dict = json.load(JSON) 46 | # path_or_buf=os.path.join(metadata_dir, target_split_fp) 47 | metadata = pd.read_json(os.path.join(metadata_dir, target_split_fp), lines=True) 48 | self.metadata = metadata 49 | 50 | def _get_video_path(self, sample): 51 | return os.path.join(self.data_dir, 'gifs', sample['gif_name']) + '.gif', sample['gif_name'] + '.gif' 52 | 53 | def get_raw_video(self, sample): 54 | abs_fp, rel_fp = self._get_video_path(sample) 55 | imgs, idxs, vlen = read_frames_gif(abs_fp, self.num_frames, mode=self.split) 56 | if imgs is None: 57 | raise Exception("Invalid img!", rel_fp) 58 | else: 59 | return imgs 60 | 61 | def get_text(self, sample): 62 | text = sample['question'] 63 | encoding = self.tokenizer( 64 | text, 65 | padding="max_length", 66 | truncation=True, 67 | max_length=self.max_text_len, 68 | return_special_tokens_mask=True, 69 | ) 70 | return (text, encoding) 71 | 72 | def get_answer_label(self, sample): 73 | text = sample['answer'] 74 | ans_total_len = len(self.ans_lab_dict) + 1 # one additional class 75 | try: 76 | ans_label = self.ans_lab_dict[text] # 77 | except KeyError: 78 | ans_label = -100 # ignore classes 79 | # ans_label = 1500 # other classes 80 | scores = np.zeros(ans_total_len).astype(int) 81 | scores[ans_label] = 1 82 | return text, ans_label, scores 83 | # return 
text, ans_label_vector, scores 84 | 85 | def __getitem__(self, index): 86 | sample = self.metadata.iloc[index] 87 | image_tensor = self.get_video(sample) 88 | text = self.get_text(sample) 89 | # index, question_index = self.index_mapper[index] 90 | qid = index 91 | if self.split != "test": 92 | answers, labels, scores = self.get_answer_label(sample) 93 | else: 94 | answers = list() 95 | labels = list() 96 | scores = list() 97 | 98 | return { 99 | "image": image_tensor, 100 | "text": text, 101 | "vqa_answer": answers, 102 | "vqa_labels": labels, 103 | "vqa_scores": scores, 104 | "qid": qid, 105 | } 106 | 107 | def __len__(self): 108 | return len(self.metadata) -------------------------------------------------------------------------------- /univl/README.md: --------------------------------------------------------------------------------
1 | # UniVL + MELTR
2 | 
3 | ## Requirements
4 | 
5 | Our code is implemented under the [UniVL](https://github.com/microsoft/UniVL) environment.
6 | 
7 | ## Datasets
8 | 
9 | We use two datasets (MSRVTT and YouCook2). UniVL also provides the downstream datasets and annotation files [here](https://github.com/microsoft/UniVL/blob/main/dataloaders/README.md).
10 | The MSRVTT annotation files we used can be found [here](https://drive.google.com/drive/folders/1akmVjM6vcjlwuQj7oIN9T_Gtb0bvr5iV).
11 | 
12 | Note: As mentioned in UniVL, the transcripts are not publicly available due to legal issues.
13 | 
14 | 
15 | 
16 | ### YouCook2
17 | 
18 | ```
19 | mkdir -p data
20 | cd data
21 | wget https://github.com/microsoft/UniVL/releases/download/v0/youcookii.zip
22 | unzip youcookii.zip
23 | cd ..
24 | ```
25 | 
26 | ### MSRVTT
27 | 
28 | ```
29 | mkdir -p data
30 | cd data
31 | wget https://github.com/microsoft/UniVL/releases/download/v0/msrvtt.zip
32 | unzip msrvtt.zip
33 | cd ..
34 | ```
35 | 
36 | 
37 | ## Pretrained checkpoint
38 | 
39 | ```
40 | mkdir -p ./checkpoint
41 | wget -P ./checkpoint https://github.com/microsoft/UniVL/releases/download/v0/univl.pretrained.bin
42 | ```
43 | 
44 | ## Training & Evaluation
45 | 
46 | * Text-to-Video Retrieval on YouCook2
47 | 
48 | ```
49 | python main.py --expand_msrvtt_sentences --do_train --train_sim_after_cross \
50 | --init_model ./checkpoint/univl.pretrained.bin --output_dir ckpts/RY \
51 | --bert_model bert-base-uncased --do_lower_case --warmup warmup_linear_down \
52 | --eval_task retrieval --datatype youcook --youcook_train_csv /path/to/data/youcookii_train.csv \
53 | --youcook_val_csv /path/to/data/youcookii_val.csv \
54 | --youcook_features_path /path/to/data/youcookii_videos_features.pickle \
55 | --youcook_data_path /path/to/data/youcookii_data.transcript.pickle
56 | ```
57 | 
58 | * Text-to-Video Retrieval on MSRVTT-9K
59 | 
60 | ```
61 | python main.py --expand_msrvtt_sentences --do_train --train_sim_after_cross \
62 | --init_model ./checkpoint/univl.pretrained.bin --output_dir ckpts/R9K \
63 | --bert_model bert-base-uncased --do_lower_case --warmup warmup_linear_down \
64 | --eval_task retrieval --datatype msrvtt9K --VTT_train_csv /path/to/data/MSRVTT_train.9k.csv \
65 | --VTT_val_csv /path/to/data/MSRVTT_JSFUSION_test.csv \
66 | --VTT_data_path /path/to/data/MSRVTT_transcript_data_v2.json \
67 | --VTT_features_path /path/to/data/msrvtt_videos_features.pickle
68 | ```
69 | 
70 | * Text-to-Video Retrieval on MSRVTT-7K
71 | 
72 | ```
73 | python main.py --expand_msrvtt_sentences --do_train --train_sim_after_cross \
74 | --init_model ./checkpoint/univl.pretrained.bin --output_dir ckpts/R7K \
75 | --bert_model bert-base-uncased --do_lower_case --warmup warmup_linear_down \
76 | --eval_task retrieval --datatype msrvtt7K --VTT_train_csv /path/to/data/MSRVTT_train.9k.csv \
77 | --VTT_val_csv /path/to/data/MSRVTT_JSFUSION_test.csv \
78 | --VTT_data_path /path/to/data/MSRVTT_transcript_data_v2.json \
79 | --VTT_features_path /path/to/data/msrvtt_videos_features.pickle
80 | ```
81 | 
82 | * Captioning on YouCook2
83 | 
84 | ```
85 | python main.py --expand_msrvtt_sentences --do_train --train_sim_after_cross \
86 | --init_model ./checkpoint/univl.pretrained.bin --output_dir ckpts/CY1 \
87 | --bert_model bert-base-uncased --do_lower_case --warmup warmup_linear_down \
88 | --eval_task caption --datatype youcook --youcook_train_csv /path/to/data/youcookii_train.csv \
89 | --youcook_val_csv /path/to/data/youcookii_val.csv \
90 | --youcook_features_path /path/to/data/youcookii_videos_features.pickle \
91 | --youcook_data_path /path/to/data/youcookii_data.transcript.pickle
92 | ```
93 | 
94 | * Captioning on MSRVTT-Full
95 | 
96 | ```
97 | python main.py --expand_msrvtt_sentences --do_train --train_sim_after_cross \
98 | --init_model ./checkpoint/univl.pretrained.bin --output_dir ckpts/CF \
99 | --bert_model bert-base-uncased --do_lower_case --warmup warmup_linear_down \
100 | --eval_task caption --datatype msrvttFull --VTT_train_csv /path/to/data/MSRVTT_train.9k.csv \
101 | --VTT_val_csv /path/to/data/MSRVTT_JSFUSION_test.csv \
102 | --VTT_data_path /path/to/data/MSRVTT_transcript_data_v2.json \
103 | --VTT_features_path /path/to/data/msrvtt_videos_features.pickle
104 | ```
105 | 
106 | 
107 | 
108 | ## Acknowledgement
109 | 
110 | This repo is built upon [UniVL](https://github.com/microsoft/UniVL).
111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /univl/modules/beam.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manage beam search info structure. 3 | Heavily borrowed from OpenNMT-py. 4 | For code in OpenNMT-py, please check the following link (maybe in oldest version): 5 | https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/Beam.py 6 | """ 7 | 8 | import torch 9 | 10 | class Constants(): 11 | def __init__(self): 12 | self.PAD = 0 13 | self.UNK = 1 14 | self.BOS = 2 15 | self.EOS = 3 16 | self.PAD_WORD = '[PAD]' 17 | self.UNK_WORD = '[UNK]' 18 | self.BOS_WORD = '[CLS]' 19 | self.EOS_WORD = '[SEP]' 20 | 21 | @classmethod 22 | def from_tokenizer(cls, tokenizer): 23 | instance = cls() 24 | instance.PAD = tokenizer.vocab[instance.PAD_WORD] 25 | instance.UNK = tokenizer.vocab[instance.UNK_WORD] 26 | instance.BOS = tokenizer.vocab[instance.BOS_WORD] 27 | instance.EOS = tokenizer.vocab[instance.EOS_WORD] 28 | return instance 29 | 30 | class Beam(): 31 | ''' Beam search ''' 32 | 33 | def __init__(self, size, device=False, tokenizer=None): 34 | if tokenizer is None: 35 | self.constants = Constants() 36 | else: 37 | self.constants = Constants.from_tokenizer(tokenizer) 38 | 39 | self.size = size 40 | self._done = False 41 | # The score for each interface on the beam. 42 | self.scores = torch.zeros((size,), dtype=torch.float, device=device) 43 | self.all_scores = [] 44 | 45 | # The backpointers at each time-step. 46 | self.prev_ks = [] 47 | 48 | # The outputs at each time-step. 49 | self.next_ys = [torch.full((size,), self.constants.BOS, dtype=torch.long, device=device)] 50 | 51 | def get_current_state(self): 52 | "Get the outputs for the current timestep." 53 | return self.get_tentative_hypothesis() 54 | 55 | def get_current_origin(self): 56 | "Get the backpointers for the current timestep." 57 | return self.prev_ks[-1] 58 | 59 | @property 60 | def done(self): 61 | return self._done 62 | 63 | def advance(self, word_prob, word_length=None): 64 | 65 | "Update beam status and check if finished or not." 66 | num_words = word_prob.size(1) 67 | # Sum the previous scores. 68 | if len(self.prev_ks) > 0: 69 | beam_lk = word_prob + self.scores.unsqueeze(1).expand_as(word_prob) 70 | else: 71 | beam_lk = word_prob[0] 72 | flat_beam_lk = beam_lk.view(-1) 73 | best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, True) # 1st sort 74 | self.all_scores.append(self.scores) 75 | self.scores = best_scores 76 | # bestScoresId is flattened as a (beam x word) array, 77 | # so we need to calculate which word and beam each score came from 78 | prev_k = best_scores_id // num_words 79 | self.prev_ks.append(prev_k) 80 | self.next_ys.append(best_scores_id - prev_k * num_words) 81 | # End condition is when top-of-beam is EOS. 82 | if self.next_ys[-1][0].item() == self.constants.EOS: 83 | self._done = True 84 | 85 | return self._done 86 | 87 | def sort_scores(self): 88 | "Sort the scores." 89 | return torch.sort(self.scores, 0, True) 90 | 91 | def get_the_best_score_and_idx(self): 92 | "Get the score of the best in the beam." 93 | scores, ids = self.sort_scores() 94 | return scores[1], ids[1] 95 | 96 | def get_tentative_hypothesis(self): 97 | "Get the decoded sequence for the current timestep." 
98 | 99 | if len(self.next_ys) == 1: 100 | dec_seq = self.next_ys[0].unsqueeze(1) 101 | else: 102 | _, keys = self.sort_scores() 103 | hyps = [self.get_hypothesis(k) for k in keys] 104 | hyps = [[self.constants.BOS] + h for h in hyps] 105 | dec_seq = torch.LongTensor(hyps) 106 | 107 | return dec_seq 108 | 109 | def get_hypothesis(self, k): 110 | """ Walk back to construct the full hypothesis. """ 111 | hyp = [] 112 | for j in range(len(self.prev_ks) - 1, -1, -1): 113 | hyp.append(self.next_ys[j+1][k]) 114 | k = self.prev_ks[j][k] 115 | 116 | return list(map(lambda x: x.item(), hyp[::-1])) 117 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/lsmdc_choice.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset 2 | import os 3 | import pandas as pd 4 | from AllInOne.transforms.videoaug import VideoTransform 5 | import random 6 | 7 | 8 | class LSMDCChoiceDataset(BaseDataset): 9 | def __init__(self, *args, split="", **kwargs): 10 | assert split in ["train", "val", "test"] 11 | self.split = split 12 | self.metadata = None 13 | self.ans_lab_dict = None 14 | if split == "train": 15 | names = ["lsmdc_choice_train"] 16 | elif split == "val": 17 | names = ["lsmdc_choice_val"] 18 | elif split == "test": 19 | names = ["lsmdc_choice_test"] # vqav2_test-dev for test-dev 20 | 21 | super().__init__( 22 | *args, 23 | **kwargs, 24 | names=names, 25 | text_column_name="unknown", 26 | remove_duplicate=False, 27 | ) 28 | self._load_metadata() 29 | 30 | def _load_metadata(self): 31 | metadata_dir = './meta_data/lsmdc' 32 | split_files = { 33 | 'train': 'LSMDC16_multiple_choice_train.csv', 34 | 'val': 'LSMDC16_multiple_choice_test_randomized.csv', # 'LSMDC16_multiple_choice_valid.csv', 35 | 'test': 'LSMDC16_multiple_choice_test_randomized.csv' 36 | } 37 | target_split_fp = split_files[self.split] 38 | print(os.path.join(metadata_dir, target_split_fp)) 39 | metadata = pd.read_csv(os.path.join(metadata_dir, target_split_fp), sep='\t', header=None, error_bad_lines=False) 40 | self.metadata = metadata 41 | datalist = [] 42 | for raw_id in range(len(metadata)): 43 | raw_d = metadata.iloc[raw_id] 44 | video_fp = raw_d[0] 45 | sub_path = video_fp.split('.')[0] 46 | remove = sub_path.split('_')[-1] 47 | sub_path = sub_path.replace('_'+remove,'/') 48 | rel_video_fp = sub_path + video_fp + '.avi' 49 | options = [raw_d[idx] for idx in range(5, 10)] 50 | d = dict( 51 | id=video_fp, 52 | vid_id=rel_video_fp, 53 | answer=raw_d[10] - 1 if self.split in ['val', 'test'] else 0, 54 | options=options, 55 | ) 56 | datalist.append(d) 57 | self.metadata = datalist 58 | print("load split {}, {} samples".format(self.split, len(self.metadata))) 59 | 60 | def _get_video_path(self, sample): 61 | rel_video_fp = sample['vid_id'] 62 | full_video_fp = os.path.join(self.data_dir, rel_video_fp) 63 | # print(full_video_fp) 64 | # assert os.path.exists(full_video_fp) 65 | return full_video_fp, rel_video_fp 66 | 67 | def get_text(self, sample): 68 | texts = [] 69 | for text in sample['options']: 70 | encoding = self.tokenizer( 71 | text, 72 | padding="max_length", 73 | truncation=True, 74 | max_length=self.max_text_len, 75 | return_special_tokens_mask=True, 76 | ) 77 | texts.append((text, encoding)) 78 | return texts 79 | 80 | def get_answer_label(self, sample): 81 | answer = sample['answer'] 82 | return answer 83 | 84 | def __getitem__(self, index): 85 | result = False 86 | while not result: 87 | try: 88 | 
sample = self.metadata[index] 89 | image_tensor = self.get_video(sample) 90 | qid = index 91 | answer = self.get_answer_label(sample) 92 | ret = { 93 | "image": image_tensor, 94 | "img_index": index, 95 | "cap_index": index, 96 | "raw_index": index, 97 | 'answer': answer 98 | } 99 | texts = self.get_text(sample) 100 | ret["text"] = texts[0] 101 | for i in range(self.draw_false_text - 1): 102 | ret.update({f"false_text_{i}": texts[i+1]}) 103 | result = True 104 | except Exception as e: 105 | print(f"Error while read file idx {sample['vid_id']} in {self.names[0]} -> {e}") 106 | index = random.randint(0, len(self.metadata) - 1) 107 | return ret 108 | 109 | def __len__(self): 110 | return len(self.metadata) -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/k400.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .video_base_dataset import BaseDataset 3 | import os 4 | import random 5 | import pandas as pd 6 | from AllInOne.transforms.videoaug import VideoTransform 7 | 8 | 9 | class K400Dataset(BaseDataset): 10 | def __init__(self, *args, split="", **kwargs): 11 | assert split in ["train", "val", "test"] 12 | self.split = split 13 | self.metadata = None 14 | self.ans_lab_dict = dict() 15 | if split == "train": 16 | names = ["k400_train"] 17 | elif split == "val": 18 | names = ["k400_val"] 19 | elif split == "test": 20 | names = ["k400_test"] 21 | super().__init__( 22 | *args, 23 | **kwargs, 24 | names=names, 25 | text_column_name="questions", 26 | remove_duplicate=False, 27 | ) 28 | self.video_transform = VideoTransform(mode=self.split) # train or val model 29 | self._load_metadata() 30 | 31 | def _load_metadata(self): 32 | metadata_dir = './meta_data/k400' 33 | split_files = { 34 | 'train': 'k400_train_tsm.list', 35 | 'val': 'k400_test_tsm.list', 36 | 'test': 'k400_test_tsm.list' 37 | } 38 | target_split_fp = split_files[self.split] 39 | with open(os.path.join(metadata_dir, target_split_fp)) as f: 40 | self.metadata = f.readlines() 41 | answer_fp = os.path.join(metadata_dir, 'kinetics_label_map.txt') 42 | count = 0 43 | with open(answer_fp, 'r') as f: 44 | lines = f.readlines() 45 | for line in lines: 46 | self.ans_lab_dict[str(line.strip())] = count 47 | count += 1 48 | 49 | def _get_video_path(self, sample): 50 | # find the name is os.listdir() e.g. abseiling/0wR5jVB-WPk.mp4 51 | # /data/algceph/arcdata/Kinetics-400/train_zips/snowboarding/MCgJO4s1qBA_000129_000139.zip 52 | # -> snowboarding/MCgJO4s1qBA_000129_000139.mp4 53 | if self.split == 'train': 54 | rel_path = sample[0][46:-4] + '.mp4' 55 | else: 56 | # val maybe mkv. webm etc. 
57 | fake_path = sample[0][44:-4] 58 | sub_dir, video_name = fake_path.split('/') 59 | rel_path = sub_dir 60 | for video in os.listdir(os.path.join(self.data_dir, self.split, sub_dir)): 61 | if video_name in video: 62 | rel_path = os.path.join(rel_path, video) 63 | break 64 | full_path = os.path.join(self.data_dir, self.split, rel_path) 65 | # print(full_path) 66 | return full_path, rel_path 67 | 68 | def get_text(self, sample): 69 | text = "A persion is doing [MASK]" 70 | encoding = self.tokenizer( 71 | text, 72 | padding="max_length", 73 | truncation=True, 74 | max_length=self.max_text_len, 75 | return_special_tokens_mask=True, 76 | ) 77 | return (text, encoding) 78 | 79 | def get_answer_label(self, sample): 80 | text = "None" 81 | # print(len(self.ans_lab_dict)) 82 | ans_total_len = len(self.ans_lab_dict) + 1 # one additional class 83 | ans_label = int(sample[1]) 84 | scores = np.zeros(ans_total_len).astype(int) 85 | scores[ans_label] = 1 86 | return text, ans_label, scores 87 | 88 | def __getitem__(self, index): 89 | result = None 90 | while result is None: 91 | sample = self.metadata[index].split('\t') 92 | try: 93 | image_tensor = self.get_video(sample) 94 | text = self.get_text(sample) 95 | qid = index 96 | if self.split != "test": 97 | answers, labels, scores = self.get_answer_label(sample) 98 | else: 99 | answers = list() 100 | labels = list() 101 | scores = list() 102 | result = True 103 | except Exception as e: 104 | print(f"Error while read file idx {sample[0]} -> {e}") 105 | index = random.randint(0, len(self.metadata) - 1) 106 | return { 107 | "image": image_tensor, 108 | "text": text, 109 | "vqa_answer": answers, 110 | "vqa_labels": labels, 111 | "vqa_scores": scores, 112 | "qid": qid, 113 | } 114 | 115 | def __len__(self): 116 | return len(self.metadata) -------------------------------------------------------------------------------- /univl/eval/retrieval.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | import numpy as np 4 | from utils.metrics import compute_metrics 5 | from utils.utils import parallel_apply 6 | global logger 7 | 8 | def _run_on_single_gpu(net, batch_list_t, batch_list_v, batch_sequence_output_list, batch_visual_output_list): 9 | sim_matrix = [] 10 | for idx1, b1 in enumerate(tqdm(batch_list_t)): 11 | input_ids, input_mask, segment_ids, _, _, _, _, _, _ = b1 12 | sequence_output = batch_sequence_output_list[idx1] 13 | each_row = [] 14 | for idx2, b2 in enumerate(batch_list_v): 15 | _, _, _, video, video_mask, _, _, _, _ = b2 16 | visual_output = batch_visual_output_list[idx2] 17 | b1b2_logits = net.get_similarity_logits_align(sequence_output, visual_output, input_mask, video_mask) 18 | b1b2_logits = b1b2_logits.cpu().detach().numpy() 19 | each_row.append(b1b2_logits) 20 | each_row = np.concatenate(tuple(each_row), axis=-1) 21 | sim_matrix.append(each_row) 22 | return sim_matrix 23 | 24 | 25 | def eval_retrieval_epoch(model, test_dataloader, device, n_gpu, logger): 26 | 27 | if hasattr(model, 'module'): 28 | model = model.module.to(device) 29 | else: 30 | model = model.to(device) 31 | model.eval() 32 | with torch.no_grad(): 33 | batch_list = [] 34 | batch_sequence_output_list, batch_visual_output_list = [], [] 35 | for bid, batch in enumerate(tqdm(test_dataloader)): 36 | batch = tuple(t.to(device) for t in batch) 37 | 38 | input_ids, input_mask, segment_ids, video, video_mask, _, _, _, _ = batch 39 | 40 | 41 | sequence_output, visual_output = 
model.get_sequence_visual_output(input_ids, segment_ids, input_mask, video, video_mask) 42 | 43 | batch_sequence_output_list.append(sequence_output) 44 | batch_visual_output_list.append(visual_output) 45 | batch_list.append(batch) 46 | 47 | print("{}/{}\r".format(bid, len(test_dataloader)), end="") 48 | 49 | if n_gpu > 1: 50 | device_ids = list(range(n_gpu)) 51 | batch_list_t_splits = [] 52 | batch_list_v_splits = [] 53 | batch_t_output_splits = [] 54 | batch_v_output_splits = [] 55 | bacth_len = len(batch_list) 56 | split_len = (bacth_len + n_gpu - 1) // n_gpu 57 | for dev_id in device_ids: 58 | s_, e_ = dev_id * split_len, (dev_id + 1) * split_len 59 | if dev_id == 0: 60 | batch_list_t_splits.append(batch_list[s_:e_]) 61 | batch_list_v_splits.append(batch_list) 62 | 63 | batch_t_output_splits.append(batch_sequence_output_list[s_:e_]) 64 | batch_v_output_splits.append(batch_visual_output_list) 65 | else: 66 | devc = torch.device('cuda:{}'.format(str(dev_id))) 67 | devc_batch_list = [tuple(t.to(devc) for t in b) for b in batch_list[s_:e_]] 68 | batch_list_t_splits.append(devc_batch_list) 69 | devc_batch_list = [tuple(t.to(devc) for t in b) for b in batch_list] 70 | batch_list_v_splits.append(devc_batch_list) 71 | 72 | devc_batch_list = [b.to(devc) for b in batch_sequence_output_list[s_:e_]] 73 | batch_t_output_splits.append(devc_batch_list) 74 | devc_batch_list = [b.to(devc) for b in batch_visual_output_list] 75 | batch_v_output_splits.append(devc_batch_list) 76 | parameters_tuple_list = [(batch_list_t_splits[dev_id], batch_list_v_splits[dev_id], 77 | batch_t_output_splits[dev_id], batch_v_output_splits[dev_id]) for dev_id in device_ids] 78 | parallel_outputs = parallel_apply(_run_on_single_gpu, model, parameters_tuple_list, device_ids) 79 | sim_matrix = [] 80 | for idx in range(len(parallel_outputs)): 81 | sim_matrix += parallel_outputs[idx] 82 | sim_matrix = np.concatenate(tuple(sim_matrix), axis=0) 83 | 84 | else: 85 | sim_matrix = _run_on_single_gpu(model, batch_list, batch_list, batch_sequence_output_list, batch_visual_output_list) 86 | sim_matrix = np.concatenate(sim_matrix, axis=0) 87 | 88 | metrics = compute_metrics(sim_matrix) # 53 * (64, 3369) 89 | logger.info('\t Length-T: {}, Length-V:{}'.format(len(sim_matrix), len(sim_matrix[0]))) 90 | logger.info('\t>>> R@1: {:.4f} - R@5: {:.4f} - R@10: {:.4f} - Median R: {}'. 
91 | format(metrics['R1'], metrics['R5'], metrics['R10'], metrics['MR'])) 92 | 93 | R1, R5, R10, MR = metrics['R1'], metrics['R5'], metrics['R10'], metrics['MR'] 94 | 95 | return R1, R5, R10, MR 96 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/activitynet.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset, read_frames_from_img_dir 2 | import random 3 | import os 4 | import pandas as pd 5 | 6 | 7 | class ActivityNetDataset(BaseDataset): 8 | def __init__(self, *args, split="", **kwargs): 9 | assert split in ["train", "val", "test"] 10 | self.split = split 11 | self.metadata = None 12 | if split == "train": 13 | names = ["activitynet_train"] 14 | elif split == "val": 15 | names = ["activitynet_val"] 16 | elif split == "test": 17 | names = ["activitynet_val"] 18 | self._load_metadata() 19 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 20 | 21 | def _load_metadata(self): 22 | metadata_dir = './meta_data/activitynet' 23 | split_files = { 24 | 'train': 'train.jsonl', 25 | 'val': 'val1.jsonl', 26 | 'test': 'val2.jsonl' 27 | } 28 | target_split_fp = split_files[self.split] 29 | metadata = pd.read_json(os.path.join(metadata_dir, target_split_fp), lines=True) 30 | self.metadata = metadata 31 | 32 | def _get_video_path(self, sample): 33 | rel_video_fp = sample['clip_name'] 34 | full_video_fp = os.path.join(self.data_dir, 'activitynet_frames', rel_video_fp) 35 | return full_video_fp, rel_video_fp 36 | 37 | def get_raw_video(self, sample): 38 | abs_fp, rel_fp = self._get_video_path(sample) 39 | imgs, idxs, vlen = read_frames_from_img_dir(abs_fp, self.num_frames, mode=self.split) 40 | if imgs is None: 41 | raise Exception("Invalid img!", rel_fp) 42 | else: 43 | return imgs 44 | 45 | def get_video(self, index, sample, image_key="image"): 46 | imgs = self.get_raw_video(sample).permute(1, 0, 2, 3) # to cthw 47 | imgs_tensor = [self.video_transform(imgs).permute(1, 0, 2, 3)] # to tchw 48 | return { 49 | "image": imgs_tensor, 50 | "img_index": index, 51 | "cap_index": index, 52 | "raw_index": index, 53 | } 54 | 55 | def get_false_video(self, rep, image_key="image"): 56 | random_index = random.randint(0, len(self.metadata) - 1) 57 | sample = self.metadata.iloc[random_index] 58 | imgs = self.get_raw_video(sample).permute(1, 0, 2, 3) # to cthw 59 | # can be different augmentation 60 | imgs_tensor = [self.video_transform(imgs).permute(1, 0, 2, 3)] # to tchw 61 | return {f"false_image_{rep}": imgs_tensor} 62 | 63 | def get_text(self, raw_index, sample): 64 | text = sample['caption'] 65 | # print(text) 66 | encoding = self.tokenizer( 67 | text, 68 | padding="max_length", 69 | truncation=True, 70 | max_length=self.max_text_len, 71 | return_special_tokens_mask=True, 72 | ) 73 | # print(encoding.size()) 74 | return { 75 | "text": (text, encoding), 76 | "img_index": raw_index, 77 | "cap_index": raw_index, 78 | "raw_index": raw_index, 79 | } 80 | 81 | def get_false_text(self, rep): 82 | random_index = random.randint(0, len(self.metadata) - 1) 83 | sample = self.metadata.iloc[random_index] 84 | text = sample['caption'] 85 | encoding = self.tokenizer( 86 | text, 87 | # padding="max_length", 88 | truncation=True, 89 | max_length=self.max_text_len, 90 | return_special_tokens_mask=True, 91 | ) 92 | return {f"false_text_{rep}": (text, encoding)} 93 | 94 | def get_suite(self, index): 95 | result = None 96 | while result is None: 97 | sample = 
self.metadata.iloc[index] 98 | try: 99 | ret = dict() 100 | ret.update(self.get_video(index, sample)) 101 | if not self.image_only: 102 | txt = self.get_text(index, sample) 103 | ret.update({"replica": True if txt["cap_index"] > 0 else False}) 104 | ret.update(txt) 105 | 106 | for i in range(self.draw_false_image): 107 | ret.update(self.get_false_video(i)) 108 | for i in range(self.draw_false_text): 109 | ret.update(self.get_false_text(i)) 110 | result = True 111 | except Exception as e: 112 | print(f"Error while read file idx {sample.name} in {self.names[0]} -> {e}") 113 | index = random.randint(0, len(self.metadata) - 1) 114 | return ret 115 | 116 | def __len__(self): 117 | return len(self.metadata) 118 | 119 | def __getitem__(self, index): 120 | return self.get_suite(index) -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/webvid.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset, read_frames_decord 2 | import random 3 | import os 4 | import pandas as pd 5 | 6 | 7 | class WEBVIDDataset(BaseDataset): 8 | def __init__(self, *args, split="", **kwargs): 9 | assert split in ["train", "val", "test"] 10 | self.split = split 11 | self.metadata = None 12 | self.cut = "jsfusion" 13 | if split == "train": 14 | names = ["webvid_train"] 15 | elif split == "val": 16 | names = ["webvid_val"] 17 | elif split == "test": 18 | names = ["webvid_val"] 19 | self._load_metadata() 20 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 21 | 22 | def _load_metadata(self): 23 | metadata_dir = './meta_data/webvid' 24 | split_files = { 25 | 'train': 'webvid_training_success_full.tsv', 26 | 'val': 'webvid_validation_success_full.tsv', # there is no test 27 | 'test': 'webvid_validation_success_full.tsv' 28 | } 29 | target_split_fp = split_files[self.split] 30 | metadata = pd.read_csv(os.path.join(metadata_dir, target_split_fp), sep='\t') 31 | self.metadata = metadata 32 | 33 | def _get_video_path(self, sample): 34 | rel_video_fp = sample[1] + '.mp4' 35 | full_video_fp = os.path.join(self.data_dir, self.split, rel_video_fp) 36 | return full_video_fp, rel_video_fp 37 | 38 | def _get_caption(self, sample): 39 | return sample[0] 40 | 41 | def get_raw_video(self, sample): 42 | abs_fp, rel_fp = self._get_video_path(sample) 43 | imgs, idxs, vlen = read_frames_decord(abs_fp, self.num_frames, mode=self.split) 44 | if imgs is None: 45 | raise Exception("Invalid img!", rel_fp) 46 | else: 47 | return imgs 48 | 49 | def get_video(self, index, sample, image_key="image"): 50 | imgs = self.get_raw_video(sample).permute(1, 0, 2, 3) # to cthw 51 | imgs_tensor = [self.video_transform(imgs).permute(1, 0, 2, 3)] # to tchw 52 | return { 53 | "image": imgs_tensor, 54 | "img_index": index, 55 | "cap_index": index, 56 | "raw_index": index, 57 | } 58 | 59 | def get_false_video(self, rep, image_key="image"): 60 | random_index = random.randint(0, len(self.metadata) - 1) 61 | sample = self.metadata.iloc[random_index] 62 | imgs = self.get_raw_video(sample).permute(1, 0, 2, 3) # to cthw 63 | # can be different augmentation 64 | imgs_tensor = [self.video_transform(imgs).permute(1, 0, 2, 3)] # to tchw 65 | return {f"false_image_{rep}": imgs_tensor} 66 | 67 | def get_text(self, raw_index, sample): 68 | text = sample[0] 69 | # print(text) 70 | encoding = self.tokenizer( 71 | text, 72 | padding="max_length", 73 | truncation=True, 74 | max_length=self.max_text_len, 75 | 
return_special_tokens_mask=True, 76 | ) 77 | # print(encoding.size()) 78 | return { 79 | "text": (text, encoding), 80 | "img_index": raw_index, 81 | "cap_index": raw_index, 82 | "raw_index": raw_index, 83 | } 84 | 85 | def get_false_text(self, rep): 86 | random_index = random.randint(0, len(self.metadata) - 1) 87 | sample = self.metadata.iloc[random_index] 88 | text = sample[0] 89 | encoding = self.tokenizer( 90 | text, 91 | # padding="max_length", 92 | truncation=True, 93 | max_length=self.max_text_len, 94 | return_special_tokens_mask=True, 95 | ) 96 | return {f"false_text_{rep}": (text, encoding)} 97 | 98 | def get_suite(self, index): 99 | result = None 100 | while result is None: 101 | sample = self.metadata.iloc[index] 102 | try: 103 | ret = dict() 104 | ret.update(self.get_video(index, sample)) 105 | if not self.image_only: 106 | txt = self.get_text(index, sample) 107 | ret.update({"replica": True if txt["cap_index"] > 0 else False}) 108 | ret.update(txt) 109 | 110 | for i in range(self.draw_false_image): 111 | ret.update(self.get_false_video(i)) 112 | for i in range(self.draw_false_text): 113 | ret.update(self.get_false_text(i)) 114 | result = True 115 | except Exception as e: 116 | print(f"Error while read file idx {sample.name} in {self.names[0]} -> {e}") 117 | index = random.randint(0, len(self.metadata) - 1) 118 | return ret 119 | 120 | def __len__(self): 121 | return len(self.metadata) 122 | 123 | def __getitem__(self, index): 124 | return self.get_suite(index) -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/tvqaplus.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset 2 | import os 3 | import pandas as pd 4 | import cv2 5 | import torch 6 | from AllInOne.datasets.video_base_dataset import sample_frames 7 | 8 | # each sample: https://tvqa.cs.unc.edu/download_tvqa_plus.html 9 | # { 10 | # "answer_idx": "1", 11 | # "qid": 134094, 12 | # "ts": [5.99, 11.98], 13 | # "a1": "Howard is talking to Raj and Leonard", 14 | # "a0": "Howard is talking to Bernadette", 15 | # "a3": "Howard is talking to Leonard and Penny", 16 | # "a2": "Howard is talking to Sheldon , and Raj", 17 | # "q": "Who is Howard talking to when he is in the lab room ?", 18 | # "vid_name": "s05e02_seg02_clip_00", 19 | # "a4": "Howard is talking to Penny and Bernadette", 20 | # "bbox": { 21 | # "14": [ 22 | # { 23 | # "img_id": 14, 24 | # "top": 153, 25 | # "label": "Howard", 26 | # "width": 180, 27 | # "height": 207, 28 | # "left": 339 29 | # }, 30 | # { 31 | # "img_id": 14, 32 | # "top": 6, 33 | # "label": "lab", 34 | # "width": 637, 35 | # "height": 354, 36 | # "left": 3 37 | # }, 38 | # ... 39 | # ], 40 | # "20": [ ... ], 41 | # "26": [ ... ], 42 | # "32": [ ... ], 43 | # "38": [ ... 
] 44 | # } 45 | # } 46 | 47 | 48 | class TVQAPLUSDataset(BaseDataset): 49 | def __init__(self, *args, split="", **kwargs): 50 | assert split in ["train", "val", "test"] 51 | self.split = split 52 | self.metadata = None 53 | self._load_metadata() 54 | if split == "train": 55 | names = ["tvqaplus_train"] 56 | elif split == "val": 57 | names = ["tvqaplus_val"] 58 | elif split == "test": 59 | names = ["tvqaplus_test"] 60 | 61 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 62 | # for appear objects 63 | self.only_use_relevant_dets = True 64 | if self.only_use_relevant_dets: 65 | self.relevant_dets = [] # resort the detection numbers 66 | self.relevant_dets_classes = [] 67 | 68 | def _load_metadata(self): 69 | # download specific 70 | metadata_dir = './meta_data/tvqa' 71 | split_files = { 72 | 'train': 'tvqa_plus_train.jsonl', 73 | 'val': 'tvqa_plus_val.jsonl', 74 | 'test': 'tvqa_plus_test_public.jsonl' # no GT label for test set 75 | } 76 | target_split_fp = split_files[self.split] 77 | metadata = pd.read_json(os.path.join(metadata_dir, target_split_fp), lines=True) 78 | self.metadata = metadata 79 | 80 | def _get_image_path(self, sample): 81 | rel_fp = sample['vid_name'] 82 | return os.path.join(self.data_dir, rel_fp), rel_fp 83 | 84 | def _get_caption(self, sample): 85 | return sample[0] 86 | 87 | # tvqaplus provide sampled frames (3 fps) 88 | # To Do: considering sample one frame with bounding box 89 | def get_raw_video(self, sample): 90 | abs_fp, rel_fp = self._get_image_path(sample) 91 | [beg_time, end_time] = sample['ts'] 92 | clip_len = int((float(end_time) - float(beg_time)) * 3) 93 | rel_frame_index = sample_frames(self.num_frames, clip_len) 94 | # sample N frames here 95 | frames = [] 96 | for index in rel_frame_index: 97 | img = cv2.imread(abs_fp + '{}.jpg'.format(index)) 98 | frame = torch.from_numpy(img).byte() 99 | frame = frame.permute(2, 0, 1) 100 | frames.append(frame) 101 | frames = torch.stack(frames).permute(1, 0, 2, 3) 102 | return frames 103 | 104 | def get_text(self, sample): 105 | question = self.get_question(sample) 106 | qa_texts = [] 107 | # 5 choices 108 | for i in range(5): 109 | raw_text = question + "[SEP]" + sample["a{}".format(i)] 110 | qa_encoding = self.tokenizer( 111 | raw_text, 112 | padding="max_length", 113 | truncation=True, 114 | max_length=self.max_text_len, 115 | return_special_tokens_mask=True, 116 | ) 117 | qa_texts.append((raw_text, qa_encoding)) 118 | return qa_texts 119 | 120 | def get_answer_label(self, sample): 121 | answer = int(sample['answer_idx']) 122 | return answer 123 | 124 | def get_question(self, sample): 125 | return sample["q"] 126 | 127 | def __len__(self): 128 | return len(self.metadata) 129 | 130 | def __getitem__(self, index): 131 | sample = self.metadata.iloc[index] 132 | self.relevant_dets = [] # initalize 133 | self.relevant_dets_classes = [] 134 | answer = self.get_answer_label(sample) 135 | ret = { 136 | "img_index": index, 137 | "cap_index": index, 138 | "raw_index": index, 139 | 'answer': answer 140 | } 141 | qa_texts = self.get_text(sample) 142 | ret["text"] = qa_texts[0] 143 | for i in range(self.draw_options_text - 1): 144 | ret.update({f"options_text_{i}": qa_texts[i+1]}) 145 | video_tensor = self.get_video(sample) 146 | ret["image"] = video_tensor 147 | return ret 148 | 149 | -------------------------------------------------------------------------------- /violet/eval_retrieval.py: -------------------------------------------------------------------------------- 1 | 2 | from lib import * 3 
| from dataset import Dataset_Base 4 | from model import VIOLET_Base 5 | from agent import Agent_Base 6 | import pandas as pd 7 | 8 | class Dataset_Retrieval(Dataset_Base): 9 | def __init__(self, args, split): 10 | super().__init__(args) 11 | self.split = split 12 | 13 | dataset = args['dataset'] 14 | self.imgs = pickle.load(open(f'./_data/{dataset}/img_{dataset}.pkl', 'rb')) 15 | self.vq = pickle.load(open(f'./_data/{dataset}/{dataset}_vq.pkl', 'rb')) 16 | 17 | if split == 'train': 18 | self.data = json.load(open(args['train_annotation'], 'r'))['sentences'] 19 | else: 20 | self.data = json.load(open(args['test_annotation'], 'r'))['sentences'] 21 | 22 | def __len__(self): 23 | return len(self.data) 24 | 25 | def __getitem__(self, idx): 26 | vid = self.data[idx]['video_id'] 27 | txt, mask = self.str2txt(self.data[idx]['caption']) 28 | 29 | img = [] 30 | for b in self.imgs[vid]: 31 | img.append(self.str2img(b).unsqueeze(0)) 32 | img = T.cat(img, dim=0) 33 | 34 | return img, txt, mask, vid 35 | 36 | class Dataset_Product(T.utils.data.Dataset): 37 | def __init__(self, feat): 38 | super().__init__() 39 | 40 | self.vid2idx = {v: i for i, v in enumerate(feat)} 41 | self.lst = [[feat[p], feat[q]] for p in feat for q in feat] 42 | 43 | def __len__(self): 44 | return len(self.lst) 45 | 46 | def __getitem__(self, idx): 47 | p, q = self.lst[idx] 48 | 49 | return [p['feat_txt'], p['mask_txt'], self.vid2idx[p['video']], 50 | q['feat_img'], q['mask_img'], self.vid2idx[q['video']]] # (p->text, q->video) 51 | 52 | class VIOLET_Retrieval(VIOLET_Base): 53 | def __init__(self): 54 | super().__init__() 55 | 56 | self.fc = T.nn.Sequential(*[T.nn.Dropout(0.1), 57 | T.nn.Linear(768, 768*2), T.nn.ReLU(inplace=True), 58 | T.nn.Linear(768*2, 1)]) 59 | 60 | def forward(self, typ, 61 | img=None, txt=None, mask=None, 62 | feat_img=None, mask_img=None, feat_txt=None, mask_txt=None): 63 | 64 | if typ=='feat': 65 | feat_img, mask_img, feat_txt, mask_txt = self.go_feat(img, txt, mask) 66 | return feat_img, mask_img, feat_txt, mask_txt 67 | 68 | elif typ=='cross': 69 | out, _ = self.go_cross(feat_img, mask_img, feat_txt, mask_txt) 70 | out = self.fc(out[:, feat_img.shape[1], :]).squeeze() 71 | return out 72 | 73 | if __name__=='__main__': 74 | args = json.load(open(sys.argv[1], 'r')) 75 | args['size_batch'] = 100*T.cuda.device_count() 76 | print(args) 77 | 78 | model = T.nn.DataParallel(VIOLET_Retrieval().cuda()) 79 | model.module.load_ckpt(args['path_ckpt']) 80 | model.eval() 81 | 82 | for split in ['val']: 83 | dl = T.utils.data.DataLoader(Dataset_Retrieval(args, split), 84 | batch_size=args['size_batch'], shuffle=False, 85 | num_workers=64, pin_memory=True) 86 | feat = {} 87 | for img, txt, mask, vid in tqdm(dl, ascii=True): 88 | with T.no_grad(): 89 | feat_img, mask_img, feat_txt, mask_txt = model(typ='feat', img=img.cuda(), txt=txt.cuda(), mask=mask.cuda()) 90 | for v, f_i, m_i, f_t, m_t in zip(vid, *[d.data.cpu().numpy() for d in [feat_img, mask_img, feat_txt, mask_txt]]): 91 | feat[v] = {'video': v, 'feat_img': f_i, 'mask_img': m_i, 'feat_txt': f_t, 'mask_txt': m_t} 92 | 93 | dl = T.utils.data.DataLoader(Dataset_Product(feat), 94 | batch_size=args['size_batch'], shuffle=False, 95 | num_workers=64, pin_memory=True) 96 | rank = {} 97 | for feat_txt, mask_txt, idx_txt, feat_img, mask_img, idx_vid in tqdm(dl, ascii=True): 98 | with T.no_grad(): 99 | out = model(typ='cross', feat_img=feat_img, mask_img=mask_img, feat_txt=feat_txt, mask_txt=mask_txt) 100 | out = T.sigmoid(out).data.cpu().numpy() 101 | for i_t, i_v, o in 
zip(idx_txt, idx_vid, out): 102 | i_t, i_v, o = int(i_t), int(i_v), float(o) 103 | 104 | if not i_t in rank: 105 | rank[i_t] = [] 106 | rank[i_t].append([i_v, o]) 107 | 108 | res = {'r@1': 0, 'r@5': 0, 'r@10': 0, 'median': []} 109 | for i_t in rank: 110 | tmp = sorted(rank[i_t], key=lambda d: -d[1]) 111 | p = [d[0] for d in tmp].index(i_t)+1 112 | 113 | if p<=1: 114 | res['r@1'] += 1.0/len(rank) 115 | if p<=5: 116 | res['r@5'] += 1.0/len(rank) 117 | if p<=10: 118 | res['r@10'] += 1.0/len(rank) 119 | res['median'].append(p) 120 | res['median'] = int(np.median(res['median'])) 121 | 122 | print(split, res) 123 | with open('result.txt', 'a') as f: 124 | text = f"r@1: {res['r@1']}, r@5: {res['r@5']}, r@10: {res['r@10']}, median: {res['median']}\n" 125 | f.write(text) -------------------------------------------------------------------------------- /univl/modules/until_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """PyTorch BERT model.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import copy 24 | import json 25 | import logging 26 | import tarfile 27 | import tempfile 28 | import shutil 29 | import torch 30 | from .file_utils import cached_path 31 | 32 | logger = logging.getLogger(__name__) 33 | 34 | class PretrainedConfig(object): 35 | 36 | pretrained_model_archive_map = {} 37 | config_name = "" 38 | weights_name = "" 39 | 40 | @classmethod 41 | def get_config(cls, pretrained_model_name, cache_dir, type_vocab_size, state_dict, task_config=None): 42 | archive_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), pretrained_model_name) 43 | if os.path.exists(archive_file) is False: 44 | if pretrained_model_name in cls.pretrained_model_archive_map: 45 | archive_file = cls.pretrained_model_archive_map[pretrained_model_name] 46 | else: 47 | archive_file = pretrained_model_name 48 | 49 | # redirect to the cache, if necessary 50 | try: 51 | resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) 52 | except FileNotFoundError: 53 | if task_config is None or task_config.local_rank == 0: 54 | logger.error( 55 | "Model name '{}' was not found in model name list. 
" 56 | "We assumed '{}' was a path or url but couldn't find any file " 57 | "associated to this path or url.".format( 58 | pretrained_model_name, 59 | archive_file)) 60 | return None 61 | if resolved_archive_file == archive_file: 62 | if task_config is None or task_config.local_rank == 0: 63 | logger.info("loading archive file {}".format(archive_file)) 64 | else: 65 | if task_config is None or task_config.local_rank == 0: 66 | logger.info("loading archive file {} from cache at {}".format( 67 | archive_file, resolved_archive_file)) 68 | tempdir = None 69 | if os.path.isdir(resolved_archive_file): 70 | serialization_dir = resolved_archive_file 71 | else: 72 | # Extract archive to temp dir 73 | tempdir = tempfile.mkdtemp() 74 | if task_config is None or task_config.local_rank == 0: 75 | logger.info("extracting archive file {} to temp dir {}".format( 76 | resolved_archive_file, tempdir)) 77 | with tarfile.open(resolved_archive_file, 'r:gz') as archive: 78 | archive.extractall(tempdir) 79 | serialization_dir = tempdir 80 | # Load config 81 | config_file = os.path.join(serialization_dir, cls.config_name) 82 | config = cls.from_json_file(config_file) 83 | config.type_vocab_size = type_vocab_size 84 | if task_config is None or task_config.local_rank == 0: 85 | logger.info("Model config {}".format(config)) 86 | 87 | if state_dict is None: 88 | weights_path = os.path.join(serialization_dir, cls.weights_name) 89 | if os.path.exists(weights_path): 90 | state_dict = torch.load(weights_path, map_location='cpu') 91 | else: 92 | if task_config is None or task_config.local_rank == 0: 93 | logger.info("Weight doesn't exsits. {}".format(weights_path)) 94 | 95 | if tempdir: 96 | # Clean up temp dir 97 | shutil.rmtree(tempdir) 98 | 99 | return config, state_dict 100 | 101 | @classmethod 102 | def from_dict(cls, json_object): 103 | """Constructs a `BertConfig` from a Python dictionary of parameters.""" 104 | config = cls(vocab_size_or_config_json_file=-1) 105 | for key, value in json_object.items(): 106 | config.__dict__[key] = value 107 | return config 108 | 109 | @classmethod 110 | def from_json_file(cls, json_file): 111 | """Constructs a `BertConfig` from a json file of parameters.""" 112 | with open(json_file, "r", encoding='utf-8') as reader: 113 | text = reader.read() 114 | return cls.from_dict(json.loads(text)) 115 | 116 | def __repr__(self): 117 | return str(self.to_json_string()) 118 | 119 | def to_dict(self): 120 | """Serializes this instance to a Python dictionary.""" 121 | output = copy.deepcopy(self.__dict__) 122 | return output 123 | 124 | def to_json_string(self): 125 | """Serializes this instance to a JSON string.""" 126 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" --------------------------------------------------------------------------------