├── allinone
│   ├── __init__.py
│   ├── AllInOne
│   │   ├── __init__.py
│   │   ├── gadgets
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   └── my_metrics.cpython-37.pyc
│   │   │   └── my_metrics.py
│   │   ├── datasets
│   │   │   ├── k400_zero_shot.py
│   │   │   ├── __pycache__
│   │   │   │   ├── vcr.cpython-37.pyc
│   │   │   │   ├── cc3m.cpython-37.pyc
│   │   │   │   ├── didemo.cpython-37.pyc
│   │   │   │   ├── ego4d.cpython-37.pyc
│   │   │   │   ├── hmdb51.cpython-37.pyc
│   │   │   │   ├── k400.cpython-37.pyc
│   │   │   │   ├── msrvtt.cpython-37.pyc
│   │   │   │   ├── msvd.cpython-37.pyc
│   │   │   │   ├── msvdqa.cpython-37.pyc
│   │   │   │   ├── tgif.cpython-37.pyc
│   │   │   │   ├── tgifqa.cpython-37.pyc
│   │   │   │   ├── tvqa.cpython-37.pyc
│   │   │   │   ├── webvid.cpython-37.pyc
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   ├── msrvttqa.cpython-37.pyc
│   │   │   │   ├── activitynet.cpython-37.pyc
│   │   │   │   ├── howto100m.cpython-37.pyc
│   │   │   │   ├── yttemporal.cpython-37.pyc
│   │   │   │   ├── base_dataset.cpython-37.pyc
│   │   │   │   ├── ego4d_choice.cpython-37.pyc
│   │   │   │   ├── lsmdc_choice.cpython-37.pyc
│   │   │   │   ├── lsmdc_dataset.cpython-37.pyc
│   │   │   │   ├── msrvtt_choice.cpython-37.pyc
│   │   │   │   ├── nlvr2_dataset.cpython-37.pyc
│   │   │   │   ├── vqav2_dataset.cpython-37.pyc
│   │   │   │   ├── vg_caption_dataset.cpython-37.pyc
│   │   │   │   ├── video_base_dataset.cpython-37.pyc
│   │   │   │   ├── sbu_caption_dataset.cpython-37.pyc
│   │   │   │   ├── coco_caption_karpathy_dataset.cpython-37.pyc
│   │   │   │   └── f30k_caption_karpathy_dataset.cpython-37.pyc
│   │   │   ├── sbu_caption_dataset.py
│   │   │   ├── vg_caption_dataset.py
│   │   │   ├── f30k_caption_karpathy_dataset.py
│   │   │   ├── coco_caption_karpathy_dataset.py
│   │   │   ├── __init__.py
│   │   │   ├── didemo.py
│   │   │   ├── vqav2_dataset.py
│   │   │   ├── nlvr2_dataset.py
│   │   │   ├── msvd.py
│   │   │   ├── lsmdc_dataset.py
│   │   │   ├── ego4d.py
│   │   │   ├── msrvtt.py
│   │   │   ├── msrvtt_choice.py
│   │   │   ├── hmdb51_zero_shot.py
│   │   │   ├── hmdb51.py
│   │   │   ├── ego4d_choice.py
│   │   │   ├── msrvttqa.py
│   │   │   ├── msvdqa.py
│   │   │   ├── tgif.py
│   │   │   ├── lsmdc_choice.py
│   │   │   ├── k400.py
│   │   │   ├── activitynet.py
│   │   │   ├── webvid.py
│   │   │   └── tvqaplus.py
│   │   ├── modules
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── heads.cpython-37.pyc
│   │   │   │   ├── meltr.cpython-37.pyc
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   ├── dist_utils.cpython-37.pyc
│   │   │   │   ├── objectives.cpython-37.pyc
│   │   │   │   ├── allinone_utils.cpython-37.pyc
│   │   │   │   ├── temporal_roll.cpython-37.pyc
│   │   │   │   ├── allinone_module.cpython-37.pyc
│   │   │   │   ├── retrieval_metrics.cpython-37.pyc
│   │   │   │   └── base_vision_transformer.cpython-37.pyc
│   │   │   ├── forzen_param.py
│   │   │   ├── heads.py
│   │   │   ├── temporal_roll.py
│   │   │   └── meltr.py
│   │   ├── __pycache__
│   │   │   ├── config.cpython-37.pyc
│   │   │   └── __init__.cpython-37.pyc
│   │   ├── transforms
│   │   │   ├── __pycache__
│   │   │   │   ├── mix.cpython-37.pyc
│   │   │   │   ├── utils.cpython-37.pyc
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   ├── pixelbert.cpython-37.pyc
│   │   │   │   ├── randaug.cpython-37.pyc
│   │   │   │   ├── videoaug.cpython-37.pyc
│   │   │   │   ├── functional.cpython-37.pyc
│   │   │   │   └── video_transform.cpython-37.pyc
│   │   │   ├── __init__.py
│   │   │   ├── pixelbert.py
│   │   │   ├── mix.py
│   │   │   ├── utils.py
│   │   │   ├── videoaug.py
│   │   │   └── functional.py
│   │   └── datamodules
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-37.pyc
│   │       │   ├── cc3m_datamodule.cpython-37.pyc
│   │       │   ├── datamodule_base.cpython-37.pyc
│   │       │   ├── k400_datamodule.cpython-37.pyc
│   │       │   ├── msvd_datamodule.cpython-37.pyc
│   │       │   ├── sbu_datamodule.cpython-37.pyc
│   │       │   ├── tgif_datamodule.cpython-37.pyc
│   │       │   ├── tvqa_datamodule.cpython-37.pyc
│   │       │   ├── vcr_datamodule.cpython-37.pyc
│   │       │   ├── didemo_datamodule.cpython-37.pyc
│   │       │   ├── ego4d_datamodule.cpython-37.pyc
│   │       │   ├── hmdb51_datamodule.cpython-37.pyc
│   │       │   ├── lsmdc_datamodule.cpython-37.pyc
│   │       │   ├── msrvtt_datamodule.cpython-37.pyc
│   │       │   ├── msvdqa_datamodule.cpython-37.pyc
│   │       │   ├── nlvr2_datamodule.cpython-37.pyc
│   │       │   ├── tgifqa_datamodule.cpython-37.pyc
│   │       │   ├── vqav2_datamodule.cpython-37.pyc
│   │       │   ├── webvid_datamodule.cpython-37.pyc
│   │       │   ├── howto100m_datamodule.cpython-37.pyc
│   │       │   ├── msrvttqa_datamodule.cpython-37.pyc
│   │       │   ├── multitask_datamodule.cpython-37.pyc
│   │       │   ├── activitynet_datamodule.cpython-37.pyc
│   │       │   ├── ego4d_choice_datamodule.cpython-37.pyc
│   │       │   ├── lsmdc_choice_datamodule.cpython-37.pyc
│   │       │   ├── vg_caption_datamodule.cpython-37.pyc
│   │       │   ├── yttemporal_datamodule.cpython-37.pyc
│   │       │   ├── msrvtt_choice_datamodule.cpython-37.pyc
│   │       │   ├── coco_caption_karpathy_datamodule.cpython-37.pyc
│   │       │   └── f30k_caption_karpathy_datamodule.cpython-37.pyc
│   │       ├── vcr_datamodule.py
│   │       ├── cc3m_datamodule.py
│   │       ├── nlvr2_datamodule.py
│   │       ├── sbu_datamodule.py
│   │       ├── activitynet_datamodule.py
│   │       ├── vg_caption_datamodule.py
│   │       ├── k400_datamodule.py
│   │       ├── msvd_datamodule.py
│   │       ├── tgif_datamodule.py
│   │       ├── tvqa_datamodule.py
│   │       ├── ego4d_datamodule.py
│   │       ├── lsmdc_datamodule.py
│   │       ├── didemo_datamodule.py
│   │       ├── hmdb51_datamodule.py
│   │       ├── msrvtt_datamodule.py
│   │       ├── tgifqa_datamodule.py
│   │       ├── webvid_datamodule.py
│   │       ├── howto100m_datamodule.py
│   │       ├── yttemporal_datamodule.py
│   │       ├── ego4d_choice_datamodule.py
│   │       ├── lsmdc_choice_datamodule.py
│   │       ├── msrvtt_choice_datamodule.py
│   │       ├── coco_caption_karpathy_datamodule.py
│   │       ├── f30k_caption_karpathy_datamodule.py
│   │       ├── msvdqa_datamodule.py
│   │       ├── msrvttqa_datamodule.py
│   │       ├── vqav2_datamodule.py
│   │       ├── __init__.py
│   │       └── multitask_datamodule.py
│   ├── setup.py
│   ├── param_and_flop.py
│   ├── move_pretrained_weight.py
│   ├── README.md
│   ├── run.py
│   └── requirements.txt
├── univl
│   ├── eval
│   │   ├── __init__.py
│   │   └── retrieval.py
│   ├── utils
│   │   ├── __init__.py
│   │   └── metrics.py
│   ├── modules
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── beam.cpython-37.pyc
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   ├── file_utils.cpython-37.pyc
│   │   │   ├── module_bert.cpython-37.pyc
│   │   │   ├── module_cross.cpython-37.pyc
│   │   │   ├── module_decoder.cpython-37.pyc
│   │   │   ├── module_meltr.cpython-37.pyc
│   │   │   ├── module_visual.cpython-37.pyc
│   │   │   ├── tokenization.cpython-37.pyc
│   │   │   ├── until_config.cpython-37.pyc
│   │   │   └── until_module.cpython-37.pyc
│   │   ├── cross-base
│   │   │   └── cross_config.json
│   │   ├── visual-base
│   │   │   └── visual_config.json
│   │   ├── bert-base-uncased
│   │   │   └── bert_config.json
│   │   ├── decoder-base
│   │   │   └── decoder_config.json
│   │   ├── optimization_MELTR.py
│   │   ├── meltr.py
│   │   ├── beam.py
│   │   └── until_config.py
│   ├── requirements.txt
│   ├── dataloaders
│   │   ├── __pycache__
│   │   │   ├── dataloader_meta_msrvtt.cpython-37.pyc
│   │   │   ├── dataloader_meta_youcook.cpython-37.pyc
│   │   │   └── dataloader_msrvtt_caption.cpython-37.pyc
│   │   └── README.md
│   ├── asset
│   │   └── bert_config.json
│   ├── LICENSE
│   └── README.md
├── asset
│   └── main.png
├── violet
│   ├── lib.py
│   ├── args
│   │   ├── args_msvd-qaoe.json
│   │   ├── args_tgif-action.json
│   │   ├── args_tgif-frame.json
│   │   ├── args_tgif-transition.json
│   │   ├── args_msrvtt-retrieval_7k.json
│   │   ├── args_msrvtt-retrieval_9k.json
│   │   └── args_msrvtt-retrieval_eval.json
│   ├── tools
│   │   ├── extract_tsv.py
│   │   ├── extract_video-frame.py
│   │   └── extract_vq.py
│   ├── dataset.py
│   ├── agent.py
│   ├── utils.py
│   ├── README.md
│   ├── meltr.py
│   ├── model.py
│   └── eval_retrieval.py
├── LICENSE
└── README.md

/allinone/__init__.py:
--------------------------------------------------------------------------------

/univl/eval/__init__.py:
--------------------------------------------------------------------------------

/univl/utils/__init__.py:
--------------------------------------------------------------------------------

/univl/modules/__init__.py:
--------------------------------------------------------------------------------

/allinone/AllInOne/__init__.py:
--------------------------------------------------------------------------------

/allinone/AllInOne/gadgets/__init__.py:
--------------------------------------------------------------------------------

/allinone/AllInOne/datasets/k400_zero_shot.py:
--------------------------------------------------------------------------------

/asset/main.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mlvlab/MELTR/HEAD/asset/main.png

/univl/requirements.txt:
--------------------------------------------------------------------------------
torch==1.7.0
tqdm
boto3
requests
pandas

/allinone/AllInOne/modules/__init__.py:
--------------------------------------------------------------------------------
from AllInOne.modules.allinone_module import AllinoneTransformerSS
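AllInOne/modules/__init__.py re-exports the model class, so the entry points (run.py, param_and_flop.py) can import it from the subpackage directly. A minimal import sketch, assuming the package has been installed via allinone/setup.py; instantiation is omitted because it needs the experiment config assembled in AllInOne/config.py (the `ex` object used through @ex.automain), which is not reproduced in this dump.

# Sketch only: relies on the re-export in AllInOne/modules/__init__.py.
from AllInOne.modules import AllinoneTransformerSS

# Constructing the model would require the config dict from AllInOne/config.py
# (an assumption based on param_and_flop.py's `from AllInOne.config import ex`).
print(AllinoneTransformerSS.__name__)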
/univl/modules/cross-base/cross_config.json:
--------------------------------------------------------------------------------
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 1024,
  "num_attention_heads": 12,
  "num_hidden_layers": 2,
  "vocab_size": 768
}

/univl/modules/visual-base/visual_config.json:
--------------------------------------------------------------------------------
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 1,
  "vocab_size": 1024
}

/violet/lib.py:
--------------------------------------------------------------------------------
import argparse, sys, os, io, base64, pickle, json, math

from datetime import datetime
from tqdm import tqdm

import numpy as np
import torch as T
import torchvision as TV
import torch.distributed as DIST

import cv2
from PIL import Image

import transformers
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

/allinone/AllInOne/transforms/__init__.py:
--------------------------------------------------------------------------------
from .pixelbert import (
    pixelbert_transform,
    pixelbert_transform_randaug,
)

_transforms = {
    "pixelbert": pixelbert_transform,
    "pixelbert_randaug": pixelbert_transform_randaug,
}


def keys_to_transforms(keys: list, size=224):
    return [_transforms[key](size=size) for key in keys]
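keys_to_transforms is the small factory the AllInOne datasets use to turn config keys into image transforms. A minimal usage sketch based only on the function above; the chosen keys and image size are illustrative:

from AllInOne.transforms import keys_to_transforms

# Each key maps to a callable in _transforms; `size` is forwarded to it.
train_transforms = keys_to_transforms(["pixelbert_randaug"], size=224)
val_transforms = keys_to_transforms(["pixelbert"], size=224)
assert len(train_transforms) == 1 and len(val_transforms) == 1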
"num_attention_heads": 12, 10 | "num_hidden_layers": 12, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } 14 | -------------------------------------------------------------------------------- /univl/modules/decoder-base/decoder_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 768, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 3072, 8 | "num_attention_heads": 12, 9 | "num_hidden_layers": 12, 10 | "type_vocab_size": 2, 11 | "vocab_size": 30522, 12 | "num_decoder_layers": 4, 13 | "max_target_embeddings": 512 14 | } 15 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/vcr_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import VCRDataset 2 | from .datamodule_base import BaseDataModule 3 | 4 | 5 | class VCRDataModule(BaseDataModule): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | 9 | @property 10 | def dataset_cls(self): 11 | return VCRDataset 12 | 13 | @property 14 | def dataset_name(self): 15 | return "vcr" 16 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/cc3m_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import CC3MDataset 2 | from .datamodule_base import BaseDataModule 3 | 4 | 5 | class CC3MDataModule(BaseDataModule): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | 9 | @property 10 | def dataset_cls(self): 11 | return CC3MDataset 12 | 13 | @property 14 | def dataset_name(self): 15 | return "cc3m" 16 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/nlvr2_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import NLVR2Dataset 2 | from .datamodule_base import BaseDataModule 3 | 4 | 5 | class NLVR2DataModule(BaseDataModule): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | 9 | @property 10 | def dataset_cls(self): 11 | return NLVR2Dataset 12 | 13 | @property 14 | def dataset_name(self): 15 | return "nlvr2" 16 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/sbu_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import SBUCaptionDataset 2 | from .datamodule_base import BaseDataModule 3 | 4 | 5 | class SBUCaptionDataModule(BaseDataModule): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | 9 | @property 10 | def dataset_cls(self): 11 | return SBUCaptionDataset 12 | 13 | @property 14 | def dataset_name(self): 15 | return "sbu" 16 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/activitynet_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import ActivityNetDataset 2 | from .datamodule_base import BaseDataModule 3 | 4 | 5 | class ActivityNetDataModule(BaseDataModule): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | 9 | @property 10 | def dataset_cls(self): 11 | 
/allinone/AllInOne/datamodules/vcr_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import VCRDataset
from .datamodule_base import BaseDataModule


class VCRDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return VCRDataset

    @property
    def dataset_name(self):
        return "vcr"

/allinone/AllInOne/datamodules/cc3m_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import CC3MDataset
from .datamodule_base import BaseDataModule


class CC3MDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return CC3MDataset

    @property
    def dataset_name(self):
        return "cc3m"

/allinone/AllInOne/datamodules/nlvr2_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import NLVR2Dataset
from .datamodule_base import BaseDataModule


class NLVR2DataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return NLVR2Dataset

    @property
    def dataset_name(self):
        return "nlvr2"

/allinone/AllInOne/datamodules/sbu_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import SBUCaptionDataset
from .datamodule_base import BaseDataModule


class SBUCaptionDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return SBUCaptionDataset

    @property
    def dataset_name(self):
        return "sbu"

/allinone/AllInOne/datamodules/activitynet_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import ActivityNetDataset
from .datamodule_base import BaseDataModule


class ActivityNetDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return ActivityNetDataset

    @property
    def dataset_name(self):
        return "activitynet"

/allinone/AllInOne/datamodules/vg_caption_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import VisualGenomeCaptionDataset
from .datamodule_base import BaseDataModule


class VisualGenomeCaptionDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return VisualGenomeCaptionDataset

    @property
    def dataset_name(self):
        return "vg"

/allinone/AllInOne/datamodules/k400_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import K400Dataset
from .datamodule_base import BaseDataModule


class K400DataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return K400Dataset

    @property
    def dataset_cls_no_false(self):
        return K400Dataset

    @property
    def dataset_name(self):
        return "k400"

/allinone/AllInOne/datamodules/msvd_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import MSVDDataset
from .datamodule_base import BaseDataModule


class MSVDDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return MSVDDataset

    @property
    def dataset_cls_no_false(self):
        return MSVDDataset

    @property
    def dataset_name(self):
        return "msvd"

/allinone/AllInOne/datamodules/tgif_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import TGIFDataset
from .datamodule_base import BaseDataModule


class TGIFDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return TGIFDataset

    @property
    def dataset_cls_no_false(self):
        return TGIFDataset

    @property
    def dataset_name(self):
        return "tgif"

/allinone/AllInOne/datamodules/tvqa_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import TVQADataset
from .datamodule_base import BaseDataModule


class TVQADataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return TVQADataset

    @property
    def dataset_cls_no_false(self):
        return TVQADataset

    @property
    def dataset_name(self):
        return "tvqa"
/allinone/AllInOne/datamodules/ego4d_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import Ego4DDataset
from .datamodule_base import BaseDataModule


class Ego4DDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return Ego4DDataset

    @property
    def dataset_cls_no_false(self):
        return Ego4DDataset

    @property
    def dataset_name(self):
        return "ego4d"

/allinone/AllInOne/datamodules/lsmdc_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import LSMDCDataset
from .datamodule_base import BaseDataModule


class LSMDCDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return LSMDCDataset

    @property
    def dataset_cls_no_false(self):
        return LSMDCDataset

    @property
    def dataset_name(self):
        return "lsmdc"

/allinone/AllInOne/datamodules/didemo_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import DIDEMODataset
from .datamodule_base import BaseDataModule


class DIDEMODataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return DIDEMODataset

    @property
    def dataset_cls_no_false(self):
        return DIDEMODataset

    @property
    def dataset_name(self):
        return "didemo"

/allinone/AllInOne/datamodules/hmdb51_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import HMDB51Dataset
from .datamodule_base import BaseDataModule


class HMDB51DataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return HMDB51Dataset

    @property
    def dataset_cls_no_false(self):
        return HMDB51Dataset

    @property
    def dataset_name(self):
        return "hmdb51"

/allinone/AllInOne/datamodules/msrvtt_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import MSRVTTDataset
from .datamodule_base import BaseDataModule


class MSRVTTDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return MSRVTTDataset

    @property
    def dataset_cls_no_false(self):
        return MSRVTTDataset

    @property
    def dataset_name(self):
        return "msrvtt"
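Every datamodule in AllInOne/datamodules follows the same template: subclass BaseDataModule and expose the dataset class (plus a dataset_cls_no_false variant for the video datasets) and a short name. A hedged sketch of how a new datamodule would follow that pattern; MyVideoDataModule and "my_video" are hypothetical names, and BaseDataModule's internals live in datamodule_base.py, which is not shown in this dump:

from AllInOne.datasets import MSRVTTDataset  # any existing dataset class could stand in
from AllInOne.datamodules.datamodule_base import BaseDataModule


class MyVideoDataModule(BaseDataModule):
    """Hypothetical example following the repository's datamodule template."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return MSRVTTDataset

    @property
    def dataset_cls_no_false(self):
        # The video datamodules above return the same class here; when each
        # variant is used is decided inside BaseDataModule (assumption).
        return MSRVTTDataset

    @property
    def dataset_name(self):
        return "my_video"

Registering the new module in datamodules/__init__.py and in the experiment config would still be required; those files are not part of this dump.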
/allinone/AllInOne/datamodules/tgifqa_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import TGIFQADataset
from .datamodule_base import BaseDataModule


class TGIFQADataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return TGIFQADataset

    @property
    def dataset_cls_no_false(self):
        return TGIFQADataset

    @property
    def dataset_name(self):
        return "tgifqa"

/allinone/AllInOne/datamodules/webvid_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import WEBVIDDataset
from .datamodule_base import BaseDataModule


class WEBVIDDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return WEBVIDDataset

    @property
    def dataset_cls_no_false(self):
        return WEBVIDDataset

    @property
    def dataset_name(self):
        return "webvid"

/allinone/AllInOne/datamodules/howto100m_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import HT100MDataset
from .datamodule_base import BaseDataModule


class HT100MDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return HT100MDataset

    @property
    def dataset_cls_no_false(self):
        return HT100MDataset

    @property
    def dataset_name(self):
        return "howto100m"

/allinone/AllInOne/datamodules/yttemporal_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import YTTemporalDataset
from .datamodule_base import BaseDataModule


class YTTemporalMDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return YTTemporalDataset

    @property
    def dataset_cls_no_false(self):
        return YTTemporalDataset

    @property
    def dataset_name(self):
        return "yttemporal"

/allinone/AllInOne/datamodules/ego4d_choice_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import EGO4DChoiceDataset
from .datamodule_base import BaseDataModule


class EGO4DChoiceDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return EGO4DChoiceDataset

    @property
    def dataset_cls_no_false(self):
        return EGO4DChoiceDataset

    @property
    def dataset_name(self):
        return "ego4d_choice"

/allinone/AllInOne/datamodules/lsmdc_choice_datamodule.py:
--------------------------------------------------------------------------------
from AllInOne.datasets import LSMDCChoiceDataset
from .datamodule_base import BaseDataModule


class LSMDCChoiceDataModule(BaseDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def dataset_cls(self):
        return LSMDCChoiceDataset

    @property
    def dataset_cls_no_false(self):
        return LSMDCChoiceDataset

    @property
    def dataset_name(self):
        return "lsmdc_choice"
| 9 | @property 10 | def dataset_cls(self): 11 | return LSMDCChoiceDataset 12 | 13 | @property 14 | def dataset_cls_no_false(self): 15 | return LSMDCChoiceDataset 16 | 17 | @property 18 | def dataset_name(self): 19 | return "lsmdc_choice" 20 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/msrvtt_choice_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import MSRVTTChoiceDataset 2 | from .datamodule_base import BaseDataModule 3 | 4 | 5 | class MSRVTTChoiceDataModule(BaseDataModule): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | 9 | @property 10 | def dataset_cls(self): 11 | return MSRVTTChoiceDataset 12 | 13 | @property 14 | def dataset_cls_no_false(self): 15 | return MSRVTTChoiceDataset 16 | 17 | @property 18 | def dataset_name(self): 19 | return "msrvtt_choice" 20 | -------------------------------------------------------------------------------- /allinone/AllInOne/modules/forzen_param.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | # def forzen_param(model): 5 | # for name, param in model.named_parameters(): 6 | # if 'mlm_score' in name or 'itm_score' in name or 'mpp_score' in name: 7 | # param.requires_grad = True 8 | # else: 9 | # param.requires_grad = False 10 | # return True 11 | 12 | 13 | def forzen_param(model): 14 | flag = False 15 | for name, param in model.named_parameters(): 16 | if '10' in name: 17 | flag = True 18 | param.requires_grad = flag 19 | return True -------------------------------------------------------------------------------- /violet/args/args_msvd-qaoe.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "qaoe", 3 | 4 | "dataset": "msvd", 5 | "task": "msvd-qa", 6 | "annotation_file": "txt_msvd-qa.json", 7 | 8 | "size_img": 224, 9 | "size_txt": 25, 10 | "size_vocab": 1000, 11 | 12 | "size_epoch": 30, 13 | "size_batch": 5, 14 | 15 | "lr": 1.2e-5, 16 | "decay": 1e-3, 17 | 18 | "meltr_lr": 1e-4, 19 | "meltr_decay": 1e-4, 20 | "max_grad_norm": 12, 21 | "auxgrad_every": 3, 22 | "gamma": 0.1, 23 | 24 | "path_ckpt": "./checkpoint/ckpt_violet_pretrain.pt" 25 | } 26 | -------------------------------------------------------------------------------- /violet/args/args_tgif-action.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "qamc", 3 | 4 | "dataset": "tgif", 5 | "task": "tgif-action", 6 | "annotation_file": "txt_tgif-action.json", 7 | 8 | "size_img": 224, 9 | "size_txt": 40, 10 | "size_option": 5, 11 | 12 | "size_epoch": 20, 13 | "size_batch": 3, 14 | 15 | "lr": 6e-6, 16 | "decay": 1e-3, 17 | 18 | "meltr_lr": 1e-4, 19 | "meltr_decay": 1e-4, 20 | "max_grad_norm": 12, 21 | "auxgrad_every": 3, 22 | "gamma": 0.1, 23 | 24 | "path_ckpt": "./checkpoint/ckpt_violet_pretrain.pt" 25 | } 26 | -------------------------------------------------------------------------------- /violet/args/args_tgif-frame.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "qaoe", 3 | 4 | "dataset": "tgif", 5 | "task": "tgif-frame", 6 | "annotation_file": "txt_tgif-frame.json", 7 | 8 | "size_img": 224, 9 | "size_txt": 25, 10 | "size_vocab": 1540, 11 | 12 | "size_epoch": 20, 13 | "size_batch": 5, 14 | 15 | "lr": 3e-5, 16 | "decay": 1e-4, 17 | 18 | "meltr_lr": 1e-4, 19 | "meltr_decay": 1e-4, 20 | 
"max_grad_norm": 12, 21 | "auxgrad_every": 3, 22 | "gamma": 0.1, 23 | 24 | "path_ckpt": "./checkpoint/ckpt_violet_pretrain.pt" 25 | } 26 | -------------------------------------------------------------------------------- /allinone/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="AllInOne", 5 | packages=find_packages( 6 | exclude=[".dfc", ".vscode", "dataset", "notebooks", "result", "scripts"] 7 | ), 8 | version="1.0.0", 9 | license="MIT", 10 | description="All in One: Exploring Unified Video-Language Pre-training", 11 | author="Alex Jinpeng Wang", 12 | author_email="awinyimgprocess@gmail.com", 13 | url="https://github.com/fingerrec'", 14 | keywords=["video and language pretraining"], 15 | install_requires=["torch", "pytorch_lightning"], 16 | ) 17 | -------------------------------------------------------------------------------- /violet/args/args_tgif-transition.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "qamc", 3 | 4 | "dataset": "tgif", 5 | "task": "tgif-transition", 6 | "annotation_file": "txt_tgif-transition.json", 7 | 8 | "size_img": 224, 9 | "size_txt": 40, 10 | "size_option": 5, 11 | 12 | "size_epoch": 20, 13 | "size_batch": 3, 14 | 15 | "lr": 3e-6, 16 | "decay": 1e-3, 17 | 18 | "meltr_lr": 1e-4, 19 | "meltr_decay": 1e-4, 20 | "max_grad_norm": 12, 21 | "auxgrad_every": 3, 22 | "gamma": 0.1, 23 | 24 | "path_ckpt": "./checkpoint/ckpt_violet_pretrain.pt" 25 | } 26 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/coco_caption_karpathy_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import CocoCaptionKarpathyDataset 2 | from .datamodule_base import BaseDataModule 3 | 4 | 5 | class CocoCaptionKarpathyDataModule(BaseDataModule): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | 9 | @property 10 | def dataset_cls(self): 11 | return CocoCaptionKarpathyDataset 12 | 13 | @property 14 | def dataset_cls_no_false(self): 15 | return CocoCaptionKarpathyDataset 16 | 17 | @property 18 | def dataset_name(self): 19 | return "coco" 20 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/f30k_caption_karpathy_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import F30KCaptionKarpathyDataset 2 | from .datamodule_base import BaseDataModule 3 | 4 | 5 | class F30KCaptionKarpathyDataModule(BaseDataModule): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | 9 | @property 10 | def dataset_cls(self): 11 | return F30KCaptionKarpathyDataset 12 | 13 | @property 14 | def dataset_cls_no_false(self): 15 | return F30KCaptionKarpathyDataset 16 | 17 | @property 18 | def dataset_name(self): 19 | return "f30k" 20 | -------------------------------------------------------------------------------- /allinone/param_and_flop.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import pytorch_lightning as pl 3 | from AllInOne.config import ex 4 | from AllInOne.modules import ViLTransformerSS 5 | from AllInOne.datamodules.multitask_datamodule import MTDataModule 6 | from thop import profile 7 | import torch 8 | 9 | @ex.automain 10 | def main(_config): 11 | _config = 
copy.deepcopy(_config) 12 | pl.seed_everything(_config["seed"]) 13 | 14 | dm = MTDataModule(_config, dist=True) 15 | 16 | model = ViLTransformerSS(_config) 17 | input = torch.randn(1, 3, 3, 224, 224) 18 | macs, params = profile(model, inputs=(input,)) 19 | print(macs, params) 20 | 21 | # 110M -------------------------------------------------------------------------------- /violet/args/args_msrvtt-retrieval_7k.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "retrieval", 3 | 4 | "dataset": "msrvtt", 5 | "task": "msrvtt-retrieval", 6 | "train_annotation": "./_data/msrvtt/train_7k.json", 7 | "test_annotation": "./_data/msrvtt/test.json", 8 | 9 | "size_img": 224, 10 | "size_txt": 25, 11 | 12 | "size_epoch": 20, 13 | "size_batch": 5, 14 | 15 | "lr": 3e-6, 16 | "decay": 1e-3, 17 | 18 | "meltr_lr": 3e-6, 19 | "meltr_decay": 1e-3, 20 | "max_grad_norm": 12, 21 | "auxgrad_every": 3, 22 | "gamma": 0.1, 23 | 24 | "path_ckpt": "./checkpoint/ckpt_violet_pretrain.pt" 25 | } 26 | -------------------------------------------------------------------------------- /violet/args/args_msrvtt-retrieval_9k.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "retrieval", 3 | 4 | "dataset": "msrvtt", 5 | "task": "msrvtt-retrieval", 6 | "train_annotation": "./_data/msrvtt/train_9k.json", 7 | "test_annotation": "./_data/msrvtt/test.json", 8 | 9 | "size_img": 224, 10 | "size_txt": 25, 11 | 12 | "size_epoch": 20, 13 | "size_batch": 5, 14 | 15 | "lr": 3e-6, 16 | "decay": 1e-3, 17 | 18 | "meltr_lr": 3e-6, 19 | "meltr_decay": 1e-3, 20 | "max_grad_norm": 12, 21 | "auxgrad_every": 3, 22 | "gamma": 0.1, 23 | 24 | "path_ckpt": "./checkpoint/ckpt_violet_pretrain.pt" 25 | } 26 | -------------------------------------------------------------------------------- /violet/args/args_msrvtt-retrieval_eval.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "retrieval", 3 | 4 | "dataset": "msrvtt", 5 | "task": "msrvtt-retrieval", 6 | "train_annotation": "./_data/msrvtt/train_9k.json", 7 | "test_annotation": "./_data/msrvtt/test.json", 8 | 9 | "size_img": 224, 10 | "size_txt": 25, 11 | 12 | "size_epoch": 20, 13 | "size_batch": 5, 14 | 15 | "lr": 3e-6, 16 | "decay": 1e-3, 17 | 18 | "meltr_lr": 3e-6, 19 | "meltr_decay": 1e-3, 20 | "max_grad_norm": 12, 21 | "auxgrad_every": 3, 22 | "gamma": 0.1, 23 | 24 | "path_ckpt": "./checkpoint/ckpt_violet_pretrain.pt" 25 | } 26 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/sbu_caption_dataset.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | from .base_dataset import BaseDataset 3 | 4 | 5 | class SBUCaptionDataset(BaseDataset): 6 | def __init__(self, *args, split="", **kwargs): 7 | assert split in ["train", "val", "test"] 8 | if split == "test": 9 | split = "val" 10 | 11 | if split == "train": 12 | names = [f"sbu_{i}" for i in range(9)] 13 | elif split == "val": 14 | names = [] 15 | 16 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 17 | 18 | def __getitem__(self, index): 19 | return self.get_suite(index) 20 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/vg_caption_dataset.py: -------------------------------------------------------------------------------- 1 | from .base_dataset import BaseDataset 2 | 3 | 4 | class 
VisualGenomeCaptionDataset(BaseDataset): 5 | def __init__(self, *args, split="", **kwargs): 6 | assert split in ["train", "val", "test"] 7 | if split == "test": 8 | split = "val" 9 | 10 | if split == "train": 11 | names = ["vg_train"] 12 | elif split == "val": 13 | names = [] 14 | elif split == "test": 15 | names = [] 16 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 17 | 18 | def __getitem__(self, index): 19 | return self.get_suite(index) 20 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/f30k_caption_karpathy_dataset.py: -------------------------------------------------------------------------------- 1 | from .base_dataset import BaseDataset 2 | 3 | 4 | class F30KCaptionKarpathyDataset(BaseDataset): 5 | def __init__(self, *args, split="", **kwargs): 6 | assert split in ["train", "val", "test"] 7 | 8 | if split == "train": 9 | names = ["f30k_caption_karpathy_train", "f30k_caption_karpathy_val"] 10 | elif split == "val": 11 | names = ["f30k_caption_karpathy_test"] 12 | elif split == "test": 13 | names = ["f30k_caption_karpathy_test"] 14 | 15 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 16 | 17 | def __getitem__(self, index): 18 | return self.get_suite(index) 19 | -------------------------------------------------------------------------------- /violet/tools/extract_tsv.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse, pickle 3 | from tqdm import tqdm 4 | 5 | def get_args(): 6 | parser = argparse.ArgumentParser() 7 | 8 | parser.add_argument('--path', required=True, type=str) 9 | 10 | args = parser.parse_args() 11 | 12 | return args 13 | 14 | if __name__=='__main__': 15 | args = get_args() 16 | 17 | pkl = pickle.load(open('%s.pkl'%(args.path), 'rb')) 18 | 19 | file_tsv, file_lineidx = open('%s.tsv'%(args.path), 'w'), open('%s.lineidx'%(args.path), 'w') 20 | for vid in tqdm(pkl, ascii=True): 21 | file_lineidx.write('%d\n'%(file_tsv.tell())) 22 | file_tsv.write(vid) 23 | for b in pkl[vid]: 24 | file_tsv.write('\t%s'%(b)) 25 | file_tsv.write('\n') 26 | 27 | file_tsv.flush(), file_lineidx.flush() 28 | 29 | -------------------------------------------------------------------------------- /allinone/AllInOne/transforms/pixelbert.py: -------------------------------------------------------------------------------- 1 | from .utils import ( 2 | inception_normalize, 3 | MinMaxResize, 4 | ) 5 | from torchvision import transforms 6 | from .randaug import RandAugment 7 | 8 | 9 | def pixelbert_transform(size=800): 10 | longer = int((1333 / 800) * size) 11 | return transforms.Compose( 12 | [ 13 | MinMaxResize(shorter=size, longer=longer), 14 | transforms.ToTensor(), 15 | inception_normalize, 16 | ] 17 | ) 18 | 19 | 20 | def pixelbert_transform_randaug(size=800): 21 | longer = int((1333 / 800) * size) 22 | trs = transforms.Compose( 23 | [ 24 | MinMaxResize(shorter=size, longer=longer), 25 | transforms.ToTensor(), 26 | inception_normalize, 27 | ] 28 | ) 29 | trs.transforms.insert(0, RandAugment(2, 9)) 30 | return trs -------------------------------------------------------------------------------- /allinone/AllInOne/transforms/mix.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | 4 | 5 | class SpatialMixup(object): 6 | def __init__(self, alpha=0.2, trace=True, version=2): 7 | self.alpha = alpha 8 | self.trace = trace 9 | self.version = version 10 | 11 | def 
mixup_data(self, x): 12 | """ 13 | return mixed inputs. pairs of targets 14 | """ 15 | b, t, c, h, w = x.size() 16 | loss_prob = random.random() * self.alpha 17 | if self.trace: 18 | mixed_x = x 19 | else: 20 | mixed_x = torch.zeros_like(x) 21 | for i in range(b): 22 | tmp = (i+1) % b 23 | img_index = random.randint(0, t-1) 24 | for j in range(t): 25 | mixed_x[i, j, :, :, :] = (1-loss_prob) * x[i, j, :, :, :] + loss_prob * x[tmp, img_index, :, :, :] 26 | return mixed_x 27 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/msvdqa_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import MSVDQADataset 2 | from .datamodule_base import BaseDataModule 3 | from collections import defaultdict 4 | 5 | 6 | class MSVDQADataModule(BaseDataModule): 7 | def __init__(self, *args, **kwargs): 8 | super().__init__(*args, **kwargs) 9 | 10 | @property 11 | def dataset_cls(self): 12 | return MSVDQADataset 13 | 14 | @property 15 | def dataset_name(self): 16 | return "msvdqa" 17 | 18 | def setup(self, stage): 19 | super().setup(stage) 20 | self.answer2id = self.train_dataset.ans_lab_dict 21 | sorted_a2i = sorted(self.answer2id.items(), key=lambda x: x[1]) 22 | self.num_class = max(self.answer2id.values()) + 1 23 | self.id2answer = defaultdict(lambda: "unknown") 24 | for k, v in sorted_a2i: 25 | self.id2answer[v] = k 26 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/msrvttqa_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import MSRVTTQADataset 2 | from .datamodule_base import BaseDataModule 3 | from collections import defaultdict 4 | 5 | 6 | class MSRVTTQADataModule(BaseDataModule): 7 | def __init__(self, *args, **kwargs): 8 | super().__init__(*args, **kwargs) 9 | 10 | @property 11 | def dataset_cls(self): 12 | return MSRVTTQADataset 13 | 14 | @property 15 | def dataset_name(self): 16 | return "msrvttqa" 17 | 18 | def setup(self, stage): 19 | super().setup(stage) 20 | self.answer2id = self.train_dataset.ans_lab_dict 21 | sorted_a2i = sorted(self.answer2id.items(), key=lambda x: x[1]) 22 | self.num_class = max(self.answer2id.values()) + 1 23 | self.id2answer = defaultdict(lambda: "unknown") 24 | for k, v in sorted_a2i: 25 | self.id2answer[v] = k 26 | -------------------------------------------------------------------------------- /univl/utils/metrics.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import numpy as np 7 | 8 | def compute_metrics(x): 9 | print("metrics") 10 | 11 | sx = np.sort(-x, axis=1) 12 | d = np.diag(-x) 13 | d = d[:, np.newaxis] 14 | ind = sx - d 15 | ind = np.where(ind == 0) 16 | ind = ind[1] 17 | metrics = {} 18 | metrics['R1'] = float(np.sum(ind == 0)) / len(ind) 19 | metrics['R5'] = float(np.sum(ind < 5)) / len(ind) 20 | metrics['R10'] = float(np.sum(ind < 10)) / len(ind) 21 | metrics['MR'] = np.median(ind) + 1 22 | return metrics 23 | 24 | def print_computed_metrics(metrics): 25 | r1 = metrics['R1'] 26 | r5 = metrics['R5'] 27 | r10 = metrics['R10'] 28 | mr = metrics['MR'] 29 | print('R@1: {:.4f} - R@5: {:.4f} - R@10: {:.4f} - Median R: {}'.format(r1, r5, r10, mr)) 
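A minimal usage sketch for the two helpers above (an editor's illustration, not a file from this repository). `compute_metrics` expects a square similarity matrix whose rows are text queries and whose columns are videos, and, as the `np.diag` call implies, the ground-truth video for query `i` is assumed to sit on the diagonal. The toy matrix and the import path are hypothetical; the import assumes you run from the `univl/` directory.

```python
import numpy as np
from utils.metrics import compute_metrics, print_computed_metrics  # assumed import path

# Hypothetical 3x3 text-to-video similarity matrix; entry [i, j] scores query i against video j.
sim = np.array([
    [0.9, 0.2, 0.1],   # query 0: its own video is ranked 1st (rank 0)
    [0.1, 0.6, 0.8],   # query 1: its own video is ranked 2nd (rank 1)
    [0.0, 0.4, 0.7],   # query 2: its own video is ranked 1st (rank 0)
])

metrics = compute_metrics(sim)   # ranks of the diagonal entries are [0, 1, 0]
print_computed_metrics(metrics)  # R@1: 0.6667 - R@5: 1.0000 - R@10: 1.0000 - Median R: 1.0
```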
-------------------------------------------------------------------------------- /allinone/move_pretrained_weight.py: -------------------------------------------------------------------------------- 1 | import os 2 | print('move pretrained weights...') 3 | try: 4 | # v100 machines 5 | if not os.path.exists('~/.cache/torch/hub/checkpoints/'): 6 | os.makedirs('~/.cache/torch/hub/checkpoints/') 7 | os.system( 8 | 'cp pretrained/*.pth ~/.cache/torch/hub/checkpoints/.') 9 | except Exception as e: 10 | print(e) 11 | try: 12 | # v100 machines 13 | if not os.path.exists('/usr/local/app/.cache/torch/hub/checkpoints/'): 14 | os.makedirs('/usr/local/app/.cache/torch/hub/checkpoints') 15 | os.system( 16 | 'cp pretrained/*.pth /usr/local/app/.cache/torch/hub/checkpoints/.') 17 | except Exception as e: 18 | print(e) 19 | try: 20 | # a100 machines 21 | if not os.path.exists('/root/.cache/torch/hub/checkpoints/'): 22 | os.makedirs('/root/.cache/torch/hub/checkpoints/') 23 | os.system( 24 | 'cp pretrained/*.pth /root/.cache/torch/hub/checkpoints/.') 25 | print('move finished...') 26 | except Exception as e: 27 | print(e) -------------------------------------------------------------------------------- /univl/modules/optimization_MELTR.py: -------------------------------------------------------------------------------- 1 | from torch.nn.utils import clip_grad_norm_ 2 | from modules.modeling_MELTR import MELTRgrad 3 | 4 | class MELTROptimizer: 5 | 6 | def __init__(self, meta_optimizer, max_grad_norm=10): 7 | self.meta_optimizer = meta_optimizer 8 | self.hypergrad = MELTRgrad() 9 | 10 | self.max_grad_norm = max_grad_norm 11 | 12 | def step(self, train_loss, val_loss, parameters, aux_params): 13 | self.zero_grad() 14 | 15 | hyper_gards = self.hypergrad.grad( 16 | loss_val=val_loss, 17 | loss_train=train_loss, 18 | aux_params=aux_params, 19 | params=parameters, 20 | ) 21 | for p, g in zip(aux_params, hyper_gards): 22 | if g is not None: 23 | p.grad = -g 24 | 25 | if self.max_grad_norm is not None: 26 | clip_grad_norm_(aux_params, max_norm=self.max_grad_norm) 27 | 28 | self.meta_optimizer.step() 29 | def zero_grad(self): 30 | self.meta_optimizer.zero_grad() -------------------------------------------------------------------------------- /violet/dataset.py: -------------------------------------------------------------------------------- 1 | 2 | from lib import * 3 | 4 | class Dataset_Base(T.utils.data.Dataset): 5 | def __init__(self, args): 6 | super().__init__() 7 | 8 | self.args = args 9 | self.tokzr = transformers.BertTokenizerFast.from_pretrained('bert-base-uncased') 10 | 11 | def str2img(self, b): 12 | img = Image.open(io.BytesIO(base64.b64decode(b))).convert('RGB') 13 | w, h = img.size 14 | img = TV.transforms.Compose([TV.transforms.Pad([0, (w-h)//2] if w>h else [(h-w)//2, 0]), 15 | TV.transforms.Resize([self.args['size_img'], self.args['size_img']]), 16 | TV.transforms.ToTensor()])(img) 17 | return img 18 | 19 | def str2txt(self, s): 20 | txt = self.tokzr.encode(s, padding='max_length', max_length=self.args['size_txt'], truncation=True) 21 | mask = [1 if w!=0 else w for w in txt] 22 | txt, mask = np.array(txt, dtype=np.int64), np.array(mask, dtype=np.int64) 23 | return txt, mask 24 | -------------------------------------------------------------------------------- /univl/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Microsoft 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this 
software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 MLV Lab (Machine Learning and Vision Lab at Korea University) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/coco_caption_karpathy_dataset.py: -------------------------------------------------------------------------------- 1 | from .base_dataset import BaseDataset 2 | 3 | 4 | class CocoCaptionKarpathyDataset(BaseDataset): 5 | def __init__(self, *args, split="", **kwargs): 6 | assert split in ["train", "val", "test"] 7 | self.split = split 8 | 9 | if split == "train": 10 | names = ["coco_caption_karpathy_train", "coco_caption_karpathy_restval"] 11 | elif split == "val": 12 | names = ["coco_caption_karpathy_val"] 13 | # names = ["coco_caption_karpathy_test"] 14 | # names = [] # for fast train 15 | elif split == "test": 16 | names = ["coco_caption_karpathy_test"] 17 | # names = [] 18 | 19 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 20 | 21 | def __getitem__(self, index): 22 | suite = self.get_suite(index) 23 | 24 | if "test" in self.split: 25 | _index, _question_index = self.index_mapper[index] 26 | iid = self.table["image_id"][_index].as_py() 27 | iid = int(iid.split(".")[0].split("_")[-1]) 28 | suite.update({"iid": iid}) 29 | 30 | return suite 31 | -------------------------------------------------------------------------------- /violet/tools/extract_video-frame.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse, av, base64, io, pickle 3 | 4 | from glob import glob 5 | from tqdm import tqdm 6 | 7 | def get_args(): 8 | parser = argparse.ArgumentParser() 9 | 10 | parser.add_argument('--sample', required=True, type=int) 11 | 12 | args = parser.parse_args() 13 | 14 | return args 15 | 16 | if __name__=='__main__': 17 | args = get_args() 18 | 19 | lst = glob('/hub_data2/dohwan/MSVD/videos/*.avi') 20 | 21 | pkl = {} 22 | for f in tqdm(lst, ascii=True): 23 | vid = f.split('/')[-1].replace('.avi', '') 24 | 25 | imgs = [] 26 | for pack in av.open(f).demux(): 27 | for buf in pack.decode(): 28 | if str(type(buf))=="<class 'av.video.frame.VideoFrame'>": 29 | imgs.append(buf.to_image().convert('RGB')) 30 | N = len(imgs)/(args.sample+1) 31 | 32 | pkl[vid] = [] 33 | for i in range(args.sample): 34 | buf = io.BytesIO() 35 | imgs[int(N*(i+1))].save(buf, format='JPEG') 36 | pkl[vid].append(str(base64.b64encode(buf.getvalue()))[2:-1]) 37 | pickle.dump(pkl, open('msvd.pkl', 'wb')) 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MELTR: Meta Loss Transformer for Learning to Fine-tune Video Foundation Models 2 | 3 | This is the official implementation of MELTR (CVPR 2023). ([arxiv](https://arxiv.org/abs/2303.13009)) 4 | 5 | > Dohwan Ko1*, Joonmyung Choi1*, Hyeong Kyu Choi1, Kyoung-Woon On2, Byungseok Roh2, Hyunwoo J. Kim1. 6 | > 7 | > 1Korea University 2Kakao Brain 8 | 9 | 10 | 11 |
12 | 13 |
14 | 15 | 16 | ## Code Repositories 17 | * [UniVL + MELTR](https://github.com/mlvlab/MELTR/tree/master/univl) 18 | 19 | * [Violet + MELTR](https://github.com/mlvlab/MELTR/tree/master/violet) 20 | 21 | * [All-in-one + MELTR](https://github.com/mlvlab/MELTR/tree/master/allinone) 22 | 23 | 24 | 25 | ## Citation 26 | 27 | ``` 28 | @inproceedings{ko2023meltr, 29 | title={MELTR: Meta Loss Transformer for Learning to Fine-tune Video Foundation Models}, 30 | author={Ko, Dohwan and Choi, Joonmyung and Choi, Hyeong Kyu and On, Kyoung-Woon and Roh, Byungseok and Kim, Hyunwoo J}, 31 | booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, 32 | year={2023} 33 | } 34 | ``` -------------------------------------------------------------------------------- /violet/agent.py: -------------------------------------------------------------------------------- 1 | from meltr import MELTROptimizer 2 | from lib import * 3 | 4 | class Agent_Base: 5 | def __init__(self, args, model): 6 | super().__init__() 7 | 8 | self.args, self.model = args, model 9 | 10 | self.loss_func = T.nn.CrossEntropyLoss(ignore_index=-1).cuda() 11 | self.optzr = T.optim.AdamW(self.model.parameters(), lr=args['lr'], betas=(0.9, 0.98), weight_decay=args['decay']) 12 | self.scaler = T.cuda.amp.GradScaler() 13 | 14 | class Agent_Base_MELTR: 15 | def __init__(self, args, model, aux_model=None): 16 | super().__init__() 17 | 18 | self.args, self.model, self.aux_model = args, model, aux_model 19 | 20 | self.loss_func = T.nn.CrossEntropyLoss(ignore_index=-1).cuda() 21 | self.optzr = T.optim.AdamW(self.model.parameters(), lr=args['lr'], betas=(0.9, 0.98), weight_decay=args['decay']) 22 | self.scaler = T.cuda.amp.GradScaler() 23 | 24 | if aux_model is not None: 25 | self.aux_optzr = T.optim.AdamW(self.aux_model.parameters(), lr=args['meltr_lr'], betas=(0.9, 0.98), weight_decay=args['meltr_decay']) 26 | self.meta_optim = MELTROptimizer(meta_optimizer=self.aux_optzr, max_grad_norm=args['max_grad_norm']) -------------------------------------------------------------------------------- /violet/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class AverageMeter: 4 | ''' Computes and stores the average and current value. 
''' 5 | def __init__(self) -> None: 6 | self.reset() 7 | 8 | def reset(self) -> None: 9 | self.val = 0.0 10 | self.avg = 0.0 11 | self.sum = 0.0 12 | self.count = 0 13 | 14 | def update(self, val: float, n: int = 1) -> None: 15 | if type(val) == torch.Tensor: 16 | val = float(val.detach().cpu().data) 17 | self.val = val 18 | self.sum += val * n 19 | self.count += n 20 | self.avg = self.sum / self.count 21 | def sample(self): 22 | return "\ 23 | end = time.time() \n\ 24 | batch_time = AverageMeter() \n\ 25 | batch_time.update(time.time() - end) \n\ 26 | end = time.time() \n\ 27 | avg_score = AverageMeter()\n\ 28 | accuracy = 0.1\n\ 29 | avg_score.update(accuracy)\n\ 30 | losses = AverageMeter()\n\ 31 | loss = 0\n\ 32 | batch_size = 128\n\ 33 | losses.update(loss.data.item(), batch_size)\n\ 34 | print(f'time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'\n\ 35 | f'loss {losses.val:.4f} ({losses.avg:.4f})\t' \n\ 36 | f'acc {avg_score.val:.4f} ({avg_score.avg:.4f})')" -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # == pretrain data 2 | # = image 3 | from .vg_caption_dataset import VisualGenomeCaptionDataset 4 | from .coco_caption_karpathy_dataset import CocoCaptionKarpathyDataset 5 | from .sbu_caption_dataset import SBUCaptionDataset 6 | from .cc3m import CC3MDataset 7 | # = video 8 | from .webvid import WEBVIDDataset 9 | from .howto100m import HT100MDataset 10 | from .yttemporal import YTTemporalDataset 11 | # == downstream data 12 | # = image 13 | from .f30k_caption_karpathy_dataset import F30KCaptionKarpathyDataset 14 | from .vqav2_dataset import VQAv2Dataset 15 | from .nlvr2_dataset import NLVR2Dataset 16 | # = video 17 | from .msrvtt import MSRVTTDataset 18 | from .msrvttqa import MSRVTTQADataset 19 | from .msrvtt_choice import MSRVTTChoiceDataset 20 | from .msvd import MSVDDataset 21 | from .lsmdc_dataset import LSMDCDataset 22 | from .msvdqa import MSVDQADataset 23 | from .vcr import VCRDataset 24 | from .ego4d import Ego4DDataset 25 | from .tvqa import TVQADataset 26 | from .lsmdc_choice import LSMDCChoiceDataset 27 | from .ego4d_choice import EGO4DChoiceDataset 28 | from .tgif import TGIFDataset 29 | from .tgifqa import TGIFQADataset 30 | from .didemo import DIDEMODataset 31 | from .hmdb51 import HMDB51Dataset 32 | from .k400 import K400Dataset 33 | from .activitynet import ActivityNetDataset -------------------------------------------------------------------------------- /allinone/README.md: -------------------------------------------------------------------------------- 1 | 2 | # All-in-one + MELTR 3 | 4 | 5 | ## Preparation 6 | ### Requirements 7 | 8 | Our code is implemented under [All-in-one](https://github.com/showlab/all-in-one) environment with PyTorch 1.10+. 9 | 10 | ### Datasets 11 | 12 | We use MSRVTT for text-to-video retrieval and All-in-one also provides downstream datasets [here](https://github.com/showlab/all-in-one/blob/main/DATA.md). 13 | 14 | Annotation files of MSRVTT can be found [here](https://drive.google.com/drive/folders/1nXWGRKjm6fwYly4YCgdKu7XtV2IXGUix). 15 | 16 | ### Pretrained checkpoint 17 | 18 | You can download the pretrained checkpoint of All-in-one [here](https://drive.google.com/file/d/1Yd2lKppaduqG_RO1gCA6OpAfB0_IXDoX/view?usp=sharing). 
19 | 20 | Then, place the files as follows: 21 | 22 | ``` 23 | data 24 | |─ msrvtt 25 | │ └─ videos 26 | | | │─ video0.mp4 27 | | | : 28 | │ │─ train_list_9k.txt 29 | | │─ train_list_7k.txt 30 | | │─ val_list_jsfusion.txt 31 | | │─ MSR_VTT.json 32 | | │─ jsfusion_val_caption_idx.pkl 33 | 34 | checkpoint 35 | |─ all-in-one-plus-224.ckpt 36 | ``` 37 | 38 | 39 | 40 | ## Training & Evaluation 41 | 42 | ``` 43 | python run.py with \ 44 | data_root=./data/msrvtt num_gpus=8 num_nodes=1 \ 45 | per_gpu_batchsize=16 msrvtt_retrieval_MELTR \ 46 | num_frames=3 \ 47 | load_path="./checkpoint/all-in-one-plus-224.ckpt" 48 | ``` 49 | 50 | 51 | 52 | 53 | ## Acknowledgement 54 | This repo is built upon [All-in-one](https://github.com/showlab/all-in-one). 55 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/vqav2_datamodule.py: -------------------------------------------------------------------------------- 1 | from AllInOne.datasets import VQAv2Dataset 2 | from .datamodule_base import BaseDataModule 3 | from collections import defaultdict 4 | 5 | 6 | class VQAv2DataModule(BaseDataModule): 7 | def __init__(self, *args, **kwargs): 8 | super().__init__(*args, **kwargs) 9 | 10 | @property 11 | def dataset_cls(self): 12 | return VQAv2Dataset 13 | 14 | @property 15 | def dataset_name(self): 16 | return "vqa" 17 | 18 | def setup(self, stage): 19 | super().setup(stage) 20 | 21 | train_answers = self.train_dataset.table["answers"].to_pandas().tolist() 22 | val_answers = self.val_dataset.table["answers"].to_pandas().tolist() 23 | train_labels = self.train_dataset.table["answer_labels"].to_pandas().tolist() 24 | val_labels = self.val_dataset.table["answer_labels"].to_pandas().tolist() 25 | 26 | all_answers = [c for c in train_answers + val_answers if c is not None] 27 | all_answers = [l for lll in all_answers for ll in lll for l in ll] 28 | all_labels = [c for c in train_labels + val_labels if c is not None] 29 | all_labels = [l for lll in all_labels for ll in lll for l in ll] 30 | 31 | self.answer2id = {k: v for k, v in zip(all_answers, all_labels)} 32 | sorted_a2i = sorted(self.answer2id.items(), key=lambda x: x[1]) 33 | self.num_class = max(self.answer2id.values()) + 1 34 | self.id2answer = defaultdict(lambda: "unknown") 35 | for k, v in sorted_a2i: 36 | self.id2answer[v] = k 37 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/didemo.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset 2 | import os 3 | import pandas as pd 4 | 5 | # some videos are missed, for better results, do IO exception. 
6 | 7 | 8 | class DIDEMODataset(BaseDataset): 9 | def __init__(self, *args, split="", **kwargs): 10 | assert split in ["train", "val", "test"] 11 | self.split = split 12 | self.metadata = None 13 | if split == "train": 14 | names = ["didemo_train"] 15 | elif split == "val": 16 | names = ["didemo_val"] 17 | elif split == "test": 18 | names = ["didemo_val"] 19 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 20 | self._load_metadata() 21 | 22 | def _load_metadata(self): 23 | metadata_dir = './meta_data/didemo' 24 | split_files = { 25 | 'train': 'DiDeMo_train.tsv', 26 | 'val': 'DiDeMo_val.tsv', # there is no test 27 | 'test': 'DiDeMo_test.tsv' 28 | } 29 | target_split_fp = split_files[self.split] 30 | metadata = pd.read_csv(os.path.join(metadata_dir, target_split_fp), sep='\t') 31 | self.metadata = metadata 32 | print("load split {}, {} samples".format(self.split, len(metadata))) 33 | 34 | def _get_video_path(self, sample): 35 | rel_video_fp = sample[1] 36 | full_video_fp = os.path.join(self.data_dir, 'video', rel_video_fp) 37 | return full_video_fp, rel_video_fp 38 | 39 | def _get_caption(self, sample): 40 | return sample[0] 41 | 42 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/vqav2_dataset.py: -------------------------------------------------------------------------------- 1 | from .base_dataset import BaseDataset 2 | 3 | 4 | class VQAv2Dataset(BaseDataset): 5 | def __init__(self, *args, split="", **kwargs): 6 | assert split in ["train", "val", "test"] 7 | self.split = split 8 | 9 | if split == "train": 10 | names = ["vqav2_train", "vqav2_trainable_val"] 11 | elif split == "val": 12 | names = ["vqav2_rest_val"] 13 | elif split == "test": 14 | names = ["vqav2_test"] # vqav2_test-dev for test-dev 15 | 16 | super().__init__( 17 | *args, 18 | **kwargs, 19 | names=names, 20 | text_column_name="questions", 21 | remove_duplicate=False, 22 | ) 23 | 24 | def __getitem__(self, index): 25 | image_tensor = self.get_image(index)["image"] 26 | text = self.get_text(index)["text"] 27 | 28 | index, question_index = self.index_mapper[index] 29 | qid = self.table["question_id"][index][question_index].as_py() 30 | 31 | if self.split != "test": 32 | answers = self.table["answers"][index][question_index].as_py() 33 | labels = self.table["answer_labels"][index][question_index].as_py() 34 | scores = self.table["answer_scores"][index][question_index].as_py() 35 | else: 36 | answers = list() 37 | labels = list() 38 | scores = list() 39 | 40 | return { 41 | "image": image_tensor, 42 | "text": text, 43 | "vqa_answer": answers, 44 | "vqa_labels": labels, 45 | "vqa_scores": scores, 46 | "qid": qid, 47 | } 48 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/nlvr2_dataset.py: -------------------------------------------------------------------------------- 1 | from .base_dataset import BaseDataset 2 | import sys 3 | import random 4 | 5 | 6 | class NLVR2Dataset(BaseDataset): 7 | def __init__(self, *args, split="", **kwargs): 8 | assert split in ["train", "val", "test"] 9 | self.split = split 10 | 11 | if split == "train": 12 | names = ["nlvr2_train"] 13 | elif split == "val": 14 | names = ["nlvr2_dev", "nlvr2_test1"] 15 | elif split == "test": 16 | names = ["nlvr2_dev", "nlvr2_test1"] 17 | 18 | super().__init__( 19 | *args, 20 | **kwargs, 21 | names=names, 22 | text_column_name="questions", 23 | remove_duplicate=False, 24 | ) 25 | 26 | def __getitem__(self, index): 27 | 
result = None 28 | while result is None: 29 | try: 30 | image_tensor_0 = self.get_image(index, image_key="image_0")["image"] 31 | image_tensor_1 = self.get_image(index, image_key="image_1")["image"] 32 | text = self.get_text(index)["text"] 33 | result = True 34 | except: 35 | print( 36 | f"error while read file idx {index} in {self.names[0]}", 37 | file=sys.stderr, 38 | ) 39 | index = random.randint(0, len(self.index_mapper) - 1) 40 | 41 | index, question_index = self.index_mapper[index] 42 | answers = self.table["answers"][index][question_index].as_py() 43 | answers = answers == "True" 44 | 45 | return { 46 | "image_0": image_tensor_0, 47 | "image_1": image_tensor_1, 48 | "text": text, 49 | "answers": answers, 50 | "table_name": self.table_names[index], 51 | } 52 | -------------------------------------------------------------------------------- /allinone/AllInOne/transforms/utils.py: -------------------------------------------------------------------------------- 1 | from torchvision import transforms 2 | from PIL import Image 3 | 4 | 5 | class MinMaxResize: 6 | def __init__(self, shorter=800, longer=1333): 7 | self.min = shorter 8 | self.max = longer 9 | 10 | def __call__(self, x): 11 | w, h = x.size 12 | scale = self.min / min(w, h) 13 | if h < w: 14 | newh, neww = self.min, scale * w 15 | else: 16 | newh, neww = scale * h, self.min 17 | 18 | if max(newh, neww) > self.max: 19 | scale = self.max / max(newh, neww) 20 | newh = newh * scale 21 | neww = neww * scale 22 | 23 | newh, neww = int(newh + 0.5), int(neww + 0.5) 24 | newh, neww = newh // 32 * 32, neww // 32 * 32 25 | 26 | return x.resize((neww, newh), resample=Image.BICUBIC) 27 | 28 | 29 | class UnNormalize(object): 30 | def __init__(self, mean, std): 31 | self.mean = mean 32 | self.std = std 33 | 34 | def __call__(self, tensor): 35 | """ 36 | Args: 37 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 38 | Returns: 39 | Tensor: Normalized image. 
40 | """ 41 | for t, m, s in zip(tensor, self.mean, self.std): 42 | t.mul_(s).add_(m) 43 | # The normalize code -> t.sub_(m).div_(s) 44 | return tensor 45 | 46 | 47 | # This is simple maximum entropy normalization performed in Inception paper 48 | inception_normalize = transforms.Compose( 49 | [transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])] 50 | ) 51 | 52 | # ViT uses simple non-biased inception normalization 53 | # https://github.com/google-research/vision_transformer/blob/master/vit_jax/input_pipeline.py#L132 54 | inception_unnormalize = transforms.Compose( 55 | [UnNormalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])] 56 | ) 57 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/msvd.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset 2 | import random 3 | import os 4 | import pandas as pd 5 | 6 | 7 | class MSVDDataset(BaseDataset): 8 | def __init__(self, *args, split="", **kwargs): 9 | assert split in ["train", "val", "test"] 10 | self.split = split 11 | self.metadata = None 12 | if split == "train": 13 | names = ["msvd_train"] 14 | elif split == "val": 15 | names = ["msvd_val"] 16 | elif split == "test": 17 | names = ["msvd_test"] 18 | self._load_metadata() 19 | # self.num_frames = kwargs['num_frames'] 20 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 21 | 22 | def _load_metadata(self): 23 | metadata_dir = './meta_data/msvd' 24 | split_files = { 25 | 'train': 'MSVD_train.tsv', 26 | 'val': 'MSVD_test.tsv', # MSVD_val.tsv 27 | 'test': 'MSVD_test.tsv' 28 | } 29 | target_split_fp = split_files[self.split] 30 | metadata = pd.read_csv(os.path.join(metadata_dir, target_split_fp), sep='\t') 31 | self.metadata = metadata 32 | print("load split {}, {} samples".format(self.split, len(metadata))) 33 | 34 | def _get_video_path(self, sample): 35 | rel_video_fp = sample[1] + '.avi' 36 | full_video_fp = os.path.join(self.data_dir, 'YouTubeClips', rel_video_fp) 37 | return full_video_fp, rel_video_fp 38 | 39 | def _get_caption(self, sample): 40 | if self.split == 'train': 41 | words = sample[0].split(',') 42 | num_word = len(words) 43 | index = random.randint(0, num_word - 1) 44 | caption = words[index] 45 | else: 46 | # caption = sample[0] 47 | words = sample[0].split(',') 48 | num_word = len(words) 49 | index = random.randint(0, num_word - 1) 50 | caption = words[index] 51 | return caption 52 | -------------------------------------------------------------------------------- /allinone/AllInOne/transforms/videoaug.py: -------------------------------------------------------------------------------- 1 | # input: (C, T, H, W) output: (C, T, H, W) 2 | def VideoTransform(mode='train', crop_size=224, backend='v100'): 3 | if backend == 'a100': 4 | print("initalize data augmentation for a100 gpus") 5 | import AllInOne.transforms.video_transform as video_transform 6 | from torchvision import transforms 7 | # https://github.com/FingerRec/BE/blob/main/src/Contrastive/augment/video_transformations/volume_transforms.py 8 | if mode == 'train': 9 | data_transforms = transforms.Compose([ 10 | video_transform.TensorToNumpy(), 11 | video_transform.Resize(int(crop_size*1.2)), # 256/224 = 1.14 12 | video_transform.RandomCrop(crop_size), 13 | # video_transform.ColorJitter(0.5, 0.5, 0.25, 0.5), # color operation perimitted, damage attribute 14 | video_transform.ClipToTensor(channel_nb=3), 15 | video_transform.Normalize(mean=[0.485, 0.456, 
0.406], std=[0.229, 0.224, 0.225]) 16 | ]) 17 | else: 18 | data_transforms = transforms.Compose([ 19 | video_transform.TensorToNumpy(), 20 | video_transform.Resize(int(crop_size*1.2)), # 256 21 | video_transform.CenterCrop(crop_size), # 224 22 | video_transform.ClipToTensor(channel_nb=3), 23 | video_transform.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 24 | ]) 25 | return data_transforms 26 | else: 27 | # for pytorch > 1.9.0, V100 28 | import pytorchvideo.transforms as video_transforms 29 | # https://pytorchvideo.readthedocs.io/en/latest/api/transforms/transforms.html 30 | return video_transforms.create_video_transform(mode=mode, min_size=int(crop_size*1.2), 31 | max_size=int(crop_size*1.5), 32 | crop_size=crop_size, 33 | aug_type='randaug', # randaug/augmix 34 | num_samples=None) # not use temporal sub sampling -------------------------------------------------------------------------------- /violet/tools/extract_vq.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse, base64, io, pickle 3 | from glob import glob 4 | 5 | from tqdm import tqdm 6 | 7 | import numpy as np 8 | import torch as T 9 | import torchvision as TV 10 | from dall_e import map_pixels, unmap_pixels, load_model 11 | 12 | from PIL import Image 13 | 14 | def get_args(): 15 | parser = argparse.ArgumentParser() 16 | 17 | parser.add_argument('--path', required=True, type=str) 18 | parser.add_argument('--frame', required=True, type=int) 19 | 20 | args = parser.parse_args() 21 | 22 | return args 23 | 24 | def proc_buf(buf, _F): 25 | img = Image.open(io.BytesIO(base64.b64decode(buf))) 26 | w, h = img.size 27 | img = TV.transforms.Compose([TV.transforms.Pad([0, (w-h)//2] if w>h else [(h-w)//2, 0]), 28 | TV.transforms.Resize([_F, _F]), 29 | TV.transforms.ToTensor()])(img).unsqueeze(0) 30 | img = map_pixels(img) 31 | return img 32 | 33 | if __name__=='__main__': 34 | args = get_args() 35 | 36 | dalle_enc = load_model('encoder.pkl', T.device('cpu')).cuda() # https://cdn.openai.com/dall-e/encoder.pkl 37 | # dalle_dec = load_model('decoder.pkl', T.device('cpu')).cuda() # https://cdn.openai.com/dall-e/decoder.pkl 38 | 39 | 40 | lst = glob(f'{args.path}/pickles/*.pkl') 41 | pickle_list = [] 42 | for file in tqdm(lst): 43 | pickle_list.append(pickle.load(open(f'{file}', 'rb'))) 44 | 45 | for pkl in tqdm(pickle_list): 46 | vq = {} 47 | for vid in pkl: 48 | imgs = [proc_buf(b, int(args.frame//32*8)) for b in pkl[vid]] 49 | imgs = T.cat(imgs, dim=0) 50 | 51 | z = dalle_enc(imgs.cuda()) 52 | z = T.argmax(z, dim=1) 53 | vq[vid] = z.data.cpu().numpy().astype(np.int16) 54 | 55 | '''o = T.nn.functional.one_hot(z, num_classes=dalle_enc.vocab_size).permute(0, 3, 1, 2).float() 56 | o = dalle_dec(o).float() 57 | rec = unmap_pixels(T.sigmoid(o[:, :3])) 58 | rec = [TV.transforms.ToPILImage(mode='RGB')(r) for r in rec]''' 59 | pickle.dump(vq, open(f'{args.path}/vq/{vid}_vq.pkl', 'wb')) 60 | -------------------------------------------------------------------------------- /allinone/AllInOne/modules/heads.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from transformers.models.bert.modeling_bert import BertPredictionHeadTransform 6 | 7 | 8 | class Pooler(nn.Module): 9 | def __init__(self, hidden_size): 10 | super().__init__() 11 | self.dense = nn.Linear(hidden_size, hidden_size) 12 | self.activation = nn.Tanh() 13 | 14 | def forward(self, hidden_states): 15 | 
# print(hidden_states.size()) # 64 x 237 x 768 16 | first_token_tensor = hidden_states[:, 0] 17 | pooled_output = self.dense(first_token_tensor) 18 | pooled_output = self.activation(pooled_output) 19 | return pooled_output 20 | 21 | 22 | class ITMHead(nn.Module): 23 | def __init__(self, hidden_size): 24 | super().__init__() 25 | self.fc = nn.Linear(hidden_size, 2) 26 | 27 | def forward(self, x): 28 | x = self.fc(x) 29 | return x 30 | 31 | 32 | class MLMHead(nn.Module): 33 | def __init__(self, config, weight=None): 34 | super().__init__() 35 | self.transform = BertPredictionHeadTransform(config) 36 | self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) 37 | self.bias = nn.Parameter(torch.zeros(config.vocab_size)) 38 | if weight is not None: 39 | self.decoder.weight = weight 40 | 41 | def forward(self, x): 42 | x = self.transform(x) 43 | x = self.decoder(x) + self.bias 44 | return x 45 | 46 | 47 | class MPPHead(nn.Module): 48 | def __init__(self, config): 49 | super().__init__() 50 | self.transform = BertPredictionHeadTransform(config) 51 | self.decoder = nn.Linear(config.hidden_size, 256 * 3) 52 | 53 | def forward(self, x): 54 | x = self.transform(x) 55 | x = self.decoder(x) 56 | return x 57 | 58 | 59 | class MLP(nn.Module): 60 | def __init__(self, hidden_size): 61 | super().__init__() 62 | self.fc = nn.Sequential( 63 | nn.Linear(hidden_size, 128), 64 | nn.GELU(), 65 | nn.Linear(128, 1)) 66 | 67 | # self.fc = nn.Linear(hidden_size, 1) 68 | 69 | def forward(self, x): 70 | 71 | x = self.fc(x.squeeze(-1)) 72 | return x -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/lsmdc_dataset.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset 2 | import random 3 | import os 4 | import pandas as pd 5 | from AllInOne.transforms.videoaug import VideoTransform 6 | 7 | 8 | class LSMDCDataset(BaseDataset): 9 | def __init__(self, *args, split="", **kwargs): 10 | assert split in ["train", "val", "test"] 11 | self.split = split 12 | self.metadata = None 13 | if split == "train": 14 | names = ["lsmdc_train"] 15 | elif split == "val": 16 | names = ["lsmdc_val"] 17 | elif split == "test": 18 | names = ["lsmdc_test"] 19 | self._load_metadata() 20 | # self.num_frames = kwargs['num_frames'] 21 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 22 | 23 | def _load_metadata(self): 24 | metadata_dir = './meta_data/lsmdc' 25 | split_files = { 26 | 'train': 'LSMDC16_annos_training.csv', 27 | 'val': 'LSMDC16_challenge_1000_publictect.csv', # LSMDC16_annos_val.csv 28 | 'test': 'LSMDC16_challenge_1000_publictect.csv' 29 | } 30 | target_split_fp = split_files[self.split] 31 | metadata = pd.read_csv(os.path.join(metadata_dir, target_split_fp), sep='\t', header=None, error_bad_lines=False) 32 | self.metadata = metadata 33 | print("load split {}, {} samples".format(self.split, len(metadata))) 34 | 35 | def _get_video_path(self, sample): 36 | # e.g. 
3009_BATTLE_LOS_ANGELES_00.03.07.170-00.03.09.675 -> 3009_BATTLE_LOS_ANGELES/3009_BATTLE_LOS_ANGELES_00.03.07.170-00.03.09.675 37 | sub_dir = '_'.join(sample[0].split('_')[:-1]) 38 | rel_video_fp = sample[0] + '.avi' 39 | full_video_fp = os.path.join(self.data_dir, sub_dir, rel_video_fp) 40 | return full_video_fp, rel_video_fp 41 | 42 | def _get_caption(self, sample): 43 | if self.split == 'train': 44 | words = sample[0].split(',') 45 | num_word = len(words) 46 | index = random.randint(0, num_word - 1) 47 | caption = words[index] 48 | else: 49 | # caption = sample[0] 50 | words = sample[0].split(',') 51 | num_word = len(words) 52 | index = random.randint(0, num_word - 1) 53 | caption = words[index] 54 | return caption 55 | -------------------------------------------------------------------------------- /violet/README.md: -------------------------------------------------------------------------------- 1 | # Violet + MELTR 2 | 3 | 4 | ## Preparation 5 | 6 | ### Requirements 7 | 8 | Our code is implemented under [Violet](https://github.com/tsujuifu/pytorch_violet) environment with PyTorch 1.10+. 9 | 10 | ### Datasets 11 | 12 | We use three datasets (MSRVTT, TGIF, and MSVD). Violet also provides downstream datasets and annotation files [here](https://drive.google.com/drive/u/2/folders/1BisJHVUOLeHWmnAeMrCHvy1BP9XBXNkQ). 13 | 14 | Annotation files of msrvtt can be found [here](https://drive.google.com/drive/folders/1rVnRBZ45g96TlTnxbFBBP2AVfOB-Tf3J). 15 | 16 | Download them and run the below command to extract VQ tokens for MVM. 17 | 18 | ``` 19 | cd tools 20 | wget https://cdn.openai.com/dall-e/encoder.pkl # download trained dall-e encoder 21 | python extract_vq.py --path=msrvtt --frame=224 # output: msrvtt_vq.pkl 22 | ``` 23 | 24 | ### Pretrained checkpoint 25 | 26 | You can download the pretrained checkpoint of Violet [here](https://drive.google.com/file/d/1RLbthdRIflxCFjRTcVV5jQJGP30_lNfg/view). 27 | 28 | Then, place the files as follows: 29 | 30 | ``` 31 | data 32 | |─ msrvtt 33 | │ |─ img_msrvtt.pkl 34 | │ │─ msrvtt_vq.pkl 35 | | │─ train_9k.json 36 | | │─ train_7k.json 37 | | │─ test.json 38 | | 39 | |─ tgif 40 | | │─ img_tgif.pkl 41 | | │─ tgif_vq.pkl 42 | | |─ txt_tgif-action.json 43 | | |─ txt_tgif-transition.json 44 | | |─ txt_tgif-frame.json 45 | | 46 | |─ msvd 47 | | │─ img_msvd.pkl 48 | | │─ msvd_vq.pkl 49 | | │─ txt_msvd-qa.json 50 | 51 | checkpoint 52 | |─ ckpt_violet_pretrain.pt 53 | ``` 54 | 55 | 56 | 57 | ## Training & Evaluation 58 | 59 | + Multiple-Choice Question Answering 60 | ``` 61 | python main_qamc.py ./args/args_tgif-action.json 62 | python main_qamc.py ./args/args_tgif-transition.json 63 | ``` 64 | + Open-Ended Question Answering 65 | ``` 66 | python main_qaoe.py ./args/args_msvd-qaoe.json 67 | python main_qaoe.py ./args/args_tgif-frame.json 68 | ``` 69 | + Text-to-Video Retrieval 70 | ``` 71 | python main_retrieval.py ./args/args_msrvtt-retrieval_7k.json 72 | python main_retrieval.py ./args/args_msrvtt-retrieval_9k.json 73 | python eval_retrieval.py ./args/args_msrvtt-retrieval_eval.json 74 | ``` 75 | You may modify 'path_ckpt' of './args/args_msrvtt-retrieval_eval.json' for evaluation. 76 | 77 | 78 | 79 | ## Acknowledgement 80 | 81 | This repo is built upon [Violet](https://github.com/tsujuifu/pytorch_violet). 
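## MELTR meta-update (editor's sketch)

The README and `agent.py` (shown earlier in this listing) reference the MELTR meta-update but do not spell out how it is invoked. The toy below is an editor's sketch, not code from `main_*.py`: it assumes `violet/meltr.py` exposes the same `MELTROptimizer.step(train_loss, val_loss, parameters, aux_params)` interface as the UniVL version (`univl/modules/meltr.py`) listed elsewhere in this repository, and it substitutes tiny linear modules for the real video-language model and the MELTR loss-combining network.

```python
import torch as T
from meltr import MELTROptimizer  # assumes the violet/ directory is on the path

model = T.nn.Linear(4, 1)                    # stand-in for the fine-tuned video-language model
aux_model = T.nn.Linear(3, 1, bias=False)    # stand-in for the MELTR loss-combining network

aux_optzr = T.optim.AdamW(aux_model.parameters(), lr=1e-4)
meta_optim = MELTROptimizer(meta_optimizer=aux_optzr, max_grad_norm=12)

x = T.randn(8, 4)
# Three hypothetical auxiliary task losses, each a function of the model parameters.
task_losses = T.stack([((model(x) - t) ** 2).mean() for t in (0.0, 0.5, 1.0)])
train_loss = aux_model(task_losses).squeeze()            # combined loss depends on aux_model params
val_loss = ((model(T.randn(8, 4)) - 1.0) ** 2).mean()    # primary-task loss on held-out data

# Outer update: differentiate val_loss through the train_loss gradient and step only aux_model.
meta_optim.step(train_loss=train_loss, val_loss=val_loss,
                parameters=list(model.parameters()),
                aux_params=list(aux_model.parameters()))
```

In the released args files this outer step is rate-limited by `auxgrad_every` and clipped with `max_grad_norm`; the inner model update itself is performed by the usual `AdamW` optimizer built in `Agent_Base_MELTR`.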
82 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/ego4d.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset, video_reader, read_large_frames_decord 2 | import torch as th 3 | from torch.utils.data import Dataset 4 | import pandas as pd 5 | import os 6 | import numpy as np 7 | import random 8 | import ffmpeg 9 | import time 10 | import re 11 | import json 12 | from AllInOne.transforms.videoaug import VideoTransform 13 | import cv2 14 | import subprocess 15 | 16 | # {'timestamp_sec': 221.29666, 'narration_text': '#C C walks on the ground'} 17 | 18 | 19 | class Ego4DDataset(BaseDataset): 20 | """EGO4D Video-Text loader.""" 21 | 22 | def __init__(self, *args, split="", **kwargs): 23 | assert split in ["train", "val", "test"] 24 | self.split = split 25 | 26 | if split == "train": 27 | names = ["ego4d_train"] 28 | elif split == "val": 29 | names = ["ego4d_val"] 30 | elif split == "test": 31 | names = ["ego4d_test"] 32 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 33 | 34 | self._load_metadata() 35 | 36 | def _load_metadata(self): 37 | metadata_dir = './meta_data/ego4d' 38 | split_files = { 39 | 'train': 'ego4d_train_subset.csv', 40 | 'val': 'ego4d_val_ts_clean.csv', 41 | 'test': 'ego4d_val_ts_clean.csv' # there is no test 42 | } 43 | target_split_fp = split_files[self.split] 44 | self.metadata = pd.read_csv(os.path.join(metadata_dir, target_split_fp), sep='\t', header=None, error_bad_lines=False) 45 | 46 | def _get_video_path(self, sample): 47 | rel_video_fp = sample[0] + '.mp4' 48 | full_video_fp = os.path.join(self.data_dir, 'videos', rel_video_fp) 49 | if not os.path.exists(full_video_fp): 50 | Exception(IOError) 51 | return full_video_fp, rel_video_fp 52 | 53 | def _get_caption(self, sample): 54 | return sample[6] 55 | 56 | def get_raw_video(self, sample): 57 | abs_fp, rel_fp = self._get_video_path(sample) 58 | # if int(sample[2]) > 600: 59 | # raise Exception("Video is longer than 10m!", rel_fp) 60 | frame_end, frame_loc = int(sample[3]), int(sample[5]) 61 | # imgs = video_reader(abs_fp, frame_loc, frame_end, self.num_frames) 62 | imgs = read_large_frames_decord(abs_fp, frame_loc, frame_end, self.num_frames) 63 | if imgs is None: 64 | raise Exception("Invalid video!", rel_fp) 65 | else: 66 | return imgs 67 | 68 | -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/__init__.py: -------------------------------------------------------------------------------- 1 | # pretrain dataset 2 | ## video 3 | from .webvid_datamodule import WEBVIDDataModule 4 | from .howto100m_datamodule import HT100MDataModule 5 | from .yttemporal_datamodule import YTTemporalMDataModule 6 | ## image 7 | from .cc3m_datamodule import CC3MDataModule 8 | from .vg_caption_datamodule import VisualGenomeCaptionDataModule 9 | from .coco_caption_karpathy_datamodule import CocoCaptionKarpathyDataModule 10 | from .sbu_datamodule import SBUCaptionDataModule 11 | # finetune dataset 12 | ## image 13 | from .f30k_caption_karpathy_datamodule import F30KCaptionKarpathyDataModule 14 | from .vqav2_datamodule import VQAv2DataModule 15 | from .nlvr2_datamodule import NLVR2DataModule 16 | from .msrvtt_datamodule import MSRVTTDataModule 17 | from .msrvttqa_datamodule import MSRVTTQADataModule 18 | from .msrvtt_choice_datamodule import MSRVTTChoiceDataModule 19 | from .msvd_datamodule import MSVDDataModule 20 | 
from .msvdqa_datamodule import MSVDQADataModule 21 | from .vcr_datamodule import VCRDataModule 22 | ## video 23 | from .ego4d_datamodule import Ego4DDataModule 24 | from .tvqa_datamodule import TVQADataModule 25 | from .lsmdc_choice_datamodule import LSMDCChoiceDataModule 26 | from .ego4d_choice_datamodule import EGO4DChoiceDataModule 27 | from .tgif_datamodule import TGIFDataModule 28 | from .tgifqa_datamodule import TGIFQADataModule 29 | from .didemo_datamodule import DIDEMODataModule 30 | from .hmdb51_datamodule import HMDB51DataModule 31 | from .k400_datamodule import K400DataModule 32 | from .lsmdc_datamodule import LSMDCDataModule 33 | from .activitynet_datamodule import ActivityNetDataModule 34 | 35 | _datamodules = { 36 | "vg": VisualGenomeCaptionDataModule, 37 | "f30k": F30KCaptionKarpathyDataModule, 38 | "coco": CocoCaptionKarpathyDataModule, 39 | "sbu": SBUCaptionDataModule, 40 | "vqa": VQAv2DataModule, 41 | "nlvr2": NLVR2DataModule, 42 | "cc3m": CC3MDataModule, 43 | 'howto100m': HT100MDataModule, 44 | 'webvid': WEBVIDDataModule, 45 | 'msrvtt': MSRVTTDataModule, 46 | 'msrvttqa': MSRVTTQADataModule, 47 | 'msrvtt_choice': MSRVTTChoiceDataModule, 48 | 'msvd': MSVDDataModule, 49 | 'msvdqa': MSVDQADataModule, 50 | 'vcr': VCRDataModule, 51 | 'ego4d': Ego4DDataModule, 52 | 'tvqa': TVQADataModule, 53 | 'lsmdc_choice': LSMDCChoiceDataModule, 54 | 'ego4d_choice': EGO4DChoiceDataModule, 55 | 'yttemporal': YTTemporalMDataModule, 56 | 'tgif': TGIFDataModule, 57 | "tgifqa": TGIFQADataModule, 58 | 'didemo': DIDEMODataModule, 59 | 'hmdb51': HMDB51DataModule, 60 | 'k400': K400DataModule, 61 | 'lsmdc': LSMDCDataModule, 62 | 'activitynet': ActivityNetDataModule 63 | } 64 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/msrvtt.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset 2 | import random 3 | import os 4 | import pandas as pd 5 | import json 6 | import numpy as np 7 | 8 | 9 | class MSRVTTDataset(BaseDataset): 10 | def __init__(self, *args, split="", **kwargs): 11 | assert split in ["train", "val", "test"] 12 | self.split = split 13 | self.metadata = None 14 | self.cut = "7k" 15 | if split == "train": 16 | names = ["msrvtt_train"] 17 | elif split == "val": 18 | names = ["msrvtt_val"] 19 | elif split == "test": 20 | names = ["msrvtt_val"] 21 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 22 | 23 | self._load_metadata() 24 | 25 | def _load_metadata(self): 26 | json_fp = os.path.join(self.data_dir, 'MSR_VTT.json') 27 | with open(json_fp, 'r') as fid: 28 | data = json.load(fid) 29 | df = pd.DataFrame(data['annotations']) 30 | 31 | js_test_cap_idx_path = None 32 | if self.cut == "7k": 33 | train_list_path = "train_list_7k.txt" 34 | elif self.cut == "9k": 35 | train_list_path = "train_list_9k.txt" 36 | test_list_path = "val_list_jsfusion.txt" 37 | js_test_cap_idx_path = "jsfusion_val_caption_idx.pkl" 38 | 39 | 40 | train_df = pd.read_csv(os.path.join(self.data_dir, train_list_path), names=['videoid']) 41 | test_df = pd.read_csv(os.path.join(self.data_dir, test_list_path), names=['videoid']) 42 | self.split_sizes = {'train': len(train_df), 'val': len(test_df), 'test': len(test_df)} 43 | 44 | if self.split == 'train': 45 | df = df[df['image_id'].isin(train_df['videoid'])] 46 | else: 47 | df = df[df['image_id'].isin(test_df['videoid'])] 48 | 49 | self.metadata = df.groupby(['image_id'])['caption'].apply(list) 50 | if 
js_test_cap_idx_path is not None and self.split != 'train': 51 | caps = pd.Series(np.load(os.path.join(self.data_dir, js_test_cap_idx_path), allow_pickle=True)) 52 | new_res = pd.DataFrame({'caps': self.metadata, 'cap_idx': caps}) 53 | new_res['test_caps'] = new_res.apply(lambda x: [x['caps'][x['cap_idx']]], axis=1) 54 | self.metadata = new_res['test_caps'] 55 | 56 | self.metadata = pd.DataFrame({'captions': self.metadata}) 57 | print("load split {}, {} samples".format(self.split, len(self.metadata))) 58 | 59 | # random choice or fixed? 60 | def _get_caption(self, sample): 61 | caption_sample = "rand" 62 | if self.split in ['train', 'val'] and caption_sample == "rand": 63 | caption = random.choice(sample['captions']) 64 | else: 65 | caption = sample['captions'][0] 66 | return caption 67 | 68 | -------------------------------------------------------------------------------- /univl/modules/meltr.py: -------------------------------------------------------------------------------- 1 | from torch.nn import functional as F 2 | from typing import Optional 3 | from torch import Tensor 4 | from torch import nn 5 | import torch 6 | from torch.nn.utils import clip_grad_norm_ 7 | 8 | class MELTRgrad: 9 | def __init__(self): 10 | pass 11 | 12 | def grad(self, loss_val, loss_train, aux_params, params): 13 | 14 | dwdA = torch.autograd.grad( 15 | loss_val, 16 | params, 17 | retain_graph=True, 18 | allow_unused=True 19 | ) 20 | 21 | dwdT = torch.autograd.grad( 22 | loss_train, 23 | params, 24 | create_graph=True, 25 | allow_unused=True 26 | ) 27 | 28 | temp_t, temp_a = [], [] 29 | for t, a in zip(dwdT, dwdA): 30 | if a is None: 31 | continue 32 | temp_t.append(t) 33 | temp_a.append(a) 34 | 35 | v4 = torch.autograd.grad( 36 | tuple(temp_t), 37 | aux_params, 38 | grad_outputs=tuple(temp_a), 39 | allow_unused=True, 40 | ) 41 | return v4 42 | 43 | class MELTROptimizer: 44 | 45 | def __init__(self, meta_optimizer, max_grad_norm=10): 46 | self.meta_optimizer = meta_optimizer 47 | self.hypergrad = MELTRgrad() 48 | 49 | self.max_grad_norm = max_grad_norm 50 | 51 | def step(self, train_loss, val_loss, parameters, aux_params): 52 | self.zero_grad() 53 | 54 | hyper_gards = self.hypergrad.grad( 55 | loss_val=val_loss, 56 | loss_train=train_loss, 57 | aux_params=aux_params, 58 | params=parameters, 59 | ) 60 | for p, g in zip(aux_params, hyper_gards): 61 | if g is not None: 62 | p.grad = -g 63 | 64 | if self.max_grad_norm is not None: 65 | clip_grad_norm_(aux_params, max_norm=self.max_grad_norm) 66 | 67 | self.meta_optimizer.step() 68 | 69 | def zero_grad(self): 70 | self.meta_optimizer.zero_grad() 71 | 72 | class MELTR(nn.Module): 73 | def __init__(self, t_dim, f_dim, i_dim, h1_dim, h2_dim, o_dim): 74 | super(MELTR, self).__init__() 75 | self.task_embedding = nn.Embedding(t_dim, h2_dim) 76 | self.loss_fc1 = nn.Linear(i_dim, h1_dim) 77 | self.activation = nn.ReLU() 78 | self.loss_fc2 = nn.Linear(h1_dim, h2_dim) 79 | 80 | self.encoder = nn.TransformerEncoderLayer(d_model=h2_dim, nhead=8, batch_first=True, dim_feedforward=f_dim) 81 | self.fc1 = nn.Linear(h2_dim, o_dim, bias=False) 82 | 83 | def forward(self, x): 84 | scale_embedding = self.loss_fc2(self.activation(self.loss_fc1(x))) 85 | input = scale_embedding + self.task_embedding.weight 86 | output = self.encoder(input.unsqueeze(0)) 87 | output = self.fc1(output.mean(1)) 88 | return output 89 | -------------------------------------------------------------------------------- /allinone/AllInOne/modules/temporal_roll.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import random 4 | 5 | 6 | class TemporalRoll(nn.Module): 7 | def __init__(self, n_segment=3, n_div=8, v=0): 8 | super(TemporalRoll, self).__init__() 9 | self.n_segment = n_segment 10 | self.fold_div = n_div 11 | self.v = v 12 | 13 | def forward(self, x, layer=1): 14 | # return x 15 | nt, l, c = x.size() 16 | n_batch = nt // self.n_segment 17 | x = x.view(n_batch, self.n_segment, l, c) 18 | if self.v == 0: 19 | # 16, 3, 197, 768 20 | fold = l // self.fold_div 21 | out = torch.zeros_like(x) 22 | # keep cls token 23 | out[:, :, 0] = x[:, :, 0] 24 | # roll left step 1 along time dimension (1) 25 | out[:, :, 1:fold+1] = torch.roll(x[:, :, 1:fold+1], 1, 1) 26 | # roll right step 1 along time dimension (1) 27 | out[:, :, -fold:] = torch.roll(x[:, :, -fold:], -1, 1) 28 | # not roll 29 | out[:, :, 1+fold:-fold] = x[:, :, 1+fold: -fold] 30 | # # 16, 3, 197, 768 31 | # fold = l // self.fold_div 32 | # out = torch.zeros_like(x) 33 | # # roll left step 1 along time dimension (1) 34 | # out[:, :, :fold] = torch.roll(x[:, :, :fold], 1, 1) 35 | # # roll right step 1 along time dimension (1) 36 | # out[:, :, -fold:] = torch.roll(x[:, :, -fold:], -1, 1) 37 | # # not roll 38 | # out[:, :, fold:-fold] = x[:, :, fold: -fold] 39 | # random sampling 40 | elif self.v == 1: 41 | out = torch.zeros_like(x) 42 | roll_token_idexs = random.sample(range(1, l), l//2) 43 | # print(roll_token_idexs) 44 | out = x 45 | out[:, :, roll_token_idexs] = torch.roll(x[:, :, roll_token_idexs], 1, 1) 46 | # roll different tokens for different blocks 47 | elif self.v == 2: 48 | rolled_token_len = l // self.fold_div 49 | fold = rolled_token_len * (layer % self.fold_div) 50 | begin_index = 1 + fold 51 | end_index = min(1 + fold + rolled_token_len, l) 52 | out = torch.zeros_like(x) 53 | out[:, :, 0] = x[:, :, 0] # cls token unchanged 54 | out[:, :, begin_index:] = x[:, :, begin_index:] 55 | out[:, :, begin_index:end_index] = torch.roll(x[:, :, begin_index:end_index], 1, 1) 56 | out[:, :, end_index:] = x[:, :, end_index:] 57 | else: # not roll 58 | fold = c // self.fold_div 59 | out = torch.zeros_like(x) 60 | out[:, :-1, :fold] = x[:, 1:, :fold] # shift left tokens 61 | out[:, 1:, fold: 2 * fold] = x[:, :-1, fold: 2 * fold] # shift right tokens 62 | out[:, :, 2 * fold:] = x[:, :, 2 * fold:] # not shift 63 | return out.view(nt, l, c) -------------------------------------------------------------------------------- /violet/meltr.py: -------------------------------------------------------------------------------- 1 | from lib import * 2 | from torch.nn.utils import clip_grad_norm_ 3 | from torch.nn import functional as F 4 | from typing import Optional, Any 5 | from torch import Tensor 6 | from torch import nn 7 | import torch 8 | 9 | 10 | class MELTRgrad: 11 | def __init__(self): 12 | pass 13 | 14 | def grad(self, loss_val, loss_train, aux_params, params): 15 | dwdA = T.autograd.grad( 16 | loss_val, 17 | params, 18 | retain_graph=True, 19 | allow_unused=True 20 | ) 21 | 22 | dwdT = T.autograd.grad( 23 | loss_train, 24 | params, 25 | create_graph=True, 26 | allow_unused=True 27 | ) 28 | 29 | temp_t, temp_a = [], [] 30 | for t, a in zip(dwdT, dwdA): 31 | if a is None: 32 | continue 33 | temp_t.append(t) 34 | temp_a.append(a) 35 | 36 | v4 = T.autograd.grad( 37 | tuple(temp_t), 38 | aux_params, 39 | grad_outputs=tuple(temp_a), 40 | allow_unused=True 41 | ) 42 | 43 | return v4 44 | 45 | 46 | class MELTROptimizer: 47 | 
def __init__(self, meta_optimizer, hpo_lr, max_grad_norm=10): 48 | self.meta_optimizer = meta_optimizer 49 | self.hypergrad = MELTRgrad() 50 | 51 | self.max_grad_norm = max_grad_norm 52 | 53 | def step(self, train_loss, val_loss, parameters, aux_params): 54 | self.zero_grad() 55 | hyper_grads = self.hypergrad.grad( 56 | loss_val=val_loss, 57 | loss_train=train_loss, 58 | aux_params=aux_params, 59 | params=parameters, 60 | ) 61 | for p, g in zip(aux_params, hyper_grads): 62 | if g is not None: 63 | p.grad = -g 64 | 65 | if self.max_grad_norm is not None: 66 | clip_grad_norm_(aux_params, max_norm=self.max_grad_norm) 67 | 68 | self.meta_optimizer.step() 69 | 70 | 71 | def zero_grad(self): 72 | self.meta_optimizer.zero_grad() 73 | 74 | 75 | class MELTR(nn.Module): 76 | def __init__(self, t_dim, f_dim, i_dim, h1_dim, h2_dim, o_dim): 77 | super(MELTR, self).__init__() 78 | self.task_embedding = nn.Embedding(t_dim, h2_dim) 79 | self.loss_fc1 = nn.Linear(i_dim, h1_dim) 80 | self.activation1 = nn.ReLU() 81 | self.loss_fc2 = nn.Linear(h1_dim, h2_dim) 82 | 83 | self.encoder = nn.TransformerEncoderLayer(d_model=h2_dim, nhead=8, batch_first=True, dim_feedforward=f_dim) 84 | self.fc1 = nn.Linear(h2_dim, o_dim, bias=False) 85 | 86 | def forward(self, x): 87 | scale_embedding = self.loss_fc2(self.activation1(self.loss_fc1(x))) 88 | input = scale_embedding + self.task_embedding.weight 89 | output = self.encoder(input) 90 | output = self.fc1(output.mean(1)) 91 | return output -------------------------------------------------------------------------------- /allinone/AllInOne/modules/meltr.py: -------------------------------------------------------------------------------- 1 | from torch.nn.utils import clip_grad_norm_ 2 | import torch 3 | from torch.nn import functional as F 4 | from typing import Optional, Any 5 | from torch import Tensor 6 | from torch import nn 7 | import torch 8 | 9 | class MELTRgrad: 10 | def __init__(self): 11 | pass 12 | 13 | def grad(self, loss_train, loss_val, params, aux_params): 14 | dwdA = torch.autograd.grad( 15 | loss_val, 16 | params, 17 | retain_graph=True, 18 | allow_unused=True 19 | ) 20 | 21 | dwdT = torch.autograd.grad( 22 | loss_train, 23 | params, 24 | create_graph=True, 25 | allow_unused=True 26 | ) 27 | 28 | temp_t, temp_a = [], [] 29 | for t, a in zip(dwdT, dwdA): 30 | if a is None: 31 | continue 32 | temp_t.append(t) 33 | temp_a.append(a) 34 | 35 | 36 | v4 = torch.autograd.grad( 37 | tuple(temp_t), 38 | aux_params, 39 | grad_outputs=tuple(temp_a), 40 | allow_unused=True 41 | ) 42 | 43 | return v4 44 | 45 | 46 | class MELTROptimizer: 47 | def __init__(self, meta_optimizer, max_grad_norm=10): 48 | self.meta_optimizer = meta_optimizer 49 | self.hypergrad = MELTRgrad() 50 | 51 | self.max_grad_norm = max_grad_norm 52 | 53 | def step(self, train_loss, val_loss, parameters, aux_params): 54 | self.zero_grad() 55 | 56 | hyper_grads = self.hypergrad.grad( 57 | loss_train=train_loss, 58 | loss_val=val_loss, 59 | params=parameters, 60 | aux_params=aux_params, 61 | ) 62 | 63 | for p, g in zip(aux_params, hyper_grads): 64 | if g is not None: 65 | p.grad = -g 66 | 67 | if self.max_grad_norm is not None: 68 | clip_grad_norm_(aux_params, max_norm=self.max_grad_norm) 69 | 70 | self.meta_optimizer.step() 71 | 72 | def zero_grad(self): 73 | self.meta_optimizer.zero_grad() 74 | 75 | 76 | class MELTR(nn.Module): 77 | def __init__(self, t_dim, f_dim, i_dim, h1_dim, h2_dim, o_dim): 78 | super(MELTR, self).__init__() 79 | self.task_embedding = nn.Embedding(t_dim, h2_dim) 80 | self.loss_fc1 
= nn.Linear(i_dim, h1_dim) 81 | self.activation1 = nn.ReLU() 82 | self.loss_fc2 = nn.Linear(h1_dim, h2_dim) 83 | 84 | self.encoder = nn.TransformerEncoderLayer(d_model=h2_dim, nhead=8, batch_first=True, dim_feedforward=f_dim) 85 | self.fc1 = nn.Linear(h2_dim, o_dim, bias=False) 86 | 87 | 88 | 89 | def forward(self, x): 90 | scale_embedding = self.loss_fc2(self.activation1(self.loss_fc1(x))) 91 | input = scale_embedding + self.task_embedding.weight 92 | output = self.encoder(input) 93 | output = self.fc1(output.mean(1)) 94 | return output -------------------------------------------------------------------------------- /allinone/AllInOne/datamodules/multitask_datamodule.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | from pytorch_lightning import LightningDataModule 4 | from torch.utils.data import DataLoader 5 | from torch.utils.data.dataset import ConcatDataset 6 | from torch.utils.data.distributed import DistributedSampler 7 | 8 | from . import _datamodules 9 | 10 | 11 | class MTDataModule(LightningDataModule): 12 | def __init__(self, _config, dist=False): 13 | datamodule_keys = _config["datasets"] 14 | assert len(datamodule_keys) > 0 15 | 16 | super().__init__() 17 | 18 | self.dm_keys = datamodule_keys 19 | self.dm_dicts = {key: _datamodules[key](_config) for key in datamodule_keys} 20 | self.dms = [v for k, v in self.dm_dicts.items()] 21 | 22 | self.batch_size = self.dms[0].batch_size 23 | self.vocab_size = self.dms[0].vocab_size 24 | self.num_workers = self.dms[0].num_workers 25 | 26 | self.dist = dist 27 | 28 | def prepare_data(self): 29 | for dm in self.dms: 30 | dm.prepare_data() 31 | 32 | def setup(self, stage): 33 | for dm in self.dms: 34 | dm.setup(stage) 35 | 36 | self.train_dataset = ConcatDataset([dm.train_dataset for dm in self.dms]) 37 | self.val_dataset = ConcatDataset([dm.val_dataset for dm in self.dms]) 38 | self.test_dataset = ConcatDataset([dm.test_dataset for dm in self.dms]) 39 | self.tokenizer = self.dms[0].tokenizer 40 | 41 | self.collate = functools.partial( 42 | self.dms[0].train_dataset.collate, mlm_collator=self.dms[0].mlm_collator, 43 | ) 44 | 45 | if self.dist: 46 | self.train_sampler = DistributedSampler(self.train_dataset, shuffle=True) 47 | self.val_sampler = DistributedSampler(self.val_dataset, shuffle=True) 48 | self.test_sampler = DistributedSampler(self.test_dataset, shuffle=False) 49 | else: 50 | self.train_sampler = None 51 | self.val_sampler = None 52 | self.test_sampler = None 53 | 54 | def train_dataloader(self): 55 | loader = DataLoader( 56 | self.train_dataset, 57 | batch_size=self.batch_size, 58 | sampler=self.train_sampler, 59 | num_workers=self.num_workers, 60 | collate_fn=self.collate, 61 | ) 62 | return loader 63 | 64 | def val_dataloader(self, batch_size=None): 65 | loader = DataLoader( 66 | self.val_dataset, 67 | batch_size=batch_size if batch_size is not None else self.batch_size, 68 | sampler=self.val_sampler, 69 | num_workers=self.num_workers, 70 | collate_fn=self.collate, 71 | ) 72 | return loader 73 | 74 | def test_dataloader(self): 75 | loader = DataLoader( 76 | self.test_dataset, 77 | batch_size=self.batch_size, 78 | sampler=self.test_sampler, 79 | num_workers=self.num_workers, 80 | collate_fn=self.collate, 81 | ) 82 | return loader 83 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/msrvtt_choice.py: -------------------------------------------------------------------------------- 1 | from 
.video_base_dataset import BaseDataset 2 | import os 3 | import pandas as pd 4 | 5 | 6 | class MSRVTTChoiceDataset(BaseDataset): 7 | def __init__(self, *args, split="", **kwargs): 8 | assert split in ["train", "val", "test"] 9 | self.split = split 10 | if self.split == "train": 11 | pass # no dedicated train split is provided; the multiple-choice test annotations are reused for zero-shot evaluation 12 | self.metadata = None 13 | self.ans_lab_dict = None 14 | if split == "train": 15 | names = ["msrvtt_choice_train"] 16 | elif split == "val": 17 | names = ["msrvtt_choice_val"] 18 | elif split == "test": 19 | names = ["msrvtt_choice_test"] # vqav2_test-dev for test-dev 20 | 21 | super().__init__( 22 | *args, 23 | **kwargs, 24 | names=names, 25 | text_column_name="unknown", 26 | remove_duplicate=False, 27 | ) 28 | self._load_metadata() 29 | 30 | def _load_metadata(self): 31 | metadata_dir = './meta_data/msrvtt' 32 | split_files = { 33 | 'train': 'msrvtt_mc_test.jsonl', # no train and test available, only for zero-shot 34 | 'val': 'msrvtt_mc_test.jsonl', 35 | 'test': 'msrvtt_mc_test.jsonl' 36 | } 37 | target_split_fp = split_files[self.split] 38 | metadata = pd.read_json(os.path.join(metadata_dir, target_split_fp), lines=True) 39 | self.metadata = metadata 40 | 41 | def _get_video_path(self, sample): 42 | return os.path.join(self.data_dir, 'videos', 'all', sample['clip_name'] + '.mp4'), sample['clip_name'] + '.mp4' 43 | 44 | def get_text(self, sample): 45 | texts = [] 46 | for text in sample['options']: 47 | encoding = self.tokenizer( 48 | text, 49 | padding="max_length", 50 | truncation=True, 51 | max_length=self.max_text_len, 52 | return_special_tokens_mask=True, 53 | ) 54 | texts.append((text, encoding)) 55 | return texts 56 | 57 | def get_answer_label(self, sample): 58 | answer = sample['answer'] 59 | return answer 60 | 61 | def __getitem__(self, index): 62 | sample = self.metadata.iloc[index] 63 | image_tensor = self.get_video(sample) 64 | # index, question_index = self.index_mapper[index] 65 | qid = index 66 | answer = self.get_answer_label(sample) 67 | ret = { 68 | "image": image_tensor, 69 | "img_index": index, 70 | "cap_index": index, 71 | "raw_index": index, 72 | 'answer': answer 73 | } 74 | texts = self.get_text(sample) 75 | ret["text"] = texts[0] 76 | # print(len(texts)) 77 | for i in range(self.draw_false_text - 1): 78 | ret.update({f"false_text_{i}": texts[i+1]}) 79 | # for i in range(self.draw_false_text-1): 80 | # ret[f"false_text_{i}"] = texts[i+1] 81 | # print(ret.keys()) 82 | return ret 83 | 84 | def __len__(self): 85 | return len(self.metadata) -------------------------------------------------------------------------------- /allinone/AllInOne/gadgets/my_metrics.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import torch 3 | from pytorch_lightning.metrics import Metric # from torchmetrics import Metric 4 | 5 | 6 | def order_class_index(order): 7 | """Return the index of the order in its full permutation. 8 | 9 | Args: 10 | order (tensor): e.g.
[0,1,2] 11 | """ 12 | classes = list(itertools.permutations(list(range(len(order))))) 13 | return classes.index(tuple(order.tolist())) 14 | 15 | 16 | class Accuracy(Metric): 17 | def __init__(self, dist_sync_on_step=False): 18 | super().__init__(dist_sync_on_step=dist_sync_on_step) 19 | self.add_state("correct", default=torch.tensor(0.0), dist_reduce_fx="sum") 20 | self.add_state("total", default=torch.tensor(0.0), dist_reduce_fx="sum") 21 | 22 | def update(self, logits, target, unfilterd=False): 23 | logits, target = ( 24 | logits.detach().to(self.correct.device), 25 | target.detach().to(self.correct.device), 26 | ) 27 | preds = logits.argmax(dim=-1) 28 | preds = preds[target != -100] 29 | unfilter_num = target.numel() 30 | target = target[target != -100] 31 | if target.numel() == 0: 32 | return 1 33 | 34 | assert preds.shape == target.shape 35 | 36 | self.correct += torch.sum(preds == target) 37 | if unfilterd: 38 | # print("no filter") 39 | self.total += unfilter_num 40 | else: 41 | self.total += target.numel() 42 | 43 | def compute(self): 44 | return self.correct / self.total 45 | 46 | 47 | class Scalar(Metric): 48 | def __init__(self, dist_sync_on_step=False): 49 | super().__init__(dist_sync_on_step=dist_sync_on_step) 50 | self.add_state("scalar", default=torch.tensor(0.0), dist_reduce_fx="sum") 51 | self.add_state("total", default=torch.tensor(0.0), dist_reduce_fx="sum") 52 | 53 | def update(self, scalar): 54 | if isinstance(scalar, torch.Tensor): 55 | scalar = scalar.detach().to(self.scalar.device) 56 | else: 57 | scalar = torch.tensor(scalar).float().to(self.scalar.device) 58 | self.scalar += scalar 59 | self.total += 1 60 | 61 | def compute(self): 62 | return self.scalar / self.total 63 | 64 | 65 | class VQAScore(Metric): 66 | def __init__(self, dist_sync_on_step=False): 67 | super().__init__(dist_sync_on_step=dist_sync_on_step) 68 | self.add_state("score", default=torch.tensor(0.0), dist_reduce_fx="sum") 69 | self.add_state("total", default=torch.tensor(0.0), dist_reduce_fx="sum") 70 | 71 | def update(self, logits, target): 72 | logits, target = ( 73 | logits.detach().float().to(self.score.device), 74 | target.detach().float().to(self.score.device), 75 | ) 76 | logits = torch.max(logits, 1)[1] 77 | one_hots = torch.zeros(*target.size()).to(target) 78 | one_hots.scatter_(1, logits.view(-1, 1), 1) 79 | scores = one_hots * target 80 | 81 | self.score += scores.sum() 82 | self.total += len(logits) 83 | 84 | def compute(self): 85 | return self.score / self.total 86 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/hmdb51_zero_shot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .video_base_dataset import BaseDataset 3 | import os 4 | 5 | 6 | class HMDB51Dataset(BaseDataset): 7 | def __init__(self, *args, split="", **kwargs): 8 | assert split in ["train", "val", "test"] 9 | self.split = split 10 | self.metadata = None 11 | self.ans_lab_dict = dict() 12 | if split == "train": 13 | names = ["hmdb51_train"] 14 | elif split == "val": 15 | names = ["hmdb51_val"] 16 | elif split == "test": 17 | names = ["hmdb51_test"] 18 | super().__init__( 19 | *args, 20 | **kwargs, 21 | names=names, 22 | text_column_name="questions", 23 | remove_duplicate=False, 24 | ) 25 | self._load_metadata() 26 | 27 | def _load_metadata(self): 28 | metadata_dir = './meta_data/hmdb51' 29 | split_files = { 30 | 'train': 'hmdb51_rgb_train_split_1.txt', 31 | 'val': 'hmdb51_rgb_val_split_1.txt', 
32 | 'test': 'hmdb51_rgb_val_split_1.txt' 33 | } 34 | target_split_fp = split_files[self.split] 35 | self.metadata = [x.strip().split(' ') for x in open(os.path.join(metadata_dir, target_split_fp))] 36 | answer_fp = os.path.join(metadata_dir, 'hmdb51_classInd.txt') 37 | with open(answer_fp, 'r') as f: 38 | lines = f.readlines() 39 | for line in lines: 40 | self.ans_lab_dict[str(int(line.strip().split(' ')[0]) - 1)] = line.strip().split(' ')[1] 41 | 42 | def _get_video_path(self, sample): 43 | # self.ans_lab_dict[sample[2]], 44 | return os.path.join(self.data_dir, sample[0].split('/')[-1]) + '.avi', sample[0].split('/')[-1] + '.avi' 45 | 46 | def get_text(self, sample): 47 | text = "A" 48 | encoding = self.tokenizer( 49 | text, 50 | padding="max_length", 51 | truncation=True, 52 | max_length=self.max_text_len, 53 | return_special_tokens_mask=True, 54 | ) 55 | return (text, encoding) 56 | 57 | def get_answer_label(self, sample): 58 | text = "None" 59 | ans_total_len = len(self.ans_lab_dict) + 1 # one additional class 60 | ans_label = int(sample[2]) 61 | scores = np.zeros(ans_total_len).astype(int) 62 | scores[ans_label] = 1 63 | return text, ans_label, scores 64 | # return text, ans_label_vector, scores 65 | 66 | def __getitem__(self, index): 67 | sample = self.metadata[index] # .split(' ') 68 | image_tensor = self.get_video(sample) 69 | text = self.get_text(sample) 70 | qid = index 71 | if self.split != "test": 72 | answers, labels, scores = self.get_answer_label(sample) 73 | else: 74 | answers = list() 75 | labels = list() 76 | scores = list() 77 | 78 | return { 79 | "image": image_tensor, 80 | "text": text, 81 | "vqa_answer": answers, 82 | "vqa_labels": labels, 83 | "vqa_scores": scores, 84 | "qid": qid, 85 | } 86 | 87 | def __len__(self): 88 | return len(self.metadata) -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/hmdb51.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .video_base_dataset import BaseDataset 3 | import os 4 | 5 | 6 | class HMDB51Dataset(BaseDataset): 7 | def __init__(self, *args, split="", **kwargs): 8 | assert split in ["train", "val", "test"] 9 | self.split = split 10 | self.metadata = None 11 | self.ans_lab_dict = dict() 12 | if split == "train": 13 | names = ["hmdb51_train"] 14 | elif split == "val": 15 | names = ["hmdb51_val"] 16 | elif split == "test": 17 | names = ["hmdb51_test"] 18 | super().__init__( 19 | *args, 20 | **kwargs, 21 | names=names, 22 | text_column_name="questions", 23 | remove_duplicate=False, 24 | ) 25 | self._load_metadata() 26 | 27 | def _load_metadata(self): 28 | metadata_dir = './meta_data/hmdb51' 29 | split_files = { 30 | 'train': 'hmdb51_rgb_train_split_1.txt', 31 | 'val': 'hmdb51_rgb_val_split_1.txt', 32 | 'test': 'hmdb51_rgb_val_split_1.txt' 33 | } 34 | target_split_fp = split_files[self.split] 35 | self.metadata = [x.strip().split(' ') for x in open(os.path.join(metadata_dir, target_split_fp))] 36 | answer_fp = os.path.join(metadata_dir, 'hmdb51_classInd.txt') 37 | with open(answer_fp, 'r') as f: 38 | lines = f.readlines() 39 | for line in lines: 40 | self.ans_lab_dict[str(int(line.strip().split(' ')[0]) - 1)] = line.strip().split(' ')[1] 41 | 42 | def _get_video_path(self, sample): 43 | # self.ans_lab_dict[sample[2]], 44 | return os.path.join(self.data_dir, sample[0].split('/')[-1]) + '.avi', sample[0].split('/')[-1] + '.avi' 45 | 46 | def get_text(self, sample): 47 | text = "A person is doing [MASK]" 48 | 
encoding = self.tokenizer( 49 | text, 50 | padding="max_length", 51 | truncation=True, 52 | max_length=self.max_text_len, 53 | return_special_tokens_mask=True, 54 | ) 55 | return (text, encoding) 56 | 57 | def get_answer_label(self, sample): 58 | text = "None" 59 | ans_total_len = len(self.ans_lab_dict) + 1 # one additional class 60 | ans_label = int(sample[2]) 61 | scores = np.zeros(ans_total_len).astype(int) 62 | scores[ans_label] = 1 63 | return text, ans_label, scores 64 | # return text, ans_label_vector, scores 65 | 66 | def __getitem__(self, index): 67 | sample = self.metadata[index] # .split(' ') 68 | image_tensor = self.get_video(sample) 69 | text = self.get_text(sample) 70 | qid = index 71 | if self.split != "test": 72 | answers, labels, scores = self.get_answer_label(sample) 73 | else: 74 | answers = list() 75 | labels = list() 76 | scores = list() 77 | 78 | return { 79 | "image": image_tensor, 80 | "text": text, 81 | "vqa_answer": answers, 82 | "vqa_labels": labels, 83 | "vqa_scores": scores, 84 | "qid": qid, 85 | } 86 | 87 | def __len__(self): 88 | return len(self.metadata) -------------------------------------------------------------------------------- /allinone/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import pytorch_lightning as pl 4 | import torch 5 | from AllInOne.config import ex 6 | from AllInOne.modules import AllinoneTransformerSS 7 | from AllInOne.datamodules.multitask_datamodule import MTDataModule 8 | import datetime 9 | import time 10 | 11 | @ex.automain 12 | def main(_config): 13 | _config = copy.deepcopy(_config) 14 | pl.seed_everything(_config["seed"]) 15 | 16 | dm = MTDataModule(_config, dist=True) 17 | model = AllinoneTransformerSS(_config) 18 | 19 | exp_name = f'{_config["exp_name"]}' 20 | 21 | os.makedirs(_config["log_dir"], exist_ok=True) 22 | checkpoint_callback = pl.callbacks.ModelCheckpoint( 23 | save_top_k=1, 24 | # every_n_epochs=_config["save_checkpoints_interval"], 25 | verbose=True, 26 | monitor="val/the_metric", 27 | mode="max", 28 | save_last=True, 29 | ) 30 | now = datetime.datetime.now() 31 | instance_name = f'{exp_name}_seed{_config["seed"]}_from_{_config["load_path"].split("/")[-1][:-5]}{now.year}_{now.month}_{now.day}' 32 | logger = pl.loggers.TensorBoardLogger( 33 | _config["log_dir"], 34 | name=instance_name, 35 | ) 36 | 37 | lr_callback = pl.callbacks.LearningRateMonitor(logging_interval="step") 38 | callbacks = [checkpoint_callback, lr_callback] 39 | 40 | num_gpus = ( 41 | _config["num_gpus"] 42 | if isinstance(_config["num_gpus"], int) 43 | else len(_config["num_gpus"]) 44 | ) 45 | # print all config at the begin 46 | print('='*70+'Config: '+'='*70) 47 | print(instance_name) 48 | print(_config) 49 | print('='*150) 50 | 51 | # notice _config["batch_size"] should be max length for all machines, eg. 
at least 1024 52 | grad_steps = _config["batch_size"] // ( 53 | _config["per_gpu_batchsize"] * num_gpus * _config["num_nodes"] 54 | ) 55 | 56 | max_steps = _config["max_steps"] if _config["max_steps"] is not None else None 57 | 58 | 59 | trainer = pl.Trainer( 60 | gpus=_config["num_gpus"], 61 | num_nodes=_config["num_nodes"], 62 | # precision=_config["precision"], 63 | accelerator="ddp", 64 | benchmark=True, 65 | deterministic=True, 66 | max_epochs=_config["max_epoch"] if max_steps is None else 1000, 67 | max_steps=max_steps, 68 | callbacks=callbacks, 69 | logger=logger, 70 | # prepare_data_per_node=False, 71 | replace_sampler_ddp=False, 72 | accumulate_grad_batches=grad_steps, 73 | log_every_n_steps=10, 74 | flush_logs_every_n_steps=10, 75 | resume_from_checkpoint=_config["resume_from"], 76 | weights_summary="top", 77 | fast_dev_run=_config["fast_dev_run"], 78 | val_check_interval=_config["val_check_interval"], 79 | automatic_optimization=False 80 | # num_sanity_val_steps=0, # 처음 sanity check 81 | 82 | # gradient_clip_val = 0.1 83 | 84 | 85 | # plugins=[DDPPlugin(find_unused_parameters=True)] 86 | # show_progress_bar=False, 87 | # progress_bar_refresh_rate=0 88 | ) 89 | 90 | print("accumulate grad batches is: ", trainer.accumulate_grad_batches) 91 | 92 | if not _config["test_only"]: 93 | trainer.fit(model, datamodule=dm) 94 | else: 95 | trainer.test(model, datamodule=dm) 96 | -------------------------------------------------------------------------------- /violet/model.py: -------------------------------------------------------------------------------- 1 | 2 | from lib import * 3 | from video_swin import SwinTransformer3D 4 | 5 | class EncImg(T.nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | self.swin = SwinTransformer3D() 10 | self.swin.load_state_dict(T.load('./_snapshot/ckpt_video-swin.pt', map_location='cpu')) 11 | 12 | self.emb_cls = T.nn.Parameter(0.02*T.randn(1, 1, 1, 768)) 13 | self.emb_pos = T.nn.Parameter(0.02*T.randn(1, 1, 1+14**2, 768)) 14 | self.emb_len = T.nn.Parameter(0.02*T.randn(1, 6, 1, 768)) 15 | self.norm = T.nn.LayerNorm(768) 16 | 17 | def forward(self, img): 18 | _B, _T, _C, _H, _W = img.shape 19 | _h, _w = _H//32, _W//32 20 | 21 | img = TV.transforms.Normalize([0.485, 0.456, 0.406], 22 | [0.229, 0.224, 0.225])(img) 23 | 24 | f_img = self.swin(img.transpose(1, 2)).transpose(1, 2) 25 | 26 | f_img = f_img.permute(0, 1, 3, 4, 2).view([_B, _T, _h*_w, 768]) 27 | f_img = T.cat([self.emb_cls.expand([_B, _T, -1, -1]), f_img], dim=2) 28 | f_img += self.emb_pos.expand([_B, _T, -1, -1])[:, :, :1+_h*_w, :]+self.emb_len.expand([_B, -1, 1+_h*_w, -1])[:, :_T, :, :] 29 | f_img = self.norm(f_img).view([_B, _T*(1+_h*_w), -1]) 30 | 31 | m_img = T.ones(1+_h*_w).long().cuda().unsqueeze(0).unsqueeze(0) 32 | m_img = m_img.expand([_B, _T, -1]).contiguous().view([_B, _T*(1+_h*_w)]) 33 | 34 | return f_img, m_img 35 | 36 | class EncTxt(T.nn.Module): 37 | def __init__(self): 38 | super().__init__() 39 | 40 | bert = transformers.BertModel.from_pretrained('bert-base-uncased') 41 | self.emb_txt = bert.embeddings 42 | 43 | def forward(self, txt): 44 | f_txt = self.emb_txt(txt) 45 | 46 | return f_txt 47 | 48 | class VIOLET_Base(T.nn.Module): 49 | def __init__(self): 50 | super().__init__() 51 | 52 | self.enc_img, self.enc_txt = EncImg(), EncTxt() 53 | bert = transformers.BertForMaskedLM.from_pretrained('bert-base-uncased') 54 | self.mask_ext, self.trsfr = bert.get_extended_attention_mask, bert.bert.encoder 55 | 56 | def go_feat(self, img, txt, mask): 57 | feat_img, mask_img = 
self.enc_img(img) 58 | feat_txt, mask_txt = self.enc_txt(txt), mask 59 | return feat_img, mask_img, feat_txt, mask_txt 60 | 61 | def go_cross(self, feat_img, mask_img, feat_txt, mask_txt): 62 | feat, mask = T.cat([feat_img, feat_txt], dim=1), T.cat([mask_img, mask_txt], dim=1) 63 | mask = self.mask_ext(mask, mask.shape, mask.device) 64 | out = self.trsfr(feat, mask, output_attentions=True) 65 | return out['last_hidden_state'], out['attentions'] 66 | 67 | 68 | def load_ckpt(self, ckpt): 69 | if ckpt=='': 70 | print('===== Init VIOLET =====') 71 | return 72 | 73 | ckpt_new, ckpt_old = T.load(ckpt, map_location='cpu'), self.state_dict() 74 | key_old = set(ckpt_old.keys()) 75 | for k in ckpt_new: 76 | if k in ckpt_old and ckpt_new[k].shape==ckpt_old[k].shape: 77 | ckpt_old[k] = ckpt_new[k] 78 | key_old.remove(k) 79 | self.load_state_dict(ckpt_old) 80 | print('===== Not Load:', key_old, '=====') 81 | 82 | -------------------------------------------------------------------------------- /allinone/AllInOne/transforms/functional.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import torch 3 | import cv2 4 | import numpy as np 5 | import PIL 6 | 7 | 8 | def _is_tensor_clip(clip): 9 | return torch.is_tensor(clip) and clip.ndimension() == 4 10 | 11 | 12 | def crop_clip(clip, min_h, min_w, h, w): 13 | if isinstance(clip[0], np.ndarray): 14 | cropped = [img[min_h:min_h + h, min_w:min_w + w, :] for img in clip] 15 | 16 | elif isinstance(clip[0], PIL.Image.Image): 17 | cropped = [ 18 | img.crop((min_w, min_h, min_w + w, min_h + h)) for img in clip 19 | ] 20 | else: 21 | raise TypeError('Expected numpy.ndarray or PIL.Image ' + 22 | 'but got list of {0}'.format(type(clip[0]))) 23 | return cropped 24 | 25 | 26 | def resize_clip(clip, size, interpolation='bilinear'): 27 | if isinstance(clip[0], np.ndarray): 28 | if isinstance(size, numbers.Number): 29 | im_h, im_w, im_c = clip[0].shape 30 | # Min spatial dim already matches minimal size 31 | if (im_w <= im_h and im_w == size) or (im_h <= im_w 32 | and im_h == size): 33 | return clip 34 | new_h, new_w = get_resize_sizes(im_h, im_w, size) 35 | size = (new_w, new_h) 36 | else: 37 | size = size[1], size[0] 38 | if interpolation == 'bilinear': 39 | np_inter = cv2.INTER_LINEAR 40 | else: 41 | np_inter = cv2.INTER_NEAREST 42 | scaled = [ 43 | cv2.resize(img, size, interpolation=np_inter) for img in clip 44 | ] 45 | elif isinstance(clip[0], PIL.Image.Image): 46 | if isinstance(size, numbers.Number): 47 | im_w, im_h = clip[0].size 48 | # Min spatial dim already matches minimal size 49 | if (im_w <= im_h and im_w == size) or (im_h <= im_w 50 | and im_h == size): 51 | return clip 52 | new_h, new_w = get_resize_sizes(im_h, im_w, size) 53 | size = (new_w, new_h) 54 | else: 55 | size = size[1], size[0] 56 | if interpolation == 'bilinear': 57 | pil_inter = PIL.Image.BILINEAR 58 | else: 59 | pil_inter = PIL.Image.NEAREST 60 | scaled = [img.resize(size, pil_inter) for img in clip] 61 | else: 62 | raise TypeError('Expected numpy.ndarray or PIL.Image ' + 63 | 'but got list of {0}'.format(type(clip[0]))) 64 | return scaled 65 | 66 | 67 | def get_resize_sizes(im_h, im_w, size): 68 | if im_w < im_h: 69 | ow = size 70 | oh = int(size * im_h / im_w) 71 | else: 72 | oh = size 73 | ow = int(size * im_w / im_h) 74 | return oh, ow 75 | 76 | 77 | def normalize(clip, mean, std, inplace=False): 78 | if not _is_tensor_clip(clip): 79 | raise TypeError('tensor is not a torch clip.') 80 | 81 | if not inplace: 82 | clip =
clip.clone() 83 | 84 | dtype = clip.dtype 85 | dim = len(mean) 86 | mean = torch.as_tensor(mean, dtype=dtype, device=clip.device) 87 | std = torch.as_tensor(std, dtype=dtype, device=clip.device) 88 | # print(clip_test.size()) 89 | # if dim == 3: 90 | clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None]) 91 | # else: 92 | # clip_test.sub_(mean[:, None, None]).div_(std[:, None, None]) 93 | return clip -------------------------------------------------------------------------------- /univl/dataloaders/README.md: -------------------------------------------------------------------------------- 1 | Data loaders for pretrain and downstream tasks (retrieval and caption). 2 | 3 | ## Preprocess on HowTo100M 4 | 5 | For pretraining, you need to prepare three parts: 6 | 7 | ### 1. s3d features pretrained on HowTo100M 8 | 9 | Download raw videos from the [HowTo100M webpage](https://www.di.ens.fr/willow/research/howto100m/) and extract [s3d (howto100m)](https://github.com/antoine77340/S3D_HowTo100M) features. You can refer to [VideoFeatureExtractor](https://github.com/ArrowLuo/VideoFeatureExtractor). 10 | 11 | ### 2. HowTo100M.csv 12 | Note: this file is different from the HowTo100M_v1.csv described in [README.txt](https://www.rocq.inria.fr/cluster-willow/amiech/howto100m/README.txt). 13 | 14 | The csv contains two columns. The first column is the video id, and the second is the feature file (the sub-path of the npy file, which is appended to `--features_path` (refer to the pretrain part of [README](../README.md)) to locate the npy file when reading). 15 | 16 | ``` 17 | video_id,feature_file 18 | Z8xhli297v8,Z8xhli297v8.npy 19 | ... 20 | ``` 21 | video_id: used to match the caption or transcript 22 | feature_file: used to find the feature file after joining with `--features_path` 23 | 24 | ### 3. caption.pickle 25 | This pickle file is generated from raw_caption.json in raw_caption.zip, introduced in [README.txt](https://www.rocq.inria.fr/cluster-willow/amiech/howto100m/README.txt). 26 | 27 | The format of this file is: 28 | ``` 29 | { 30 | 'video_id 1':{ 31 | 'start': array([0.08, 7.37, 15.05, ...], dtype=object), 32 | 'end': array([9.96, 16.98, 27.9, ...], dtype=object), 33 | 'text': array(['sentence 1 placeholder', 34 | 'sentence 2 placeholder', 35 | 'sentence 3 placeholder', ...], dtype=object) 36 | }, 37 | ... 38 | } 39 | ``` 40 | Keep `start` as a sorted array. 41 | 42 | 43 | ## Preprocess on YouCookII 44 | The s3d feature extraction is the same as for HowTo100M introduced above. 45 | 46 | ## Generate youcookii_data.pickle 47 | This file is generated from `youcookii_annotations_trainval.json`, which can be downloaded from the [official webpage](http://youcook2.eecs.umich.edu/download). 48 | 49 | The format of this file is similar to `caption.pickle` introduced above, but with one more key, `transcript`. The `transcript` needs to be generated from speech by an external ASR tool: 50 | ``` 51 | { 52 | 'video_id 1':{ 53 | 'start': array([0.08, 7.37, 15.05, ...], dtype=object), 54 | 'end': array([9.96, 16.98, 27.9, ...], dtype=object), 55 | 'text': array(['sentence 1 placeholder', 56 | 'sentence 2 placeholder', 57 | 'sentence 3 placeholder', ...], dtype=object), 58 | 'transcript': array(['transcript 1 placeholder', 59 | 'transcript 2 placeholder', 60 | 'transcript 3 placeholder', ...], dtype=object) 61 | }, 62 | ...
63 | } 64 | ``` 65 | If you want to test on retrieval or caption w/o transcript tasks, you can set `transcript` with `array(['NONE', 'NONE', 'NONE', ...], dtype=object)`. 66 | 67 | ## Format of csv 68 | ``` 69 | video_id,feature_file 70 | Z8xhli297v8,Z8xhli297v8 71 | ... 72 | ``` 73 | Note: The video_id and feature_file are the same for the consistency and our historical compatibility. We use feature_file to get the feature from feature pickle. 74 | 75 | ## Preprocess on MSRVTT 76 | The s3d feature extraction is the same as HowTo100M introduced above. 77 | The data can be downloaded in: https://github.com/microsoft/UniVL/releases/download/v0/msrvtt.zip 78 | -------------------------------------------------------------------------------- /allinone/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.13.0 2 | addict==2.4.0 3 | aiohttp==3.8.1 4 | aiosignal==1.2.0 5 | apex==0.1 6 | appdirs==1.4.4 7 | async-timeout==4.0.2 8 | attrdict==2.0.1 9 | attrs==21.2.0 10 | av==8.0.3 11 | backcall==0.2.0 12 | black==19.3b0 13 | bravado==11.0.3 14 | bravado-core==5.17.0 15 | cachetools==4.2.2 16 | certifi==2021.5.30 17 | chardet==3.0.4 18 | charset-normalizer==2.0.4 19 | click==8.0.1 20 | colorama==0.4.4 21 | cycler==0.10.0 22 | Cython==0.29.28 23 | decorator==5.0.9 24 | decord==0.6.0 25 | demoji==1.1.0 26 | dlib==19.22.1 27 | docopt==0.6.2 28 | dominate==2.6.0 29 | editdistance==0.6.0 30 | einops==0.3.0 31 | ffmpeg-python==0.2.0 32 | filelock==3.0.12 33 | flake8==3.9.2 34 | frozenlist==1.3.0 35 | fsspec==2022.2.0 36 | ftfy==6.1.1 37 | future==0.18.2 38 | fvcore==0.1.5.post20220305 39 | gensim==3.4.0 40 | gitdb==4.0.7 41 | GitPython==3.1.20 42 | google-auth==2.0.2 43 | google-auth-oauthlib==0.4.6 44 | googletrans==3.0.0 45 | grpcio==1.39.0 46 | h11==0.9.0 47 | h2==3.2.0 48 | hpack==3.0.0 49 | hstspreload==2021.9.1 50 | httpcore==0.9.1 51 | httpx==0.13.3 52 | huggingface-hub==0.0.16 53 | humanize==3.11.0 54 | hyperframe==5.2.0 55 | idna==2.10 56 | imageio==2.9.0 57 | iopath==0.1.9 58 | ipdb==0.13.4 59 | ipython==7.27.0 60 | isort==5.9.3 61 | jedi==0.18.0 62 | joblib==1.0.1 63 | jsonpickle==1.5.2 64 | jsonpointer==2.1 65 | jsonref==0.2 66 | jsonschema==3.2.0 67 | kiwisolver==1.3.2 68 | llvmlite==0.38.0 69 | lmdb==1.2.1 70 | Markdown==3.3.4 71 | matplotlib==3.4.3 72 | matplotlib-inline==0.1.2 73 | mccabe==0.6.1 74 | monotonic==1.6 75 | msgpack==1.0.2 76 | multidict==6.0.2 77 | munch==2.5.0 78 | neptune-client==0.10.8 79 | neptune-contrib==0.27.3 80 | networkx==2.6.2 81 | nltk==3.6.2 82 | numba==0.55.1 83 | numpy==1.19.5 84 | oauthlib==3.1.1 85 | opencv-python==4.4.0.46 86 | packaging==21.0 87 | pandas==1.1.5 88 | parameterized==0.8.1 89 | parso==0.8.2 90 | pexpect==4.8.0 91 | pickleshare==0.7.5 92 | Pillow==8.2.0 93 | pipreqs==0.4.11 94 | portalocker==2.4.0 95 | prompt-toolkit==3.0.20 96 | protobuf==3.17.3 97 | psutil==5.8.0 98 | ptyprocess==0.7.0 99 | py-cpuinfo==8.0.0 100 | pyarrow==2.0.0 101 | pyasn1==0.4.8 102 | pyasn1-modules==0.2.8 103 | pycodestyle==2.7.0 104 | pyflakes==2.3.1 105 | Pygments==2.10.0 106 | PyJWT==2.1.0 107 | pyparsing==2.4.7 108 | pyrsistent==0.18.0 109 | python-dateutil==2.8.2 110 | pytorch-lightning==1.1.4 111 | pytorchvideo==0.1.5 112 | pytz==2021.1 113 | PyWavelets==1.1.1 114 | PyYAML==5.4.1 115 | regex==2021.8.28 116 | requests==2.26.0 117 | requests-oauthlib==1.3.0 118 | rfc3986==1.5.0 119 | rfc3987==1.3.8 120 | rsa==4.7.2 121 | sacred==0.8.2 122 | sacremoses==0.0.45 123 | scikit-image==0.18.3 124 | scikit-learn==0.24.2 
125 | scipy==1.7.1 126 | simplejson==3.17.5 127 | six==1.16.0 128 | sklearn==0.0 129 | smart-open==5.2.1 130 | smmap==4.0.0 131 | sniffio==1.2.0 132 | strict-rfc3339==0.7 133 | swagger-spec-validator==2.7.3 134 | tabulate==0.8.9 135 | tb-nightly==2.7.0a20210905 136 | tensorboard==2.8.0 137 | tensorboard-data-server==0.6.1 138 | tensorboard-plugin-wit==1.8.0 139 | termcolor==1.1.0 140 | textaugment==1.3.4 141 | textblob==0.15.3 142 | threadpoolctl==2.2.0 143 | tifffile==2021.8.30 144 | timm==0.4.5 145 | tokenizers==0.9.4 146 | toml==0.10.2 147 | torch==1.8.0 148 | torchaudio==0.8.0 149 | torchvision==0.9.0 150 | tqdm==4.56.0 151 | traitlets==5.1.0 152 | transformers==4.2.1 153 | tslearn==0.5.2 154 | typing-extensions==3.10.0.2 155 | urllib3==1.26.6 156 | wcwidth==0.2.5 157 | webcolors==1.11.1 158 | websocket-client==1.2.1 159 | Werkzeug==2.0.1 160 | wrapt==1.12.1 161 | yacs==0.1.8 162 | yapf==0.31.0 163 | yarg==0.1.9 164 | yarl==1.7.2 165 | Footer 166 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/ego4d_choice.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset, read_large_frames_decord, get_video_len 2 | import os 3 | import pandas as pd 4 | 5 | 6 | class EGO4DChoiceDataset(BaseDataset): 7 | def __init__(self, *args, split="", **kwargs): 8 | assert split in ["train", "val", "test"] 9 | self.split = split 10 | if self.split == "train": 11 | Exception("no train data provided") 12 | self.metadata = None 13 | self.ans_lab_dict = None 14 | if split == "train": 15 | names = ["ego4d_choice_train"] 16 | elif split == "val": 17 | names = ["ego4d_choice_val"] 18 | elif split == "test": 19 | names = ["ego4d_choice_test"] # vqav2_test-dev for test-dev 20 | 21 | super().__init__( 22 | *args, 23 | **kwargs, 24 | names=names, 25 | text_column_name="unknown", 26 | remove_duplicate=False, 27 | ) 28 | self._load_metadata() 29 | 30 | def _load_metadata(self): 31 | metadata_dir = './meta_data/ego4d' 32 | split_files = { 33 | 'train': 'mc_val.csv', # no train and test available, only for zero-shot testing 34 | 'val': 'mc_val.csv', 35 | 'test': 'mc_val.csv' 36 | } 37 | target_split_fp = split_files[self.split] 38 | self.metadata = pd.read_csv(os.path.join(metadata_dir, target_split_fp), sep=',', header=0, error_bad_lines=False) 39 | 40 | def _get_video_path(self, sample): 41 | rel_video_fp = eval(sample["question"])[0] + '.mp4' 42 | full_video_fp = os.path.join(self.data_dir, 'videos', rel_video_fp) 43 | if not os.path.exists(full_video_fp): 44 | Exception(IOError) 45 | return full_video_fp, rel_video_fp 46 | 47 | def get_raw_video(self, sample): 48 | abs_fp, rel_fp = self._get_video_path(sample) 49 | frame_loc = eval(sample["question"])[1] 50 | frame_end = get_video_len(abs_fp) 51 | imgs = read_large_frames_decord(abs_fp, frame_loc, frame_end, self.num_frames, mode=self.split) 52 | if imgs is None: 53 | raise Exception("Invalid video!", rel_fp) 54 | else: 55 | return imgs 56 | 57 | def get_text(self, sample): 58 | texts = [] 59 | for answer in eval(sample["answers"]): 60 | text = answer[-1] 61 | encoding = self.tokenizer( 62 | text, 63 | padding="max_length", 64 | truncation=True, 65 | max_length=self.max_text_len, 66 | return_special_tokens_mask=True, 67 | ) 68 | texts.append((text, encoding)) 69 | return texts 70 | 71 | def get_answer_label(self, sample): 72 | gt_text = eval(sample["question"])[-1] 73 | answer_label = 0 74 | for index, answer in 
enumerate(eval(sample["answers"])): 75 | if answer[-1] == gt_text: 76 | answer_label = index 77 | return answer_label 78 | 79 | def __getitem__(self, index): 80 | sample = self.metadata.iloc[index] 81 | # print(sample) 82 | image_tensor = self.get_video(sample) 83 | # index, question_index = self.index_mapper[index] 84 | qid = index 85 | answer = self.get_answer_label(sample) 86 | ret = { 87 | "image": image_tensor, 88 | "img_index": index, 89 | "cap_index": index, 90 | "raw_index": index, 91 | 'answer': answer 92 | } 93 | texts = self.get_text(sample) 94 | ret["text"] = texts[0] 95 | # print(len(texts)) 96 | for i in range(self.draw_false_text - 1): 97 | ret.update({f"false_text_{i}": texts[i+1]}) 98 | return ret 99 | 100 | def __len__(self): 101 | return len(self.metadata) -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/msrvttqa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .video_base_dataset import BaseDataset 3 | import os 4 | import json 5 | import pandas as pd 6 | 7 | 8 | class MSRVTTQADataset(BaseDataset): 9 | def __init__(self, *args, split="", **kwargs): 10 | assert split in ["train", "val", "test"] 11 | # if split == "test": 12 | # split = "val" 13 | self.split = split 14 | self.metadata = None 15 | self.ans_lab_dict = None 16 | if split == "train": 17 | names = ["msrvtt_qa_train"] 18 | # names = ["msrvtt_qa_train", "msrvtt_qa_val"] 19 | elif split == "val": 20 | names = ["msrvtt_qa_test"] # ["msrvtt_qa_val"] 21 | elif split == "test": 22 | names = ["msrvtt_qa_test"] # vqav2_test-dev for test-dev 23 | 24 | super().__init__( 25 | *args, 26 | **kwargs, 27 | names=names, 28 | text_column_name="questions", 29 | remove_duplicate=False, 30 | ) 31 | self.names = names 32 | # self.num_frames = 4 33 | self._load_metadata() 34 | 35 | def _load_metadata(self): 36 | metadata_dir = './meta_data/msrvtt' 37 | split_files = { 38 | 'train': 'msrvtt_qa_train.jsonl', 39 | 'val': 'msrvtt_qa_val.jsonl', 40 | 'test': 'msrvtt_qa_test.jsonl' 41 | } 42 | answer_fp = os.path.join(metadata_dir, 'msrvtt_train_ans2label.json') # 1500 in total (all classes in train) 43 | # answer_fp = os.path.join(metadata_dir, 'msrvtt_qa_ans2label.json') # 4539 in total (all classes in train+val+test) 44 | with open(answer_fp, 'r') as JSON: 45 | self.ans_lab_dict = json.load(JSON) 46 | for name in self.names: 47 | split = name.split('_')[-1] 48 | target_split_fp = split_files[split] 49 | # path_or_buf=os.path.join(metadata_dir, target_split_fp) 50 | metadata = pd.read_json(os.path.join(metadata_dir, target_split_fp), lines=True) 51 | if self.metadata is None: 52 | self.metadata = metadata 53 | else: 54 | self.metadata.update(metadata) 55 | print("total {} samples for {}".format(len(self.metadata), self.names)) 56 | # data1.update(data2) 57 | 58 | def get_text(self, sample): 59 | text = sample['question'] 60 | encoding = self.tokenizer( 61 | text, 62 | padding="max_length", 63 | truncation=True, 64 | max_length=self.max_text_len, 65 | return_special_tokens_mask=True, 66 | ) 67 | return (text, encoding) 68 | 69 | def get_answer_label(self, sample): 70 | text = sample['answer'] 71 | ans_total_len = len(self.ans_lab_dict) + 1 # one additional class 72 | try: 73 | ans_label = self.ans_lab_dict[text] # 74 | except KeyError: 75 | ans_label = -100 # ignore classes 76 | # ans_label = 1500 # other classes 77 | scores = np.zeros(ans_total_len).astype(int) 78 | scores[ans_label] = 1 79 | return text, 
ans_label, scores 80 | # return text, ans_label_vector, scores 81 | 82 | def __getitem__(self, index): 83 | sample = self.metadata.iloc[index] 84 | image_tensor = self.get_video(sample) 85 | text = self.get_text(sample) 86 | # index, question_index = self.index_mapper[index] 87 | qid = index 88 | if self.split != "test": 89 | answers, labels, scores = self.get_answer_label(sample) 90 | else: 91 | answers = list() 92 | labels = list() 93 | scores = list() 94 | 95 | return { 96 | "image": image_tensor, 97 | "text": text, 98 | "vqa_answer": answers, 99 | "vqa_labels": labels, 100 | "vqa_scores": scores, 101 | "qid": qid, 102 | } 103 | 104 | def __len__(self): 105 | return len(self.metadata) -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/msvdqa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .video_base_dataset import BaseDataset 3 | import os 4 | import pandas as pd 5 | import json 6 | 7 | 8 | class MSVDQADataset(BaseDataset): 9 | def __init__(self, *args, split="", **kwargs): 10 | assert split in ["train", "val", "test"] 11 | self.split = split 12 | self.metadata = None 13 | self.ans_lab_dict = None 14 | if split == "train": 15 | names = ["msvd_qa_train"] 16 | elif split == "val": 17 | names = ["msvd_qa_test"] 18 | elif split == "test": 19 | names = ["msvd_qa_test"] 20 | 21 | super().__init__( 22 | *args, 23 | **kwargs, 24 | names=names, 25 | text_column_name="questions", 26 | remove_duplicate=False, 27 | ) 28 | self._load_metadata() 29 | 30 | def _load_metadata(self): 31 | metadata_dir = '../../ICLR2023/VideoQA//meta_data/msvd' 32 | split_files = {'train': 'msvd_train.jsonl', 'val': 'msvd_val.jsonl', 'test': 'msvd_test.jsonl'} 33 | # split_files = {'train': 'what_train.jsonl', 'val': 'what_test.jsonl', 'test': 'what_test.jsonl'} 34 | 35 | self.ans_lab_dict = {} 36 | answer_fp = os.path.join(metadata_dir, 'msvd_answer_set.txt') 37 | self.youtube_mapping_dict = dict() 38 | with open(os.path.join(metadata_dir, 'msvd_youtube_mapping.txt')) as f: 39 | lines = f.readlines() 40 | for line in lines: 41 | info = line.strip().split(' ') 42 | self.youtube_mapping_dict[info[1]] = info[0] 43 | with open(answer_fp, 'r') as f: 44 | lines = f.readlines() 45 | count = 0 46 | for line in lines: 47 | self.ans_lab_dict[str(line.strip())] = count 48 | count += 1 49 | 50 | split = self.names[0].split('_')[-1] 51 | target_split_fp = split_files[split] 52 | self.metadata = pd.read_json(os.path.join(metadata_dir, target_split_fp), lines=True) 53 | # for i, k in enumerate(list(self.ans_lab_dict.keys())): 54 | # if i == 250: 55 | # break 56 | # self.metadata = self.metadata[self.metadata['answer'] != k] 57 | 58 | 59 | print("total {} samples for {}".format(len(self.metadata), self.names)) 60 | 61 | def _get_video_path(self, sample): 62 | rel_video_fp = self.youtube_mapping_dict['vid' + str(sample["video_id"])] + '.avi' 63 | full_video_fp = os.path.join(self.data_dir, 'videos', rel_video_fp) 64 | return full_video_fp, rel_video_fp 65 | 66 | def get_text(self, sample): 67 | text = sample['question'] 68 | encoding = self.tokenizer( 69 | text, 70 | padding="max_length", 71 | truncation=True, 72 | max_length=self.max_text_len, 73 | return_special_tokens_mask=True, 74 | ) 75 | return (text, encoding) 76 | 77 | def get_answer_label(self, sample): 78 | text = sample['answer'] 79 | ans_total_len = len(self.ans_lab_dict) + 1 80 | # ans_total_len = len(self.ans_lab_dict) 81 | try: 82 | ans_label = 
self.ans_lab_dict[text] 83 | except KeyError: 84 | ans_label = -100 85 | scores = np.zeros(ans_total_len).astype(int) 86 | scores[ans_label] = 1 87 | return text, ans_label, scores 88 | 89 | def __getitem__(self, index): 90 | sample = self.metadata.iloc[index] 91 | image_tensor = self.get_video(sample) 92 | text = self.get_text(sample) 93 | qid = index 94 | 95 | answers, labels, scores = self.get_answer_label(sample) 96 | 97 | 98 | return { 99 | "image": image_tensor, 100 | "text": text, 101 | "vqa_answer": answers, 102 | "vqa_labels": labels, 103 | "vqa_scores": scores, 104 | "qid": qid, 105 | } 106 | 107 | def __len__(self): 108 | return len(self.metadata) -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/tgif.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .video_base_dataset import BaseDataset, read_frames_gif 3 | import os 4 | import json 5 | import pandas as pd 6 | 7 | # 2022.1.28 read gif is too slow, may be need to speedup by convert gif -> video 8 | # https://stackify.dev/833655-python-convert-gif-to-videomp4 9 | 10 | 11 | class TGIFDataset(BaseDataset): 12 | def __init__(self, *args, split="", **kwargs): 13 | assert split in ["train", "val", "test"] 14 | self.split = split 15 | self.metadata = None 16 | self.ans_lab_dict = None 17 | if split == "train": 18 | names = ["tgif_train"] 19 | elif split == "val": 20 | names = ["tgif_val"] 21 | elif split == "test": 22 | names = ["tgif_test"] 23 | 24 | super().__init__( 25 | *args, 26 | **kwargs, 27 | names=names, 28 | text_column_name="questions", 29 | remove_duplicate=False, 30 | ) 31 | # self.num_frames = 4 32 | self._load_metadata() 33 | 34 | def _load_metadata(self): 35 | metadata_dir = './meta_data/tgif' 36 | split_files = { 37 | 'train': 'frameqa_train.jsonl', 38 | 'val': 'frameqa_test.jsonl', # frameqa_val.jsonl 39 | 'test': 'frameqa_test.jsonl' 40 | } 41 | target_split_fp = split_files[self.split] 42 | answer_fp = os.path.join(metadata_dir, 'frameqa_trainval_ans2label.json') 43 | # answer_fp = os.path.join(metadata_dir, 'msrvtt_qa_ans2label.json') 44 | with open(answer_fp, 'r') as JSON: 45 | self.ans_lab_dict = json.load(JSON) 46 | # path_or_buf=os.path.join(metadata_dir, target_split_fp) 47 | metadata = pd.read_json(os.path.join(metadata_dir, target_split_fp), lines=True) 48 | self.metadata = metadata 49 | 50 | def _get_video_path(self, sample): 51 | return os.path.join(self.data_dir, 'gifs', sample['gif_name']) + '.gif', sample['gif_name'] + '.gif' 52 | 53 | def get_raw_video(self, sample): 54 | abs_fp, rel_fp = self._get_video_path(sample) 55 | imgs, idxs, vlen = read_frames_gif(abs_fp, self.num_frames, mode=self.split) 56 | if imgs is None: 57 | raise Exception("Invalid img!", rel_fp) 58 | else: 59 | return imgs 60 | 61 | def get_text(self, sample): 62 | text = sample['question'] 63 | encoding = self.tokenizer( 64 | text, 65 | padding="max_length", 66 | truncation=True, 67 | max_length=self.max_text_len, 68 | return_special_tokens_mask=True, 69 | ) 70 | return (text, encoding) 71 | 72 | def get_answer_label(self, sample): 73 | text = sample['answer'] 74 | ans_total_len = len(self.ans_lab_dict) + 1 # one additional class 75 | try: 76 | ans_label = self.ans_lab_dict[text] # 77 | except KeyError: 78 | ans_label = -100 # ignore classes 79 | # ans_label = 1500 # other classes 80 | scores = np.zeros(ans_total_len).astype(int) 81 | scores[ans_label] = 1 82 | return text, ans_label, scores 83 | # return 
text, ans_label_vector, scores 84 | 85 | def __getitem__(self, index): 86 | sample = self.metadata.iloc[index] 87 | image_tensor = self.get_video(sample) 88 | text = self.get_text(sample) 89 | # index, question_index = self.index_mapper[index] 90 | qid = index 91 | if self.split != "test": 92 | answers, labels, scores = self.get_answer_label(sample) 93 | else: 94 | answers = list() 95 | labels = list() 96 | scores = list() 97 | 98 | return { 99 | "image": image_tensor, 100 | "text": text, 101 | "vqa_answer": answers, 102 | "vqa_labels": labels, 103 | "vqa_scores": scores, 104 | "qid": qid, 105 | } 106 | 107 | def __len__(self): 108 | return len(self.metadata) -------------------------------------------------------------------------------- /univl/README.md: --------------------------------------------------------------------------------
1 | # UniVL + MELTR
2 | 
3 | ## Requirements
4 | 
5 | Our code is implemented under the [UniVL](https://github.com/microsoft/UniVL) environment.
6 | 
7 | ## Datasets
8 | 
9 | We use two datasets (MSRVTT and YouCook2). UniVL also provides the downstream datasets and annotation files [here](https://github.com/microsoft/UniVL/blob/main/dataloaders/README.md).
10 | The MSRVTT annotation files we used can be found [here](https://drive.google.com/drive/folders/1akmVjM6vcjlwuQj7oIN9T_Gtb0bvr5iV).
11 | 
12 | Note: As mentioned in UniVL, the transcripts are not publicly available due to legal issues.
13 | 
14 | 
15 | 
16 | ### YouCook2
17 | 
18 | ```
19 | mkdir -p data
20 | cd data
21 | wget https://github.com/microsoft/UniVL/releases/download/v0/youcookii.zip
22 | unzip youcookii.zip
23 | cd ..
24 | ```
25 | 
26 | ### MSRVTT
27 | 
28 | ```
29 | mkdir -p data
30 | cd data
31 | wget https://github.com/microsoft/UniVL/releases/download/v0/msrvtt.zip
32 | unzip msrvtt.zip
33 | cd ..
34 | ```
35 | 
36 | 
37 | ## Pretrained checkpoint
38 | 
39 | ```
40 | mkdir -p ./checkpoint
41 | wget -P ./checkpoint https://github.com/microsoft/UniVL/releases/download/v0/univl.pretrained.bin
42 | ```
43 | 
44 | ## Training & Evaluation
45 | 
46 | * Text-to-Video Retrieval on YouCook2
47 | 
48 | ```
49 | python main.py --expand_msrvtt_sentences --do_train --train_sim_after_cross \
50 | --init_model ./checkpoint/univl.pretrained.bin --output_dir ckpts/RY \
51 | --bert_model bert-base-uncased --do_lower_case --warmup warmup_linear_down \
52 | --eval_task retrieval --datatype youcook --youcook_train_csv /path/to/data/youcookii_train.csv \
53 | --youcook_val_csv /path/to/data/youcookii_val.csv \
54 | --youcook_features_path /path/to/data/youcookii_videos_features.pickle \
55 | --youcook_data_path /path/to/data/youcookii_data.transcript.pickle
56 | ```
57 | 
58 | * Text-to-Video Retrieval on MSRVTT-9K
59 | 
60 | ```
61 | python main.py --expand_msrvtt_sentences --do_train --train_sim_after_cross \
62 | --init_model ./checkpoint/univl.pretrained.bin --output_dir ckpts/R9K \
63 | --bert_model bert-base-uncased --do_lower_case --warmup warmup_linear_down \
64 | --eval_task retrieval --datatype msrvtt9K --VTT_train_csv /path/to/data/MSRVTT_train.9k.csv \
65 | --VTT_val_csv /path/to/data/MSRVTT_JSFUSION_test.csv \
66 | --VTT_data_path /path/to/data/MSRVTT_transcript_data_v2.json \
67 | --VTT_features_path /path/to/data/msrvtt_videos_features.pickle
68 | ```
69 | 
70 | * Text-to-Video Retrieval on MSRVTT-7K
71 | 
72 | ```
73 | python main.py --expand_msrvtt_sentences --do_train --train_sim_after_cross \
74 | --init_model ./checkpoint/univl.pretrained.bin --output_dir ckpts/R7K \
75 | --bert_model bert-base-uncased --do_lower_case --warmup warmup_linear_down \
76 | --eval_task retrieval --datatype msrvtt7K --VTT_train_csv /path/to/data/MSRVTT_train.9k.csv \
77 | --VTT_val_csv /path/to/data/MSRVTT_JSFUSION_test.csv \
78 | --VTT_data_path /path/to/data/MSRVTT_transcript_data_v2.json \
79 | --VTT_features_path /path/to/data/msrvtt_videos_features.pickle
80 | ```
81 | 
82 | * Captioning on YouCook2
83 | 
84 | ```
85 | python main.py --expand_msrvtt_sentences --do_train --train_sim_after_cross \
86 | --init_model ./checkpoint/univl.pretrained.bin --output_dir ckpts/CY1 \
87 | --bert_model bert-base-uncased --do_lower_case --warmup warmup_linear_down \
88 | --eval_task caption --datatype youcook --youcook_train_csv /path/to/data/youcookii_train.csv \
89 | --youcook_val_csv /path/to/data/youcookii_val.csv \
90 | --youcook_features_path /path/to/data/youcookii_videos_features.pickle \
91 | --youcook_data_path /path/to/data/youcookii_data.transcript.pickle
92 | ```
93 | 
94 | * Captioning on MSRVTT-Full
95 | 
96 | ```
97 | python main.py --expand_msrvtt_sentences --do_train --train_sim_after_cross \
98 | --init_model ./checkpoint/univl.pretrained.bin --output_dir ckpts/CF \
99 | --bert_model bert-base-uncased --do_lower_case --warmup warmup_linear_down \
100 | --eval_task caption --datatype msrvttFull --VTT_train_csv /path/to/data/MSRVTT_train.9k.csv \
101 | --VTT_val_csv /path/to/data/MSRVTT_JSFUSION_test.csv \
102 | --VTT_data_path /path/to/data/MSRVTT_transcript_data_v2.json \
103 | --VTT_features_path /path/to/data/msrvtt_videos_features.pickle
104 | ```
105 | 
106 | 
107 | 
108 | ## Acknowledgement
109 | 
110 | This repo is built upon [UniVL](https://github.com/microsoft/UniVL).
111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /univl/modules/beam.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manage beam search info structure. 3 | Heavily borrowed from OpenNMT-py. 4 | For code in OpenNMT-py, please check the following link (maybe in oldest version): 5 | https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/Beam.py 6 | """ 7 | 8 | import torch 9 | 10 | class Constants(): 11 | def __init__(self): 12 | self.PAD = 0 13 | self.UNK = 1 14 | self.BOS = 2 15 | self.EOS = 3 16 | self.PAD_WORD = '[PAD]' 17 | self.UNK_WORD = '[UNK]' 18 | self.BOS_WORD = '[CLS]' 19 | self.EOS_WORD = '[SEP]' 20 | 21 | @classmethod 22 | def from_tokenizer(cls, tokenizer): 23 | instance = cls() 24 | instance.PAD = tokenizer.vocab[instance.PAD_WORD] 25 | instance.UNK = tokenizer.vocab[instance.UNK_WORD] 26 | instance.BOS = tokenizer.vocab[instance.BOS_WORD] 27 | instance.EOS = tokenizer.vocab[instance.EOS_WORD] 28 | return instance 29 | 30 | class Beam(): 31 | ''' Beam search ''' 32 | 33 | def __init__(self, size, device=False, tokenizer=None): 34 | if tokenizer is None: 35 | self.constants = Constants() 36 | else: 37 | self.constants = Constants.from_tokenizer(tokenizer) 38 | 39 | self.size = size 40 | self._done = False 41 | # The score for each interface on the beam. 42 | self.scores = torch.zeros((size,), dtype=torch.float, device=device) 43 | self.all_scores = [] 44 | 45 | # The backpointers at each time-step. 46 | self.prev_ks = [] 47 | 48 | # The outputs at each time-step. 49 | self.next_ys = [torch.full((size,), self.constants.BOS, dtype=torch.long, device=device)] 50 | 51 | def get_current_state(self): 52 | "Get the outputs for the current timestep." 53 | return self.get_tentative_hypothesis() 54 | 55 | def get_current_origin(self): 56 | "Get the backpointers for the current timestep." 57 | return self.prev_ks[-1] 58 | 59 | @property 60 | def done(self): 61 | return self._done 62 | 63 | def advance(self, word_prob, word_length=None): 64 | 65 | "Update beam status and check if finished or not." 66 | num_words = word_prob.size(1) 67 | # Sum the previous scores. 68 | if len(self.prev_ks) > 0: 69 | beam_lk = word_prob + self.scores.unsqueeze(1).expand_as(word_prob) 70 | else: 71 | beam_lk = word_prob[0] 72 | flat_beam_lk = beam_lk.view(-1) 73 | best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, True) # 1st sort 74 | self.all_scores.append(self.scores) 75 | self.scores = best_scores 76 | # bestScoresId is flattened as a (beam x word) array, 77 | # so we need to calculate which word and beam each score came from 78 | prev_k = best_scores_id // num_words 79 | self.prev_ks.append(prev_k) 80 | self.next_ys.append(best_scores_id - prev_k * num_words) 81 | # End condition is when top-of-beam is EOS. 82 | if self.next_ys[-1][0].item() == self.constants.EOS: 83 | self._done = True 84 | 85 | return self._done 86 | 87 | def sort_scores(self): 88 | "Sort the scores." 89 | return torch.sort(self.scores, 0, True) 90 | 91 | def get_the_best_score_and_idx(self): 92 | "Get the score of the best in the beam." 93 | scores, ids = self.sort_scores() 94 | return scores[1], ids[1] 95 | 96 | def get_tentative_hypothesis(self): 97 | "Get the decoded sequence for the current timestep." 
98 | 99 | if len(self.next_ys) == 1: 100 | dec_seq = self.next_ys[0].unsqueeze(1) 101 | else: 102 | _, keys = self.sort_scores() 103 | hyps = [self.get_hypothesis(k) for k in keys] 104 | hyps = [[self.constants.BOS] + h for h in hyps] 105 | dec_seq = torch.LongTensor(hyps) 106 | 107 | return dec_seq 108 | 109 | def get_hypothesis(self, k): 110 | """ Walk back to construct the full hypothesis. """ 111 | hyp = [] 112 | for j in range(len(self.prev_ks) - 1, -1, -1): 113 | hyp.append(self.next_ys[j+1][k]) 114 | k = self.prev_ks[j][k] 115 | 116 | return list(map(lambda x: x.item(), hyp[::-1])) 117 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/lsmdc_choice.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset 2 | import os 3 | import pandas as pd 4 | from AllInOne.transforms.videoaug import VideoTransform 5 | import random 6 | 7 | 8 | class LSMDCChoiceDataset(BaseDataset): 9 | def __init__(self, *args, split="", **kwargs): 10 | assert split in ["train", "val", "test"] 11 | self.split = split 12 | self.metadata = None 13 | self.ans_lab_dict = None 14 | if split == "train": 15 | names = ["lsmdc_choice_train"] 16 | elif split == "val": 17 | names = ["lsmdc_choice_val"] 18 | elif split == "test": 19 | names = ["lsmdc_choice_test"] # vqav2_test-dev for test-dev 20 | 21 | super().__init__( 22 | *args, 23 | **kwargs, 24 | names=names, 25 | text_column_name="unknown", 26 | remove_duplicate=False, 27 | ) 28 | self._load_metadata() 29 | 30 | def _load_metadata(self): 31 | metadata_dir = './meta_data/lsmdc' 32 | split_files = { 33 | 'train': 'LSMDC16_multiple_choice_train.csv', 34 | 'val': 'LSMDC16_multiple_choice_test_randomized.csv', # 'LSMDC16_multiple_choice_valid.csv', 35 | 'test': 'LSMDC16_multiple_choice_test_randomized.csv' 36 | } 37 | target_split_fp = split_files[self.split] 38 | print(os.path.join(metadata_dir, target_split_fp)) 39 | metadata = pd.read_csv(os.path.join(metadata_dir, target_split_fp), sep='\t', header=None, error_bad_lines=False) 40 | self.metadata = metadata 41 | datalist = [] 42 | for raw_id in range(len(metadata)): 43 | raw_d = metadata.iloc[raw_id] 44 | video_fp = raw_d[0] 45 | sub_path = video_fp.split('.')[0] 46 | remove = sub_path.split('_')[-1] 47 | sub_path = sub_path.replace('_'+remove,'/') 48 | rel_video_fp = sub_path + video_fp + '.avi' 49 | options = [raw_d[idx] for idx in range(5, 10)] 50 | d = dict( 51 | id=video_fp, 52 | vid_id=rel_video_fp, 53 | answer=raw_d[10] - 1 if self.split in ['val', 'test'] else 0, 54 | options=options, 55 | ) 56 | datalist.append(d) 57 | self.metadata = datalist 58 | print("load split {}, {} samples".format(self.split, len(self.metadata))) 59 | 60 | def _get_video_path(self, sample): 61 | rel_video_fp = sample['vid_id'] 62 | full_video_fp = os.path.join(self.data_dir, rel_video_fp) 63 | # print(full_video_fp) 64 | # assert os.path.exists(full_video_fp) 65 | return full_video_fp, rel_video_fp 66 | 67 | def get_text(self, sample): 68 | texts = [] 69 | for text in sample['options']: 70 | encoding = self.tokenizer( 71 | text, 72 | padding="max_length", 73 | truncation=True, 74 | max_length=self.max_text_len, 75 | return_special_tokens_mask=True, 76 | ) 77 | texts.append((text, encoding)) 78 | return texts 79 | 80 | def get_answer_label(self, sample): 81 | answer = sample['answer'] 82 | return answer 83 | 84 | def __getitem__(self, index): 85 | result = False 86 | while not result: 87 | try: 88 | 
sample = self.metadata[index] 89 | image_tensor = self.get_video(sample) 90 | qid = index 91 | answer = self.get_answer_label(sample) 92 | ret = { 93 | "image": image_tensor, 94 | "img_index": index, 95 | "cap_index": index, 96 | "raw_index": index, 97 | 'answer': answer 98 | } 99 | texts = self.get_text(sample) 100 | ret["text"] = texts[0] 101 | for i in range(self.draw_false_text - 1): 102 | ret.update({f"false_text_{i}": texts[i+1]}) 103 | result = True 104 | except Exception as e: 105 | print(f"Error while read file idx {sample['vid_id']} in {self.names[0]} -> {e}") 106 | index = random.randint(0, len(self.metadata) - 1) 107 | return ret 108 | 109 | def __len__(self): 110 | return len(self.metadata) -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/k400.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .video_base_dataset import BaseDataset 3 | import os 4 | import random 5 | import pandas as pd 6 | from AllInOne.transforms.videoaug import VideoTransform 7 | 8 | 9 | class K400Dataset(BaseDataset): 10 | def __init__(self, *args, split="", **kwargs): 11 | assert split in ["train", "val", "test"] 12 | self.split = split 13 | self.metadata = None 14 | self.ans_lab_dict = dict() 15 | if split == "train": 16 | names = ["k400_train"] 17 | elif split == "val": 18 | names = ["k400_val"] 19 | elif split == "test": 20 | names = ["k400_test"] 21 | super().__init__( 22 | *args, 23 | **kwargs, 24 | names=names, 25 | text_column_name="questions", 26 | remove_duplicate=False, 27 | ) 28 | self.video_transform = VideoTransform(mode=self.split) # train or val model 29 | self._load_metadata() 30 | 31 | def _load_metadata(self): 32 | metadata_dir = './meta_data/k400' 33 | split_files = { 34 | 'train': 'k400_train_tsm.list', 35 | 'val': 'k400_test_tsm.list', 36 | 'test': 'k400_test_tsm.list' 37 | } 38 | target_split_fp = split_files[self.split] 39 | with open(os.path.join(metadata_dir, target_split_fp)) as f: 40 | self.metadata = f.readlines() 41 | answer_fp = os.path.join(metadata_dir, 'kinetics_label_map.txt') 42 | count = 0 43 | with open(answer_fp, 'r') as f: 44 | lines = f.readlines() 45 | for line in lines: 46 | self.ans_lab_dict[str(line.strip())] = count 47 | count += 1 48 | 49 | def _get_video_path(self, sample): 50 | # find the name is os.listdir() e.g. abseiling/0wR5jVB-WPk.mp4 51 | # /data/algceph/arcdata/Kinetics-400/train_zips/snowboarding/MCgJO4s1qBA_000129_000139.zip 52 | # -> snowboarding/MCgJO4s1qBA_000129_000139.mp4 53 | if self.split == 'train': 54 | rel_path = sample[0][46:-4] + '.mp4' 55 | else: 56 | # val maybe mkv. webm etc. 
57 | fake_path = sample[0][44:-4] 58 | sub_dir, video_name = fake_path.split('/') 59 | rel_path = sub_dir 60 | for video in os.listdir(os.path.join(self.data_dir, self.split, sub_dir)): 61 | if video_name in video: 62 | rel_path = os.path.join(rel_path, video) 63 | break 64 | full_path = os.path.join(self.data_dir, self.split, rel_path) 65 | # print(full_path) 66 | return full_path, rel_path 67 | 68 | def get_text(self, sample): 69 | text = "A persion is doing [MASK]" 70 | encoding = self.tokenizer( 71 | text, 72 | padding="max_length", 73 | truncation=True, 74 | max_length=self.max_text_len, 75 | return_special_tokens_mask=True, 76 | ) 77 | return (text, encoding) 78 | 79 | def get_answer_label(self, sample): 80 | text = "None" 81 | # print(len(self.ans_lab_dict)) 82 | ans_total_len = len(self.ans_lab_dict) + 1 # one additional class 83 | ans_label = int(sample[1]) 84 | scores = np.zeros(ans_total_len).astype(int) 85 | scores[ans_label] = 1 86 | return text, ans_label, scores 87 | 88 | def __getitem__(self, index): 89 | result = None 90 | while result is None: 91 | sample = self.metadata[index].split('\t') 92 | try: 93 | image_tensor = self.get_video(sample) 94 | text = self.get_text(sample) 95 | qid = index 96 | if self.split != "test": 97 | answers, labels, scores = self.get_answer_label(sample) 98 | else: 99 | answers = list() 100 | labels = list() 101 | scores = list() 102 | result = True 103 | except Exception as e: 104 | print(f"Error while read file idx {sample[0]} -> {e}") 105 | index = random.randint(0, len(self.metadata) - 1) 106 | return { 107 | "image": image_tensor, 108 | "text": text, 109 | "vqa_answer": answers, 110 | "vqa_labels": labels, 111 | "vqa_scores": scores, 112 | "qid": qid, 113 | } 114 | 115 | def __len__(self): 116 | return len(self.metadata) -------------------------------------------------------------------------------- /univl/eval/retrieval.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | import numpy as np 4 | from utils.metrics import compute_metrics 5 | from utils.utils import parallel_apply 6 | global logger 7 | 8 | def _run_on_single_gpu(net, batch_list_t, batch_list_v, batch_sequence_output_list, batch_visual_output_list): 9 | sim_matrix = [] 10 | for idx1, b1 in enumerate(tqdm(batch_list_t)): 11 | input_ids, input_mask, segment_ids, _, _, _, _, _, _ = b1 12 | sequence_output = batch_sequence_output_list[idx1] 13 | each_row = [] 14 | for idx2, b2 in enumerate(batch_list_v): 15 | _, _, _, video, video_mask, _, _, _, _ = b2 16 | visual_output = batch_visual_output_list[idx2] 17 | b1b2_logits = net.get_similarity_logits_align(sequence_output, visual_output, input_mask, video_mask) 18 | b1b2_logits = b1b2_logits.cpu().detach().numpy() 19 | each_row.append(b1b2_logits) 20 | each_row = np.concatenate(tuple(each_row), axis=-1) 21 | sim_matrix.append(each_row) 22 | return sim_matrix 23 | 24 | 25 | def eval_retrieval_epoch(model, test_dataloader, device, n_gpu, logger): 26 | 27 | if hasattr(model, 'module'): 28 | model = model.module.to(device) 29 | else: 30 | model = model.to(device) 31 | model.eval() 32 | with torch.no_grad(): 33 | batch_list = [] 34 | batch_sequence_output_list, batch_visual_output_list = [], [] 35 | for bid, batch in enumerate(tqdm(test_dataloader)): 36 | batch = tuple(t.to(device) for t in batch) 37 | 38 | input_ids, input_mask, segment_ids, video, video_mask, _, _, _, _ = batch 39 | 40 | 41 | sequence_output, visual_output = 
model.get_sequence_visual_output(input_ids, segment_ids, input_mask, video, video_mask) 42 | 43 | batch_sequence_output_list.append(sequence_output) 44 | batch_visual_output_list.append(visual_output) 45 | batch_list.append(batch) 46 | 47 | print("{}/{}\r".format(bid, len(test_dataloader)), end="") 48 | 49 | if n_gpu > 1: 50 | device_ids = list(range(n_gpu)) 51 | batch_list_t_splits = [] 52 | batch_list_v_splits = [] 53 | batch_t_output_splits = [] 54 | batch_v_output_splits = [] 55 | bacth_len = len(batch_list) 56 | split_len = (bacth_len + n_gpu - 1) // n_gpu 57 | for dev_id in device_ids: 58 | s_, e_ = dev_id * split_len, (dev_id + 1) * split_len 59 | if dev_id == 0: 60 | batch_list_t_splits.append(batch_list[s_:e_]) 61 | batch_list_v_splits.append(batch_list) 62 | 63 | batch_t_output_splits.append(batch_sequence_output_list[s_:e_]) 64 | batch_v_output_splits.append(batch_visual_output_list) 65 | else: 66 | devc = torch.device('cuda:{}'.format(str(dev_id))) 67 | devc_batch_list = [tuple(t.to(devc) for t in b) for b in batch_list[s_:e_]] 68 | batch_list_t_splits.append(devc_batch_list) 69 | devc_batch_list = [tuple(t.to(devc) for t in b) for b in batch_list] 70 | batch_list_v_splits.append(devc_batch_list) 71 | 72 | devc_batch_list = [b.to(devc) for b in batch_sequence_output_list[s_:e_]] 73 | batch_t_output_splits.append(devc_batch_list) 74 | devc_batch_list = [b.to(devc) for b in batch_visual_output_list] 75 | batch_v_output_splits.append(devc_batch_list) 76 | parameters_tuple_list = [(batch_list_t_splits[dev_id], batch_list_v_splits[dev_id], 77 | batch_t_output_splits[dev_id], batch_v_output_splits[dev_id]) for dev_id in device_ids] 78 | parallel_outputs = parallel_apply(_run_on_single_gpu, model, parameters_tuple_list, device_ids) 79 | sim_matrix = [] 80 | for idx in range(len(parallel_outputs)): 81 | sim_matrix += parallel_outputs[idx] 82 | sim_matrix = np.concatenate(tuple(sim_matrix), axis=0) 83 | 84 | else: 85 | sim_matrix = _run_on_single_gpu(model, batch_list, batch_list, batch_sequence_output_list, batch_visual_output_list) 86 | sim_matrix = np.concatenate(sim_matrix, axis=0) 87 | 88 | metrics = compute_metrics(sim_matrix) # 53 * (64, 3369) 89 | logger.info('\t Length-T: {}, Length-V:{}'.format(len(sim_matrix), len(sim_matrix[0]))) 90 | logger.info('\t>>> R@1: {:.4f} - R@5: {:.4f} - R@10: {:.4f} - Median R: {}'. 
91 | format(metrics['R1'], metrics['R5'], metrics['R10'], metrics['MR'])) 92 | 93 | R1, R5, R10, MR = metrics['R1'], metrics['R5'], metrics['R10'], metrics['MR'] 94 | 95 | return R1, R5, R10, MR 96 | -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/activitynet.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset, read_frames_from_img_dir 2 | import random 3 | import os 4 | import pandas as pd 5 | 6 | 7 | class ActivityNetDataset(BaseDataset): 8 | def __init__(self, *args, split="", **kwargs): 9 | assert split in ["train", "val", "test"] 10 | self.split = split 11 | self.metadata = None 12 | if split == "train": 13 | names = ["activitynet_train"] 14 | elif split == "val": 15 | names = ["activitynet_val"] 16 | elif split == "test": 17 | names = ["activitynet_val"] 18 | self._load_metadata() 19 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 20 | 21 | def _load_metadata(self): 22 | metadata_dir = './meta_data/activitynet' 23 | split_files = { 24 | 'train': 'train.jsonl', 25 | 'val': 'val1.jsonl', 26 | 'test': 'val2.jsonl' 27 | } 28 | target_split_fp = split_files[self.split] 29 | metadata = pd.read_json(os.path.join(metadata_dir, target_split_fp), lines=True) 30 | self.metadata = metadata 31 | 32 | def _get_video_path(self, sample): 33 | rel_video_fp = sample['clip_name'] 34 | full_video_fp = os.path.join(self.data_dir, 'activitynet_frames', rel_video_fp) 35 | return full_video_fp, rel_video_fp 36 | 37 | def get_raw_video(self, sample): 38 | abs_fp, rel_fp = self._get_video_path(sample) 39 | imgs, idxs, vlen = read_frames_from_img_dir(abs_fp, self.num_frames, mode=self.split) 40 | if imgs is None: 41 | raise Exception("Invalid img!", rel_fp) 42 | else: 43 | return imgs 44 | 45 | def get_video(self, index, sample, image_key="image"): 46 | imgs = self.get_raw_video(sample).permute(1, 0, 2, 3) # to cthw 47 | imgs_tensor = [self.video_transform(imgs).permute(1, 0, 2, 3)] # to tchw 48 | return { 49 | "image": imgs_tensor, 50 | "img_index": index, 51 | "cap_index": index, 52 | "raw_index": index, 53 | } 54 | 55 | def get_false_video(self, rep, image_key="image"): 56 | random_index = random.randint(0, len(self.metadata) - 1) 57 | sample = self.metadata.iloc[random_index] 58 | imgs = self.get_raw_video(sample).permute(1, 0, 2, 3) # to cthw 59 | # can be different augmentation 60 | imgs_tensor = [self.video_transform(imgs).permute(1, 0, 2, 3)] # to tchw 61 | return {f"false_image_{rep}": imgs_tensor} 62 | 63 | def get_text(self, raw_index, sample): 64 | text = sample['caption'] 65 | # print(text) 66 | encoding = self.tokenizer( 67 | text, 68 | padding="max_length", 69 | truncation=True, 70 | max_length=self.max_text_len, 71 | return_special_tokens_mask=True, 72 | ) 73 | # print(encoding.size()) 74 | return { 75 | "text": (text, encoding), 76 | "img_index": raw_index, 77 | "cap_index": raw_index, 78 | "raw_index": raw_index, 79 | } 80 | 81 | def get_false_text(self, rep): 82 | random_index = random.randint(0, len(self.metadata) - 1) 83 | sample = self.metadata.iloc[random_index] 84 | text = sample['caption'] 85 | encoding = self.tokenizer( 86 | text, 87 | # padding="max_length", 88 | truncation=True, 89 | max_length=self.max_text_len, 90 | return_special_tokens_mask=True, 91 | ) 92 | return {f"false_text_{rep}": (text, encoding)} 93 | 94 | def get_suite(self, index): 95 | result = None 96 | while result is None: 97 | sample = 
self.metadata.iloc[index] 98 | try: 99 | ret = dict() 100 | ret.update(self.get_video(index, sample)) 101 | if not self.image_only: 102 | txt = self.get_text(index, sample) 103 | ret.update({"replica": True if txt["cap_index"] > 0 else False}) 104 | ret.update(txt) 105 | 106 | for i in range(self.draw_false_image): 107 | ret.update(self.get_false_video(i)) 108 | for i in range(self.draw_false_text): 109 | ret.update(self.get_false_text(i)) 110 | result = True 111 | except Exception as e: 112 | print(f"Error while read file idx {sample.name} in {self.names[0]} -> {e}") 113 | index = random.randint(0, len(self.metadata) - 1) 114 | return ret 115 | 116 | def __len__(self): 117 | return len(self.metadata) 118 | 119 | def __getitem__(self, index): 120 | return self.get_suite(index) -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/webvid.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset, read_frames_decord 2 | import random 3 | import os 4 | import pandas as pd 5 | 6 | 7 | class WEBVIDDataset(BaseDataset): 8 | def __init__(self, *args, split="", **kwargs): 9 | assert split in ["train", "val", "test"] 10 | self.split = split 11 | self.metadata = None 12 | self.cut = "jsfusion" 13 | if split == "train": 14 | names = ["webvid_train"] 15 | elif split == "val": 16 | names = ["webvid_val"] 17 | elif split == "test": 18 | names = ["webvid_val"] 19 | self._load_metadata() 20 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 21 | 22 | def _load_metadata(self): 23 | metadata_dir = './meta_data/webvid' 24 | split_files = { 25 | 'train': 'webvid_training_success_full.tsv', 26 | 'val': 'webvid_validation_success_full.tsv', # there is no test 27 | 'test': 'webvid_validation_success_full.tsv' 28 | } 29 | target_split_fp = split_files[self.split] 30 | metadata = pd.read_csv(os.path.join(metadata_dir, target_split_fp), sep='\t') 31 | self.metadata = metadata 32 | 33 | def _get_video_path(self, sample): 34 | rel_video_fp = sample[1] + '.mp4' 35 | full_video_fp = os.path.join(self.data_dir, self.split, rel_video_fp) 36 | return full_video_fp, rel_video_fp 37 | 38 | def _get_caption(self, sample): 39 | return sample[0] 40 | 41 | def get_raw_video(self, sample): 42 | abs_fp, rel_fp = self._get_video_path(sample) 43 | imgs, idxs, vlen = read_frames_decord(abs_fp, self.num_frames, mode=self.split) 44 | if imgs is None: 45 | raise Exception("Invalid img!", rel_fp) 46 | else: 47 | return imgs 48 | 49 | def get_video(self, index, sample, image_key="image"): 50 | imgs = self.get_raw_video(sample).permute(1, 0, 2, 3) # to cthw 51 | imgs_tensor = [self.video_transform(imgs).permute(1, 0, 2, 3)] # to tchw 52 | return { 53 | "image": imgs_tensor, 54 | "img_index": index, 55 | "cap_index": index, 56 | "raw_index": index, 57 | } 58 | 59 | def get_false_video(self, rep, image_key="image"): 60 | random_index = random.randint(0, len(self.metadata) - 1) 61 | sample = self.metadata.iloc[random_index] 62 | imgs = self.get_raw_video(sample).permute(1, 0, 2, 3) # to cthw 63 | # can be different augmentation 64 | imgs_tensor = [self.video_transform(imgs).permute(1, 0, 2, 3)] # to tchw 65 | return {f"false_image_{rep}": imgs_tensor} 66 | 67 | def get_text(self, raw_index, sample): 68 | text = sample[0] 69 | # print(text) 70 | encoding = self.tokenizer( 71 | text, 72 | padding="max_length", 73 | truncation=True, 74 | max_length=self.max_text_len, 75 | 
return_special_tokens_mask=True, 76 | ) 77 | # print(encoding.size()) 78 | return { 79 | "text": (text, encoding), 80 | "img_index": raw_index, 81 | "cap_index": raw_index, 82 | "raw_index": raw_index, 83 | } 84 | 85 | def get_false_text(self, rep): 86 | random_index = random.randint(0, len(self.metadata) - 1) 87 | sample = self.metadata.iloc[random_index] 88 | text = sample[0] 89 | encoding = self.tokenizer( 90 | text, 91 | # padding="max_length", 92 | truncation=True, 93 | max_length=self.max_text_len, 94 | return_special_tokens_mask=True, 95 | ) 96 | return {f"false_text_{rep}": (text, encoding)} 97 | 98 | def get_suite(self, index): 99 | result = None 100 | while result is None: 101 | sample = self.metadata.iloc[index] 102 | try: 103 | ret = dict() 104 | ret.update(self.get_video(index, sample)) 105 | if not self.image_only: 106 | txt = self.get_text(index, sample) 107 | ret.update({"replica": True if txt["cap_index"] > 0 else False}) 108 | ret.update(txt) 109 | 110 | for i in range(self.draw_false_image): 111 | ret.update(self.get_false_video(i)) 112 | for i in range(self.draw_false_text): 113 | ret.update(self.get_false_text(i)) 114 | result = True 115 | except Exception as e: 116 | print(f"Error while read file idx {sample.name} in {self.names[0]} -> {e}") 117 | index = random.randint(0, len(self.metadata) - 1) 118 | return ret 119 | 120 | def __len__(self): 121 | return len(self.metadata) 122 | 123 | def __getitem__(self, index): 124 | return self.get_suite(index) -------------------------------------------------------------------------------- /allinone/AllInOne/datasets/tvqaplus.py: -------------------------------------------------------------------------------- 1 | from .video_base_dataset import BaseDataset 2 | import os 3 | import pandas as pd 4 | import cv2 5 | import torch 6 | from AllInOne.datasets.video_base_dataset import sample_frames 7 | 8 | # each sample: https://tvqa.cs.unc.edu/download_tvqa_plus.html 9 | # { 10 | # "answer_idx": "1", 11 | # "qid": 134094, 12 | # "ts": [5.99, 11.98], 13 | # "a1": "Howard is talking to Raj and Leonard", 14 | # "a0": "Howard is talking to Bernadette", 15 | # "a3": "Howard is talking to Leonard and Penny", 16 | # "a2": "Howard is talking to Sheldon , and Raj", 17 | # "q": "Who is Howard talking to when he is in the lab room ?", 18 | # "vid_name": "s05e02_seg02_clip_00", 19 | # "a4": "Howard is talking to Penny and Bernadette", 20 | # "bbox": { 21 | # "14": [ 22 | # { 23 | # "img_id": 14, 24 | # "top": 153, 25 | # "label": "Howard", 26 | # "width": 180, 27 | # "height": 207, 28 | # "left": 339 29 | # }, 30 | # { 31 | # "img_id": 14, 32 | # "top": 6, 33 | # "label": "lab", 34 | # "width": 637, 35 | # "height": 354, 36 | # "left": 3 37 | # }, 38 | # ... 39 | # ], 40 | # "20": [ ... ], 41 | # "26": [ ... ], 42 | # "32": [ ... ], 43 | # "38": [ ... 
] 44 | # } 45 | # } 46 | 47 | 48 | class TVQAPLUSDataset(BaseDataset): 49 | def __init__(self, *args, split="", **kwargs): 50 | assert split in ["train", "val", "test"] 51 | self.split = split 52 | self.metadata = None 53 | self._load_metadata() 54 | if split == "train": 55 | names = ["tvqaplus_train"] 56 | elif split == "val": 57 | names = ["tvqaplus_val"] 58 | elif split == "test": 59 | names = ["tvqaplus_test"] 60 | 61 | super().__init__(*args, **kwargs, names=names, text_column_name="caption") 62 | # for appear objects 63 | self.only_use_relevant_dets = True 64 | if self.only_use_relevant_dets: 65 | self.relevant_dets = [] # resort the detection numbers 66 | self.relevant_dets_classes = [] 67 | 68 | def _load_metadata(self): 69 | # download specific 70 | metadata_dir = './meta_data/tvqa' 71 | split_files = { 72 | 'train': 'tvqa_plus_train.jsonl', 73 | 'val': 'tvqa_plus_val.jsonl', 74 | 'test': 'tvqa_plus_test_public.jsonl' # no GT label for test set 75 | } 76 | target_split_fp = split_files[self.split] 77 | metadata = pd.read_json(os.path.join(metadata_dir, target_split_fp), lines=True) 78 | self.metadata = metadata 79 | 80 | def _get_image_path(self, sample): 81 | rel_fp = sample['vid_name'] 82 | return os.path.join(self.data_dir, rel_fp), rel_fp 83 | 84 | def _get_caption(self, sample): 85 | return sample[0] 86 | 87 | # tvqaplus provide sampled frames (3 fps) 88 | # To Do: considering sample one frame with bounding box 89 | def get_raw_video(self, sample): 90 | abs_fp, rel_fp = self._get_image_path(sample) 91 | [beg_time, end_time] = sample['ts'] 92 | clip_len = int((float(end_time) - float(beg_time)) * 3) 93 | rel_frame_index = sample_frames(self.num_frames, clip_len) 94 | # sample N frames here 95 | frames = [] 96 | for index in rel_frame_index: 97 | img = cv2.imread(abs_fp + '{}.jpg'.format(index)) 98 | frame = torch.from_numpy(img).byte() 99 | frame = frame.permute(2, 0, 1) 100 | frames.append(frame) 101 | frames = torch.stack(frames).permute(1, 0, 2, 3) 102 | return frames 103 | 104 | def get_text(self, sample): 105 | question = self.get_question(sample) 106 | qa_texts = [] 107 | # 5 choices 108 | for i in range(5): 109 | raw_text = question + "[SEP]" + sample["a{}".format(i)] 110 | qa_encoding = self.tokenizer( 111 | raw_text, 112 | padding="max_length", 113 | truncation=True, 114 | max_length=self.max_text_len, 115 | return_special_tokens_mask=True, 116 | ) 117 | qa_texts.append((raw_text, qa_encoding)) 118 | return qa_texts 119 | 120 | def get_answer_label(self, sample): 121 | answer = int(sample['answer_idx']) 122 | return answer 123 | 124 | def get_question(self, sample): 125 | return sample["q"] 126 | 127 | def __len__(self): 128 | return len(self.metadata) 129 | 130 | def __getitem__(self, index): 131 | sample = self.metadata.iloc[index] 132 | self.relevant_dets = [] # initalize 133 | self.relevant_dets_classes = [] 134 | answer = self.get_answer_label(sample) 135 | ret = { 136 | "img_index": index, 137 | "cap_index": index, 138 | "raw_index": index, 139 | 'answer': answer 140 | } 141 | qa_texts = self.get_text(sample) 142 | ret["text"] = qa_texts[0] 143 | for i in range(self.draw_options_text - 1): 144 | ret.update({f"options_text_{i}": qa_texts[i+1]}) 145 | video_tensor = self.get_video(sample) 146 | ret["image"] = video_tensor 147 | return ret 148 | 149 | -------------------------------------------------------------------------------- /violet/eval_retrieval.py: -------------------------------------------------------------------------------- 1 | 2 | from lib import * 3 
| from dataset import Dataset_Base 4 | from model import VIOLET_Base 5 | from agent import Agent_Base 6 | import pandas as pd 7 | 8 | class Dataset_Retrieval(Dataset_Base): 9 | def __init__(self, args, split): 10 | super().__init__(args) 11 | self.split = split 12 | 13 | dataset = args['dataset'] 14 | self.imgs = pickle.load(open(f'./_data/{dataset}/img_{dataset}.pkl', 'rb')) 15 | self.vq = pickle.load(open(f'./_data/{dataset}/{dataset}_vq.pkl', 'rb')) 16 | 17 | if split == 'train': 18 | self.data = json.load(open(args['train_annotation'], 'r'))['sentences'] 19 | else: 20 | self.data = json.load(open(args['test_annotation'], 'r'))['sentences'] 21 | 22 | def __len__(self): 23 | return len(self.data) 24 | 25 | def __getitem__(self, idx): 26 | vid = self.data[idx]['video_id'] 27 | txt, mask = self.str2txt(self.data[idx]['caption']) 28 | 29 | img = [] 30 | for b in self.imgs[vid]: 31 | img.append(self.str2img(b).unsqueeze(0)) 32 | img = T.cat(img, dim=0) 33 | 34 | return img, txt, mask, vid 35 | 36 | class Dataset_Product(T.utils.data.Dataset): 37 | def __init__(self, feat): 38 | super().__init__() 39 | 40 | self.vid2idx = {v: i for i, v in enumerate(feat)} 41 | self.lst = [[feat[p], feat[q]] for p in feat for q in feat] 42 | 43 | def __len__(self): 44 | return len(self.lst) 45 | 46 | def __getitem__(self, idx): 47 | p, q = self.lst[idx] 48 | 49 | return [p['feat_txt'], p['mask_txt'], self.vid2idx[p['video']], 50 | q['feat_img'], q['mask_img'], self.vid2idx[q['video']]] # (p->text, q->video) 51 | 52 | class VIOLET_Retrieval(VIOLET_Base): 53 | def __init__(self): 54 | super().__init__() 55 | 56 | self.fc = T.nn.Sequential(*[T.nn.Dropout(0.1), 57 | T.nn.Linear(768, 768*2), T.nn.ReLU(inplace=True), 58 | T.nn.Linear(768*2, 1)]) 59 | 60 | def forward(self, typ, 61 | img=None, txt=None, mask=None, 62 | feat_img=None, mask_img=None, feat_txt=None, mask_txt=None): 63 | 64 | if typ=='feat': 65 | feat_img, mask_img, feat_txt, mask_txt = self.go_feat(img, txt, mask) 66 | return feat_img, mask_img, feat_txt, mask_txt 67 | 68 | elif typ=='cross': 69 | out, _ = self.go_cross(feat_img, mask_img, feat_txt, mask_txt) 70 | out = self.fc(out[:, feat_img.shape[1], :]).squeeze() 71 | return out 72 | 73 | if __name__=='__main__': 74 | args = json.load(open(sys.argv[1], 'r')) 75 | args['size_batch'] = 100*T.cuda.device_count() 76 | print(args) 77 | 78 | model = T.nn.DataParallel(VIOLET_Retrieval().cuda()) 79 | model.module.load_ckpt(args['path_ckpt']) 80 | model.eval() 81 | 82 | for split in ['val']: 83 | dl = T.utils.data.DataLoader(Dataset_Retrieval(args, split), 84 | batch_size=args['size_batch'], shuffle=False, 85 | num_workers=64, pin_memory=True) 86 | feat = {} 87 | for img, txt, mask, vid in tqdm(dl, ascii=True): 88 | with T.no_grad(): 89 | feat_img, mask_img, feat_txt, mask_txt = model(typ='feat', img=img.cuda(), txt=txt.cuda(), mask=mask.cuda()) 90 | for v, f_i, m_i, f_t, m_t in zip(vid, *[d.data.cpu().numpy() for d in [feat_img, mask_img, feat_txt, mask_txt]]): 91 | feat[v] = {'video': v, 'feat_img': f_i, 'mask_img': m_i, 'feat_txt': f_t, 'mask_txt': m_t} 92 | 93 | dl = T.utils.data.DataLoader(Dataset_Product(feat), 94 | batch_size=args['size_batch'], shuffle=False, 95 | num_workers=64, pin_memory=True) 96 | rank = {} 97 | for feat_txt, mask_txt, idx_txt, feat_img, mask_img, idx_vid in tqdm(dl, ascii=True): 98 | with T.no_grad(): 99 | out = model(typ='cross', feat_img=feat_img, mask_img=mask_img, feat_txt=feat_txt, mask_txt=mask_txt) 100 | out = T.sigmoid(out).data.cpu().numpy() 101 | for i_t, i_v, o in 
zip(idx_txt, idx_vid, out): 102 | i_t, i_v, o = int(i_t), int(i_v), float(o) 103 | 104 | if not i_t in rank: 105 | rank[i_t] = [] 106 | rank[i_t].append([i_v, o]) 107 | 108 | res = {'r@1': 0, 'r@5': 0, 'r@10': 0, 'median': []} 109 | for i_t in rank: 110 | tmp = sorted(rank[i_t], key=lambda d: -d[1]) 111 | p = [d[0] for d in tmp].index(i_t)+1 112 | 113 | if p<=1: 114 | res['r@1'] += 1.0/len(rank) 115 | if p<=5: 116 | res['r@5'] += 1.0/len(rank) 117 | if p<=10: 118 | res['r@10'] += 1.0/len(rank) 119 | res['median'].append(p) 120 | res['median'] = int(np.median(res['median'])) 121 | 122 | print(split, res) 123 | with open('result.txt', 'a') as f: 124 | text = f"r@1: {res['r@1']}, r@5: {res['r@5']}, r@10: {res['r@10']}, median: {res['median']}\n" 125 | f.write(text) -------------------------------------------------------------------------------- /univl/modules/until_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """PyTorch BERT model.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import copy 24 | import json 25 | import logging 26 | import tarfile 27 | import tempfile 28 | import shutil 29 | import torch 30 | from .file_utils import cached_path 31 | 32 | logger = logging.getLogger(__name__) 33 | 34 | class PretrainedConfig(object): 35 | 36 | pretrained_model_archive_map = {} 37 | config_name = "" 38 | weights_name = "" 39 | 40 | @classmethod 41 | def get_config(cls, pretrained_model_name, cache_dir, type_vocab_size, state_dict, task_config=None): 42 | archive_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), pretrained_model_name) 43 | if os.path.exists(archive_file) is False: 44 | if pretrained_model_name in cls.pretrained_model_archive_map: 45 | archive_file = cls.pretrained_model_archive_map[pretrained_model_name] 46 | else: 47 | archive_file = pretrained_model_name 48 | 49 | # redirect to the cache, if necessary 50 | try: 51 | resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) 52 | except FileNotFoundError: 53 | if task_config is None or task_config.local_rank == 0: 54 | logger.error( 55 | "Model name '{}' was not found in model name list. 
" 56 | "We assumed '{}' was a path or url but couldn't find any file " 57 | "associated to this path or url.".format( 58 | pretrained_model_name, 59 | archive_file)) 60 | return None 61 | if resolved_archive_file == archive_file: 62 | if task_config is None or task_config.local_rank == 0: 63 | logger.info("loading archive file {}".format(archive_file)) 64 | else: 65 | if task_config is None or task_config.local_rank == 0: 66 | logger.info("loading archive file {} from cache at {}".format( 67 | archive_file, resolved_archive_file)) 68 | tempdir = None 69 | if os.path.isdir(resolved_archive_file): 70 | serialization_dir = resolved_archive_file 71 | else: 72 | # Extract archive to temp dir 73 | tempdir = tempfile.mkdtemp() 74 | if task_config is None or task_config.local_rank == 0: 75 | logger.info("extracting archive file {} to temp dir {}".format( 76 | resolved_archive_file, tempdir)) 77 | with tarfile.open(resolved_archive_file, 'r:gz') as archive: 78 | archive.extractall(tempdir) 79 | serialization_dir = tempdir 80 | # Load config 81 | config_file = os.path.join(serialization_dir, cls.config_name) 82 | config = cls.from_json_file(config_file) 83 | config.type_vocab_size = type_vocab_size 84 | if task_config is None or task_config.local_rank == 0: 85 | logger.info("Model config {}".format(config)) 86 | 87 | if state_dict is None: 88 | weights_path = os.path.join(serialization_dir, cls.weights_name) 89 | if os.path.exists(weights_path): 90 | state_dict = torch.load(weights_path, map_location='cpu') 91 | else: 92 | if task_config is None or task_config.local_rank == 0: 93 | logger.info("Weight doesn't exsits. {}".format(weights_path)) 94 | 95 | if tempdir: 96 | # Clean up temp dir 97 | shutil.rmtree(tempdir) 98 | 99 | return config, state_dict 100 | 101 | @classmethod 102 | def from_dict(cls, json_object): 103 | """Constructs a `BertConfig` from a Python dictionary of parameters.""" 104 | config = cls(vocab_size_or_config_json_file=-1) 105 | for key, value in json_object.items(): 106 | config.__dict__[key] = value 107 | return config 108 | 109 | @classmethod 110 | def from_json_file(cls, json_file): 111 | """Constructs a `BertConfig` from a json file of parameters.""" 112 | with open(json_file, "r", encoding='utf-8') as reader: 113 | text = reader.read() 114 | return cls.from_dict(json.loads(text)) 115 | 116 | def __repr__(self): 117 | return str(self.to_json_string()) 118 | 119 | def to_dict(self): 120 | """Serializes this instance to a Python dictionary.""" 121 | output = copy.deepcopy(self.__dict__) 122 | return output 123 | 124 | def to_json_string(self): 125 | """Serializes this instance to a JSON string.""" 126 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" --------------------------------------------------------------------------------