├── lavis ├── models │ ├── blip2_models │ │ └── __init__.py │ ├── .DS_Store │ ├── blip_models │ │ └── .DS_Store │ ├── clip_models │ │ ├── .DS_Store │ │ ├── pics │ │ │ └── CLIP.png │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ └── __init__.py │ ├── gpt_models │ │ └── .DS_Store │ ├── timesformer │ │ ├── .DS_Store │ │ ├── __init__.py │ │ └── linear.py │ ├── albef_models │ │ └── .DS_Store │ ├── alpro_models │ │ └── .DS_Store │ ├── pnp_vqa_models │ │ ├── .DS_Store │ │ └── __init__.py │ └── img2prompt_models │ │ ├── .DS_Store │ │ └── __init__.py ├── projects │ ├── blip2 │ │ └── eval │ │ │ ├── caption_coco_flant5xl_eval.yaml │ │ │ ├── ret_flickr_eval.yaml │ │ │ ├── caption_coco_opt2.7b_eval.yaml │ │ │ ├── caption_coco_opt6.7b_eval.yaml │ │ │ ├── gqa_zeroshot_flant5xl_eval.yaml │ │ │ ├── okvqa_zeroshot_flant5xl_eval.yaml │ │ │ ├── ret_coco_eval.yaml │ │ │ └── vqav2_zeroshot_flant5xl_eval.yaml │ ├── .DS_Store │ ├── alpro │ │ ├── .DS_Store │ │ ├── eval │ │ │ ├── msrvtt_qa_eval.yaml │ │ │ ├── msvd_qa_eval.yaml │ │ │ ├── msrvtt_ret_eval.yaml │ │ │ └── didemo_ret_eval.yaml │ │ └── train │ │ │ └── msrvtt_retrieval_ft.yaml │ ├── blip │ │ ├── eval │ │ │ ├── nlvr_eval.yaml │ │ │ ├── caption_coco_eval.yaml │ │ │ ├── caption_coco_eval_large.yaml │ │ │ ├── ret_flickr_eval.yaml │ │ │ ├── okvqa_eval.yaml │ │ │ ├── aokvqa_eval.yaml │ │ │ ├── vqav2_eval.yaml │ │ │ ├── nocaps_eval.yaml │ │ │ └── ret_coco_eval.yaml │ │ ├── train │ │ │ ├── nlvr_ft.yaml │ │ │ ├── caption_coco_large_ft.yaml │ │ │ ├── caption_coco_ft.yaml │ │ │ └── okvqa_ft.yaml │ │ └── coco_cap_ft_iter.yaml │ ├── clip │ │ ├── exp_imnet_zs_eval.yaml │ │ ├── exp_coco_ret_eval.yaml │ │ └── exp_flickr_ret_eval.yaml │ ├── albef │ │ ├── eval │ │ │ ├── nlvr_eval.yaml │ │ │ ├── snli_ve_eval.yaml │ │ │ ├── ret_coco_eval.yaml │ │ │ ├── ret_flickr30k_eval.yaml │ │ │ ├── vqa_val.yaml │ │ │ └── vqa_test.yaml │ │ └── train │ │ │ ├── snli_ve_ft.yaml │ │ │ ├── nlvr_ft.yaml │ │ │ ├── okvqa_ft.yaml │ │ │ └── aokvqa_ft.yaml │ ├── gpt │ │ └── eval │ │ │ └── dialogue_avsd_eval.yaml │ └── pnp-vqa │ │ └── eval │ │ ├── okvqa_eval.yaml │ │ ├── okvqa_eval_3b.yaml │ │ ├── okvqa_eval_large.yaml │ │ ├── gqa_eval.yaml │ │ ├── gqa_eval_3b.yaml │ │ ├── vqav2_eval.yaml │ │ ├── vqav2_eval_3b.yaml │ │ ├── gqa_eval_large.yaml │ │ ├── vqav2_test_eval.yaml │ │ ├── vqav2_test_eval_3b.yaml │ │ ├── vqav2_eval_large.yaml │ │ └── vqav2_test_eval_large.yaml ├── configs │ ├── .DS_Store │ ├── datasets │ │ ├── .DS_Store │ │ ├── laion │ │ │ └── defaults_2B_multi.yaml │ │ ├── imagenet │ │ │ └── defaults.yaml │ │ ├── vg │ │ │ ├── defaults_vqa.yaml │ │ │ └── defaults_caption.yaml │ │ ├── conceptual_caption │ │ │ ├── defaults_3m.yaml │ │ │ └── defaults_12m.yaml │ │ ├── how2qa │ │ │ └── defaults_qa.yaml │ │ ├── star │ │ │ └── defaults_qa.yaml │ │ ├── vlep │ │ │ └── defaults_qa.yaml │ │ ├── tvqa │ │ │ └── defaults_qa.yaml │ │ ├── msrvttmc │ │ │ └── defaults_qa.yaml │ │ ├── nextqa │ │ │ ├── defaults_qa.yaml │ │ │ └── defaults_qa_old.yaml │ │ ├── mixed │ │ │ └── defaults.yaml │ │ ├── tacos │ │ │ ├── defaults.yaml │ │ │ └── relative_integer.yaml │ │ ├── nextgqa │ │ │ └── defaults_qa.yaml │ │ ├── anet │ │ │ └── defaults.yaml │ │ ├── sbu_caption │ │ │ └── defaults.yaml │ │ ├── qvh │ │ │ └── defaults.yaml │ │ ├── charades_sta │ │ │ ├── seconds_decimal.yaml │ │ │ ├── relative_integer.yaml │ │ │ ├── relative_decimal.yaml │ │ │ └── defaults.yaml │ │ ├── qvhQ │ │ │ └── defaults.yaml │ │ ├── nocaps │ │ │ └── defaults.yaml │ │ ├── flickr30k │ │ │ └── defaults.yaml │ │ ├── nlvr │ │ │ └── defaults.yaml │ │ ├── snli_ve │ │ 
│ └── defaults.yaml │ │ ├── msvd │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_qa.yaml │ │ ├── msrvtt │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_ret.yaml │ │ ├── vatex │ │ │ └── defaults_cap.yaml │ │ ├── avsd │ │ │ └── defaults_dial.yaml │ │ ├── didemo │ │ │ └── defaults_ret.yaml │ │ ├── coco │ │ │ ├── defaults_ret.yaml │ │ │ ├── defaults_cap.yaml │ │ │ └── eval_vqa.yaml │ │ └── gqa │ │ │ ├── balanced_val.yaml │ │ │ └── balanced_testdev.yaml │ ├── models │ │ ├── .DS_Store │ │ ├── clip_resnet50.yaml │ │ ├── clip │ │ │ ├── ViT-B-16.json │ │ │ ├── ViT-B-32.json │ │ │ ├── ViT-L-14.json │ │ │ ├── ViT-L-16.json │ │ │ ├── ViT-B-16-plus.json │ │ │ ├── ViT-L-14-280.json │ │ │ ├── ViT-L-14-336.json │ │ │ ├── ViT-L-16-320.json │ │ │ ├── ViT-B-16-plus-240.json │ │ │ ├── ViT-B-32-plus-256.json │ │ │ ├── ViT-H-14.json │ │ │ ├── ViT-H-16.json │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ ├── ViT-g-14.json │ │ │ ├── timm-resnet50d.json │ │ │ ├── timm-resnetaa50d.json │ │ │ ├── timm-resnetblur50.json │ │ │ ├── RN101.json │ │ │ ├── RN50.json │ │ │ ├── RN50x16.json │ │ │ ├── RN50x4.json │ │ │ ├── timm-efficientnetv2_rw_s.json │ │ │ ├── timm-vit_base_patch16_224.json │ │ │ ├── timm-vit_base_patch32_224.json │ │ │ ├── timm-vit_small_patch16_224.json │ │ │ ├── timm-swin_base_patch4_window7_224.json │ │ │ ├── RN101-quickgelu.json │ │ │ └── RN50-quickgelu.json │ │ ├── clip_vit_base16.yaml │ │ ├── bert_config.json │ │ ├── med_config.json │ │ ├── med_large_config.json │ │ ├── blip_pretrain_large.yaml │ │ ├── med_config_albef.json │ │ ├── bert_config_alpro.json │ │ ├── blip_classification_base.yaml │ │ ├── blip_feature_extractor_base.yaml │ │ ├── albef_feature_extractor.yaml │ │ ├── blip_itm_base.yaml │ │ ├── blip_itm_large.yaml │ │ ├── blip_pretrain_base.yaml │ │ ├── gpt_dialogue_base.yaml │ │ ├── blip2 │ │ │ ├── blip2_pretrain.yaml │ │ │ ├── blip2_coco.yaml │ │ │ ├── blip2_pretrain_flant5xl.yaml │ │ │ ├── blip2_pretrain_opt2.7b.yaml │ │ │ ├── blip2_pretrain_opt6.7b.yaml │ │ │ ├── blip2_pretrain_flant5xxl.yaml │ │ │ ├── blip2_caption_opt2.7b.yaml │ │ │ ├── blip2_caption_opt6.7b.yaml │ │ │ └── blip2_caption_flant5xl.yaml │ │ ├── albef_pretrain_base.yaml │ │ ├── alpro_retrieval_didemo.yaml │ │ ├── blip_caption_large_coco.yaml │ │ ├── blip_vqa_okvqa.yaml │ │ ├── blip_vqa_aokvqa.yaml │ │ ├── blip_vqav2.yaml │ │ ├── blip_retrieval_coco.yaml │ │ ├── albef_classification_ve.yaml │ │ ├── blip_nlvr.yaml │ │ ├── albef_vqav2.yaml │ │ ├── blip_caption_base_coco.yaml │ │ ├── albef_nlvr.yaml │ │ ├── blip_retrieval_flickr.yaml │ │ ├── alpro_retrieval_msrvtt.yaml │ │ ├── alpro_qa_msvd.yaml │ │ ├── alpro_qa_msrvtt.yaml │ │ ├── albef_retrieval_coco.yaml │ │ ├── albef_retrieval_flickr.yaml │ │ ├── clip_vit_base32.yaml │ │ ├── clip_vit_large14.yaml │ │ └── clip_vit_large14_336.yaml │ └── default.yaml ├── common │ ├── vqa_tools │ │ └── __init__.py │ └── gradcam.py ├── runners │ └── __init__.py ├── tasks │ └── image_text_pretrain.py ├── processors │ └── base_processor.py ├── datasets │ ├── builders │ │ ├── dialogue_builder.py │ │ ├── temporal_action_localization_builder.py │ │ └── classification_builder.py │ ├── datasets │ │ ├── multimodal_classification_datasets.py │ │ └── vg_vqa_datasets.py │ └── download_scripts │ │ └── DownloadConceptualCaptions │ │ ├── LICENSE │ │ └── README.md └── __init__.py ├── assets ├── .DS_Store ├── model.png └── teaser.png ├── docs ├── _static │ ├── merlion.png │ ├── logo_final.png │ └── architecture.png ├── requirements.txt ├── tutorial.rst ├── Makefile ├── index.rst ├── make.bat └── tutorial.evaluation.rst 
├── pyproject.toml ├── run_scripts └── mr_BLIP │ ├── eval │ ├── nextGQA.sh │ ├── nextQA.sh │ ├── anet.sh │ ├── charades.sh │ └── qvh.sh │ └── train │ ├── nextGQA.sh │ ├── anet.sh │ ├── charades.sh │ ├── qvh.sh │ ├── qvhQ.sh │ └── nextQA.sh ├── MANIFEST.in ├── standalone_eval └── eval_sample.sh ├── requirements.txt ├── setup.py └── LICENSE.txt /lavis/models/blip2_models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/assets/.DS_Store -------------------------------------------------------------------------------- /assets/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/assets/model.png -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/assets/teaser.png -------------------------------------------------------------------------------- /lavis/models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/.DS_Store -------------------------------------------------------------------------------- /docs/_static/merlion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/docs/_static/merlion.png -------------------------------------------------------------------------------- /lavis/configs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/configs/.DS_Store -------------------------------------------------------------------------------- /lavis/projects/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/projects/.DS_Store -------------------------------------------------------------------------------- /docs/_static/logo_final.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/docs/_static/logo_final.png -------------------------------------------------------------------------------- /docs/_static/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/docs/_static/architecture.png -------------------------------------------------------------------------------- /lavis/configs/datasets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/configs/datasets/.DS_Store -------------------------------------------------------------------------------- /lavis/configs/models/.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/configs/models/.DS_Store -------------------------------------------------------------------------------- /lavis/projects/alpro/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/projects/alpro/.DS_Store -------------------------------------------------------------------------------- /lavis/models/blip_models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/blip_models/.DS_Store -------------------------------------------------------------------------------- /lavis/models/clip_models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/clip_models/.DS_Store -------------------------------------------------------------------------------- /lavis/models/gpt_models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/gpt_models/.DS_Store -------------------------------------------------------------------------------- /lavis/models/timesformer/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/timesformer/.DS_Store -------------------------------------------------------------------------------- /lavis/models/albef_models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/albef_models/.DS_Store -------------------------------------------------------------------------------- /lavis/models/alpro_models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/alpro_models/.DS_Store -------------------------------------------------------------------------------- /lavis/models/pnp_vqa_models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/pnp_vqa_models/.DS_Store -------------------------------------------------------------------------------- /lavis/models/clip_models/pics/CLIP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/clip_models/pics/CLIP.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | GitPython 2 | ipykernel 3 | nbsphinx==0.8.7 4 | pandoc 5 | sphinx 6 | sphinx_autodoc_typehints 7 | sphinx_rtd_theme -------------------------------------------------------------------------------- /lavis/models/img2prompt_models/.DS_Store: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/img2prompt_models/.DS_Store
--------------------------------------------------------------------------------
/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/run_scripts/mr_BLIP/eval/nextGQA.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run --nproc_per_node=4 evaluate.py --cfg-path lavis/projects/mr_BLIP/eval/nextGQA.yaml
--------------------------------------------------------------------------------
/run_scripts/mr_BLIP/train/nextGQA.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run --nproc_per_node=4 train.py --cfg-path lavis/projects/mr_BLIP/train/nextGQA.yaml
--------------------------------------------------------------------------------
/run_scripts/mr_BLIP/train/anet.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/mr_BLIP/train/anet.yaml
--------------------------------------------------------------------------------
/run_scripts/mr_BLIP/train/charades.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/mr_BLIP/train/charades.yaml
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | recursive-include lavis/configs *.yaml *.json
 2 | recursive-include lavis/projects *.yaml *.json
 3 | 
 4 | recursive-exclude lavis/datasets/download_scripts *
 5 | recursive-exclude lavis/output *
 6 | 
 7 | include requirements.txt
 8 | 
--------------------------------------------------------------------------------
/docs/tutorial.rst:
--------------------------------------------------------------------------------
 1 | Tutorials
 2 | ==============================
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 1
 6 | 
 7 |    tutorial.evaluation
 8 |    tutorial.training-example
 9 |    tutorial.configs
10 |    tutorial.datasets
11 |    tutorial.processors
12 |    tutorial.models
13 |    tutorial.tasks
14 | 
--------------------------------------------------------------------------------
/lavis/common/vqa_tools/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) 2022, salesforce.com, inc.
 3 | All rights reserved.
 4 | SPDX-License-Identifier: BSD-3-Clause
 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
 6 | """
 7 | 
 8 | __author__ = "aagrawal"
 9 | 
--------------------------------------------------------------------------------
/run_scripts/mr_BLIP/train/qvh.sh:
--------------------------------------------------------------------------------
 1 | # CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/mr_BLIP/train/qvh.yaml
 2 | CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run --nproc_per_node=4 train.py --cfg-path lavis/projects/mr_BLIP/train/qvh.yaml
--------------------------------------------------------------------------------
/run_scripts/mr_BLIP/train/qvhQ.sh:
--------------------------------------------------------------------------------
 1 | # CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/mr_BLIP/train/qvh.yaml
 2 | CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run --nproc_per_node=4 train.py --cfg-path lavis/projects/mr_BLIP/train/qvhQ.yaml
--------------------------------------------------------------------------------
/run_scripts/mr_BLIP/eval/nextQA.sh:
--------------------------------------------------------------------------------
 1 | # CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/mr_BLIP/eval/qvh.yaml
 2 | CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run --nproc_per_node=4 evaluate.py --cfg-path lavis/projects/mr_BLIP/eval/nextQA.yaml
--------------------------------------------------------------------------------
/run_scripts/mr_BLIP/train/nextQA.sh:
--------------------------------------------------------------------------------
 1 | # CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run --nproc_per_node=4 train.py --cfg-path lavis/projects/mr_BLIP/train/nextQA.yaml
 2 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/mr_BLIP/train/nextQA.yaml
--------------------------------------------------------------------------------
/lavis/models/img2prompt_models/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) 2022, salesforce.com, inc.
 3 | All rights reserved.
 4 | SPDX-License-Identifier: BSD-3-Clause
 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
 6 | """
 7 | 
 8 | import torch
 9 | 
10 | 
11 | 
12 | 
--------------------------------------------------------------------------------
/lavis/models/timesformer/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) 2022, salesforce.com, inc.
 3 | All rights reserved.
 4 | SPDX-License-Identifier: BSD-3-Clause
 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
 6 | 
 7 | Based on https://github.com/facebookresearch/TimeSformer
 8 | """
 9 | 
--------------------------------------------------------------------------------
/lavis/configs/models/clip_resnet50.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, salesforce.com, inc.
 2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: RN50 10 | 11 | pretrained: openai 12 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 
| "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/runners/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.runners.runner_base import RunnerBase 9 | from lavis.runners.runner_iter import RunnerIter 10 | 11 | __all__ = ["RunnerBase", "RunnerIter"] 12 | -------------------------------------------------------------------------------- /lavis/configs/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | env: 7 | # For default users 8 | # cache_root: "cache" 9 | # For internal use with persistent storage 10 | cache_root: "/nas-hdd/shoubin/pretrained_model/" 11 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnet50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnet50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnetaa50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetaa50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | 
-------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnetblur50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetblur50", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_base16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-16 10 | 11 | pretrained: openai 12 | 13 | preprocess: 14 | vis_processor: 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-efficientnetv2_rw_s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "efficientnetv2_rw_s", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 288 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_base_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_base_patch32_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch32_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_small_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_small_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /standalone_eval/eval_sample.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: bash standalone_eval/eval_sample.sh 3 | # submission_path=standalone_eval/sample_val_preds.jsonl 4 | submission_path=standalone_eval/hl_val_submission.jsonl 5 | gt_path=data/annotations/QVH/highlight_val_release.jsonl 6 | save_path=standalone_eval/val_preds_metrics.json 7 | 8 | PYTHONPATH=$PYTHONPATH:. 
python standalone_eval/eval.py \ 9 | --submission_path ${submission_path} \ 10 | --gt_path ${gt_path} \ 11 | --save_path ${save_path} 12 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /lavis/configs/datasets/laion/defaults_2B_multi.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | laion2B_multi: 8 | 9 | data_type: images 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar 14 | -------------------------------------------------------------------------------- /lavis/models/clip_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/mlfoundations/open_clip 8 | """ 9 | 10 | """ OpenAI pretrained model functions 11 | Adapted from https://github.com/mlfoundations/open_clip and https://github.com/openai/CLIP. 12 | 13 | Originally MIT License, Copyright (c) 2021 OpenAI. 
14 | """ 15 | -------------------------------------------------------------------------------- /lavis/configs/models/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /lavis/configs/models/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /run_scripts/mr_BLIP/eval/anet.sh: -------------------------------------------------------------------------------- 1 | # Should return: 2 | # {"agg_metrics": 32.647, "r1": {"0.5": 53.79, "0.55": 49.43, "0.6": 44.78, "0.65": 40.21, "0.7": 35.47, "0.75": 30.73, "0.8": 25.94, "0.85": 20.9, "0.9": 15.57, "0.95": 9.65}, "mAP": {"0.5": 53.79, "0.55": 49.43, "0.6": 44.78, "0.65": 40.21, "0.7": 35.47, "0.75": 30.73, "0.8": 25.95, "0.85": 20.9, "0.9": 15.57, "0.95": 9.65, "average": 32.65}, "mIoU": 0.515230127209404, "invalid_predictions": 0.0, "total": 17032} 3 | 4 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/mr_BLIP/eval/anet.yaml -------------------------------------------------------------------------------- /lavis/configs/models/med_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 1024, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /lavis/configs/datasets/imagenet/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | imagenet: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | splits: ["val"] 14 | images: 15 | storage: /export/share/datasets/vision/imagenet 16 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_pretrain_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | # vit encoder 10 | vit_type: "large" 11 | vit_grad_ckpt: True 12 | vit_ckpt_layer: 5 13 | 14 | image_size: 224 15 | 16 | # bert config 17 | med_config_path: "configs/models/med_large_config.json" 18 | 19 | embed_dim: 256 20 | 21 | # generation configs 22 | prompt: "a picture of " 23 | -------------------------------------------------------------------------------- /run_scripts/mr_BLIP/eval/charades.sh: -------------------------------------------------------------------------------- 1 | # Should return: 2 | # {'agg_metrics': 41.40999999999999, 'r1': {'0.5': 69.31, '0.55': 65.13, '0.6': 59.48, '0.65': 55.0, '0.7': 49.29, '0.75': 41.68, '0.8': 32.9, '0.85': 23.51, '0.9': 12.46, '0.95': 5.34}, 'mAP': {'0.5': 66.96, '0.55': 62.53, '0.6': 57.18, '0.65': 52.04, '0.7': 46.43, '0.75': 39.46, '0.8': 30.58, '0.85': 20.9, '0.9': 10.31, '0.95': 4.2, 'average': 39.06}, 'mIoU': 0.5863397571818805, 'invalid_predictions': 0.0, 'total': 3720} 3 | 4 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/mr_BLIP/eval/charades.yaml -------------------------------------------------------------------------------- /lavis/tasks/image_text_pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.tasks.base_task import BaseTask 10 | 11 | 12 | @registry.register_task("image_text_pretrain") 13 | class ImageTextPretrainTask(BaseTask): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def evaluation(self, model, data_loader, cuda_enabled=True): 18 | pass 19 | -------------------------------------------------------------------------------- /run_scripts/mr_BLIP/eval/qvh.sh: -------------------------------------------------------------------------------- 1 | # Should return: 2 | # {'agg_metrics': 57.55899999999999, 'r1': {'0.5': 76.16, '0.55': 72.1, '0.6': 69.2, '0.65': 66.24, '0.7': 62.63, '0.75': 59.73, '0.8': 54.64, '0.85': 49.29, '0.9': 38.92, '0.95': 26.68}, 'mAP': {'0.5': 68.5, '0.55': 65.19, '0.6': 62.91, '0.65': 60.43, '0.7': 57.48, '0.75': 55.06, '0.8': 50.79, '0.85': 45.96, '0.9': 36.4, '0.95': 24.94, 'average': 52.77}, 'mIoU': 0.703218087517246, 'invalid_predictions': 0.014175257731958763, 'total': 1552} 3 | 4 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/mr_BLIP/eval/qvh.yaml -------------------------------------------------------------------------------- /lavis/configs/models/med_config_albef.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "fusion_layer": 6 22 | } -------------------------------------------------------------------------------- /lavis/configs/models/bert_config_alpro.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": true, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522, 20 | "encoder_width": 768, 21 | "add_cross_attention": false, 22 | "fusion_layer": 6 23 | } -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_classification_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_classification 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | 10 | use_distill: True 11 | momentum: 0.995 12 | alpha: 0.4 13 | 14 | # vit encoder 15 | vit_type: "base" 16 | vit_grad_ckpt: False 17 | vit_ckpt_layer: 0 18 | 19 | image_size: 384 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | -------------------------------------------------------------------------------- /lavis/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /lavis/configs/datasets/vg/defaults_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json 16 | storage: vg/annotations/vg_qa.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /lavis/configs/datasets/vg/defaults_caption.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json 16 | storage: vg/annotations/vg_caption.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /lavis/models/timesformer/linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | """ Linear layer (alternate definition) 9 | """ 10 | import torch 11 | import torch.nn.functional as F 12 | from torch import nn as nn 13 | 14 | 15 | class Linear(nn.Linear): 16 | def forward(self, input: torch.Tensor) -> torch.Tensor: 17 | if torch.jit.is_scripting(): 18 | bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None 19 | return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias) 20 | else: 21 | return F.linear(input, self.weight, self.bias) 22 | -------------------------------------------------------------------------------- /lavis/configs/datasets/conceptual_caption/defaults_3m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_3m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc3m.json 17 | storage: 18 | - conceptual_caption/annotations/cc3m.json 19 | images: 20 | storage: conceptual_caption/images 21 | -------------------------------------------------------------------------------- /lavis/configs/datasets/how2qa/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | how2qa: # name of the dataset builder 3 | # data_dir: ${env.data_dir}/datasets 4 | data_type: videos # [images|videos|features] 5 | build_info: 6 | # Be careful not to append minus sign (-) before split to avoid itemizing 7 | annotations: 8 | train: 9 | url: /nas-ssd/shoubin/datasets/how2qa/train.json 10 | storage: /nas-ssd/shoubin/datasets/how2qa/train.json 11 | val: 12 | url: /nas-ssd/shoubin/datasets/how2qa/val.json 13 | storage: /nas-ssd/shoubin/datasets/how2qa/val.json 14 | test: 15 | url: /nas-ssd/shoubin/datasets/how2qa/val.json 16 | storage: /nas-ssd/shoubin/datasets/how2qa/val.json 17 | videos: 18 | storage: /nas-hdd/shoubin/how2qa/clips/ -------------------------------------------------------------------------------- /lavis/configs/datasets/conceptual_caption/defaults_12m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_12m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc12m.json 17 | storage: 18 | - conceptual_caption/annotations/cc12m.json 19 | images: 20 | storage: conceptual_caption/images_12m 21 | -------------------------------------------------------------------------------- /lavis/configs/datasets/star/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | star: # name of the dataset builder 3 | # data_dir: ${env.data_dir}/datasets 4 | data_type: videos # [images|videos|features] 5 | build_info: 6 | # Be careful not to append minus sign (-) before split to avoid itemizing 7 | annotations: 8 | train: 9 | url: /nas-ssd/shoubin/datasets/star/train.json 10 | storage: /nas-ssd/shoubin/datasets/star/train.json 11 | val: 12 | url: /nas-ssd/shoubin/datasets/star/val.json 13 | storage: /nas-ssd/shoubin/datasets/star/val.json 14 | test: 15 | url: /nas-ssd/shoubin/datasets/star/val.json 16 | storage: /nas-ssd/shoubin/datasets/star/val.json 17 | videos: 18 | storage: /nas-hdd/shoubin/videos/charades/Charades_v1_480/ -------------------------------------------------------------------------------- /lavis/configs/datasets/vlep/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | vlep: # name of the dataset builder 3 | # data_dir: ${env.data_dir}/datasets 4 | data_type: videos # [images|videos|features] 5 | build_info: 6 | # Be careful not to append minus sign (-) before split to avoid itemizing 7 | annotations: 8 | train: 9 | url: /nas-ssd/shoubin/datasets/vlep/train.json 10 | storage: /nas-ssd/shoubin/datasets/vlep/train.json 11 | val: 12 | url: /nas-ssd/shoubin/datasets/vlep/val.json 13 | storage: /nas-ssd/shoubin/datasets/vlep/val.json 14 | test: 15 | url: /nas-ssd/shoubin/datasets/vlep/val.json 16 | storage: /nas-ssd/shoubin/datasets/vlep/val.json 17 | videos: 18 | storage: /nas-hdd/shoubin/videos/charades/Charades_v1_480/ -------------------------------------------------------------------------------- /lavis/configs/datasets/tvqa/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | tvqa: # name of the dataset builder 3 | # data_dir: ${env.data_dir}/datasets 4 | data_type: videos # [images|videos|features] 5 | build_info: 6 | # Be careful not to append minus sign (-) before split to avoid itemizing 7 | annotations: 8 | train: 9 | url: /nas-ssd/shoubin/datasets/tvqa/train.json 10 | storage: /nas-ssd/shoubin/datasets/tvqa/train.json 11 | val: 12 | url: /nas-ssd/shoubin/datasets/tvqa/val.json 13 | storage: /nas-ssd/shoubin/datasets/tvqa/val.json 14 | test: 15 | url: /nas-ssd/shoubin/datasets/tvqa/val.json 16 | storage: /nas-ssd/shoubin/datasets/tvqa/val.json 17 | videos: 18 | storage: /nas-hdd/shoubin/videos/tvqa/videos_3fps_with_audio/ -------------------------------------------------------------------------------- /lavis/datasets/builders/dialogue_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 
3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 10 | from lavis.datasets.datasets.avsd_dialogue_datasets import ( 11 | AVSDDialDataset, 12 | AVSDDialEvalDataset, 13 | ) 14 | 15 | 16 | @registry.register_builder("avsd_dialogue") 17 | class AVSDDialBuilder(BaseDatasetBuilder): 18 | train_dataset_cls = AVSDDialDataset 19 | eval_dataset_cls = AVSDDialEvalDataset 20 | 21 | DATASET_CONFIG_DICT = {"default": "configs/datasets/avsd/defaults_dial.yaml"} 22 | -------------------------------------------------------------------------------- /lavis/datasets/datasets/multimodal_classification_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from abc import abstractmethod 9 | from lavis.datasets.datasets.base_dataset import BaseDataset 10 | 11 | 12 | class MultimodalClassificationDataset(BaseDataset): 13 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 14 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 15 | 16 | self.class_labels = None 17 | 18 | @abstractmethod 19 | def _build_class_labels(self): 20 | pass 21 | 22 | @abstractmethod 23 | def _load_auxiliary_mappings(self): 24 | pass 25 | 26 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_feature_extractor_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | vit_grad_ckpt: False 13 | vit_ckpt_layer: 0 14 | 15 | image_size: 224 16 | 17 | # bert config 18 | med_config_path: "configs/models/med_config.json" 19 | 20 | embed_dim: 256 21 | 22 | preprocess: 23 | vis_processor: 24 | eval: 25 | name: "blip_image_eval" 26 | image_size: 224 27 | text_processor: 28 | eval: 29 | name: "blip_caption" 30 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msrvttmc/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | msrvttmc: # name of the dataset builder 3 | # data_dir: ${env.data_dir}/datasets 4 | data_type: videos # [images|videos|features] 5 | build_info: 6 | # Be careful not to append minus sign (-) before split to avoid itemizing 7 | # no training data for this dataset 8 | annotations: 9 | train: 10 | url: /nas-ssd/shoubin/datasets/msrvttmc/val.json 11 | storage: /nas-ssd/shoubin/datasets/msrvttmc/val.json 12 | val: 13 | url: /nas-ssd/shoubin/datasets/msrvttmc/val.json 14 | storage: /nas-ssd/shoubin/datasets/msrvttmc/val.json 15 | test: 16 | url: /nas-ssd/shoubin/datasets/msrvttmc/val.json 17 | storage: /nas-ssd/shoubin/datasets/msrvttmc/val.json 18 | videos: 19 | storage: /nas-hdd/tarbucket/terran/data/msrvtt/videos/all/ -------------------------------------------------------------------------------- /lavis/configs/models/albef_feature_extractor.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | image_size: 224 13 | vit_ckpt_layer: 0 14 | vit_drop_path_rate: 0 15 | vit_layer_norm_epsilon: 1e-6 16 | vit_grad_ckpt: False 17 | 18 | # bert config 19 | med_config_path: "configs/models/med_config_albef.json" 20 | 21 | embed_dim: 256 22 | 23 | preprocess: 24 | vis_processor: 25 | eval: 26 | name: "blip_image_eval" 27 | image_size: 224 28 | text_processor: 29 | eval: 30 | name: "blip_caption" 31 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_itm_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /lavis/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): # overlay an attention (Grad-CAM) map on an image 8 | attMap -= attMap.min() # shift values so the minimum is 0 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() # normalize to [0, 1]; skipped if the map is all zeros 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") # resize the map to the image resolution 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) # smooth with a blur proportional to image size 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() # re-normalize after blurring 16 | cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) # colorize attention values with the jet colormap 18 | attMapV = np.delete(attMapV, 3, 2) # drop the alpha channel 19 | if overlap: # alpha-blend the heatmap with the original image 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_itm_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "large" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /lavis/configs/datasets/nextqa/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nextqa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/train.json 15 | storage: Your/path/to/train.json 16 | val: 17 | url: Your/path/to/val.json 18 | storage: Your/path/to/val.json 19 | test: 20 | url: Your/path/to/val.json 21 | storage: Your/path/to/val.json 22 | videos: 23 | storage: Your/path/to/raw/NExT 24 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. LAVIS documentation master file, created by 2 | sphinx-quickstart on Sun Jul 31 10:32:27 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to LAVIS's documentation! 7 | ================================= 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | :caption: Introduction 12 | 13 | intro 14 | 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: Getting Started 19 | 20 | getting_started 21 | 22 | 23 | .. :maxdepth: 1 24 | .. :caption: Advanced Training 25 | 26 | .. advanced_training 27 | 28 | 29 | .. toctree:: 30 | :maxdepth: 2 31 | :caption: Advanced Usage 32 | 33 | benchmark 34 | tutorial 35 | 36 | 37 | .. Documentations 38 | .. =================== 39 | 40 | 41 | Indices and tables 42 | ================== 43 | 44 | * :ref:`genindex` 45 | * :ref:`modindex` 46 | * :ref:`search` 47 | -------------------------------------------------------------------------------- /lavis/configs/datasets/mixed/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | mixed: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/train.json 15 | storage: Your/path/to/train.json 16 | val: 17 | url: Your/path/to/val.json 18 | storage: Your/path/to/val.json 19 | test: 20 | url: Your/path/to/test.json 21 | storage: Your/path/to/test.json 22 | videos: 23 | storage: Your/path/to/raw/Mixed_Charades_QVH -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. 
Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /lavis/configs/datasets/tacos/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | tacos: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/train.json 15 | storage: Your/path/to/train.json 16 | val: 17 | url: Your/path/to/val_float.json 18 | storage: Your/path/to/val_float.json 19 | test: 20 | url: Your/path/to/test.json 21 | storage: Your/path/to/test.json 22 | videos: 23 | storage: Your/path/to/raw/TACoS/res_224 -------------------------------------------------------------------------------- /lavis/projects/blip/eval/nlvr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | 10 | datasets: 11 | nlvr: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: multimodal_classification 22 | 23 | batch_size_train: 16 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | seed: 42 28 | output_dir: "output/BLIP/NLVR" 29 | 30 | evaluate: True 31 | test_splits: ["val", "test"] 32 | 33 | # distribution-specific 34 | device: "cuda" 35 | world_size: 1 36 | dist_url: "env://" 37 | distributed: True 38 | -------------------------------------------------------------------------------- /lavis/projects/clip/exp_imnet_zs_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | imagenet: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "clip_image_eval" 16 | # image_size: 224 17 | image_size: 336 18 | 19 | run: 20 | task: multimodal_classification 21 | 22 | # dataloading 23 | num_workers: 4 24 | batch_size_train: 32 25 | batch_size_eval: 128 26 | 27 | test_splits: ["val"] 28 | 29 | # distribution 30 | device: "cuda" 31 | world_size: 1 32 | dist_url: "env://" 33 | distributed: True 34 | 35 | # misc 36 | seed: 42 37 | output_dir: "output/clip/zs_imnet" 38 | 39 | evaluate: True 40 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/nlvr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | model_type: nlvr 9 | 10 | datasets: 11 | nlvr: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: multimodal_classification 22 | 23 | batch_size_train: 16 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | seed: 42 28 | output_dir: "output/ALBEF/NLVR" 29 | 30 | evaluate: True 31 | test_splits: ["val", "test"] 32 | 33 | # distribution-specific 34 | device: "cuda" 35 | world_size: 1 36 | dist_url: "env://" 37 | distributed: True 38 | -------------------------------------------------------------------------------- /lavis/configs/datasets/nextgqa/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nextgqa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/train.json 15 | storage: Your/path/to/train.json 16 | val: 17 | url: Your/path/to/nextgqa/test.json 18 | storage: Your/path/to/nextgqa/test.json 19 | test: 20 | url: Your/path/to/nextgqa/test.json 21 | storage: Your/path/to/nextgqa/test.json 22 | videos: 23 | storage: Your/path/to/raw/NExT 24 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/snli_ve_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | model_type: ve 9 | 10 | datasets: 11 | snli_ve: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | task: multimodal_classification 21 | # optimization-specific 22 | batch_size_train: 32 23 | batch_size_eval: 64 24 | num_workers: 4 25 | 26 | seed: 42 27 | output_dir: "output/ALBEF/SNLI_VE" 28 | 29 | evaluate: True 30 | test_splits: ["val", "test"] 31 | 32 | # distribution-specific 33 | device: "cuda" 34 | world_size: 1 35 | dist_url: "env://" 36 | distributed: True 37 | -------------------------------------------------------------------------------- /lavis/configs/datasets/anet/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | anet: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/train.json 15 | storage: Your/path/to/train.json 16 | val: 17 | url: Your/path/to/val_float.json 18 | storage: Your/path/to/val_float.json 19 | test: 20 | url: Your/path/to/test_float.json 21 | storage: Your/path/to/test_float.json 22 | videos: 23 | storage: Your/path/to/data/raw/ANet/Anet_videos_15fps_short256 -------------------------------------------------------------------------------- /lavis/configs/datasets/sbu_caption/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | sbu_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json 17 | # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json 18 | storage: 19 | - sbu_captions/annotations/sbu.json 20 | images: 21 | storage: sbu_captions/images 22 | # storage: /export/share/datasets/vision_language/sbu_resize 23 | -------------------------------------------------------------------------------- /lavis/configs/datasets/qvh/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | qvh: # name of the dataset builder 8 | data_type: videos # [images|videos|features] 9 | build_info: 10 | # Be careful not to append minus sign (-) before split to avoid itemizing 11 | annotations: 12 | train: 13 | url: Your/path/to/train.json 14 | storage: Your/path/to/train.json 15 | val: 16 | url: Your/path/to/val.json 17 | storage: Your/path/to/val.json 18 | test: 19 | # url: Your/path/to/test_dummy.json 20 | # storage: Your/path/to/test_dummy.json 21 | url: Your/path/to/val.json 22 | storage: Your/path/to/val.json 23 | videos: 24 | storage: Your/path/to/data/raw/QVHighlights 25 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 224 18 | alpha: 0.4 19 | 20 | # bert config 21 | med_config_path: "configs/models/bert_config.json" 22 | 23 | embed_dim: 256 24 | 25 | # generation configs 26 | prompt: "a picture of " 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /lavis/configs/datasets/tacos/relative_integer.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | tacos-relative_integer: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/train_relative.json 15 | storage: Your/path/to/train_relative.json 16 | val: 17 | url: Your/path/to/val_float.json 18 | storage: Your/path/to/val_float.json 19 | test: 20 | url: Your/path/to/test_float.json 21 | storage: Your/path/to/test_float.json 22 | videos: 23 | storage: Your/path/to/raw/TACoS/res_224 -------------------------------------------------------------------------------- /lavis/configs/datasets/charades_sta/seconds_decimal.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | charades_sta-seconds_decimal: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/new_train_float.json 15 | storage: Your/path/to/new_train_float.json 16 | val: 17 | url: Your/path/to/new_val_float.json 18 | storage: Your/path/to/new_val_float.json 19 | test: 20 | url: Your/path/to/test.json 21 | storage: Your/path/to/test.json 22 | videos: 23 | storage: Your/path/to/raw/Charades -------------------------------------------------------------------------------- /lavis/configs/datasets/qvhQ/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | qvhQ: # name of the dataset builder 8 | data_type: videos # [images|videos|features] 9 | build_info: 10 | # Be careful not to append minus sign (-) before split to avoid itemizing 11 | annotations: 12 | train: 13 | url: Your/path/to/train_mcqa.json 14 | storage: Your/path/to/train_mcqa.json 15 | val: 16 | url: Your/path/to/val.json 17 | storage: Your/path/to/val.json 18 | test: 19 | # url: Your/path/to/test_dummy.json 20 | # storage: Your/path/to/test_dummy.json 21 | url: Your/path/to/val.json 22 | storage: Your/path/to/val.json 23 | videos: 24 | storage: Your/path/to/data/raw/QVHighlights 25 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/caption_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: base_coco 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | # task: retrieval 21 | task: captioning 22 | # optimizer 23 | batch_size_train: 32 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | max_len: 20 28 | min_len: 5 29 | num_beams: 3 30 | 31 | seed: 42 32 | output_dir: "output/BLIP/Caption_coco" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/configs/datasets/charades_sta/relative_integer.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | charades_sta-relative_integer: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/new_train_relative.json 15 | storage: Your/path/to/new_train_relative.json 16 | val: 17 | url: Your/path/to/new_val_float.json 18 | storage: Your/path/to/new_val_float.json 19 | test: 20 | url: Your/path/to/test.json 21 | storage: Your/path/to/test.json 22 | videos: 23 | storage: Your/path/to/raw/Charades -------------------------------------------------------------------------------- /lavis/projects/blip/eval/caption_coco_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: large_coco 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | # task: retrieval 21 | task: captioning 22 | # optimizer 23 | batch_size_train: 32 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | max_len: 20 28 | min_len: 5 29 | num_beams: 3 30 | 31 | seed: 42 32 | output_dir: "output/BLIP/Caption_coco" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/configs/models/gpt_dialogue_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 10 | 11 | len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens 12 | 13 | len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "gpt_video_ft" 19 | eval: 20 | name: "gpt_video_ft" 21 | text_processor: 22 | train: 23 | name: "gpt_dialogue" 24 | eval: 25 | name: "gpt_dialogue" -------------------------------------------------------------------------------- /lavis/configs/datasets/charades_sta/relative_decimal.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | charades_sta-relative_decimal: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/new_train_relative_float.json 15 | storage: Your/path/to/new_train_relative_float.json 16 | val: 17 | url: Your/path/to/new_val_float.json 18 | storage: Your/path/to/new_val_float.json 19 | test: 20 | url: Your/path/to/test.json 21 | storage: Your/path/to/test.json 22 | videos: 23 | storage: Your/path/to/raw/Charades -------------------------------------------------------------------------------- /lavis/configs/datasets/nocaps/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nocaps: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json 16 | storage: nocaps/annotations/nocaps_val.json 17 | test: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json 19 | storage: nocaps/annotations/nocaps_test.json 20 | images: 21 | storage: nocaps/images 22 | # storage: /export/share/datasets/vision/nocaps/ 23 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 224 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msrvtt_qa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | model_type: msrvtt 9 | 10 | datasets: 11 | msrvtt_qa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 16 16 | image_size: 224 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: multimodal_classification 23 | # optimization-specific 24 | batch_size_train: 32 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | seed: 42 29 | output_dir: "output/ALPRO/msrvtt_qa" 30 | 31 | evaluate: True 32 | valid_splits: ["val"] 33 | test_splits: ["test"] 34 | 35 | # distribution-specific 36 | device: "cuda" 37 | world_size: 1 38 | dist_url: "env://" 39 | distributed: True 40 | -------------------------------------------------------------------------------- /lavis/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.registry import registry 14 | 15 | from lavis.datasets.builders import * 16 | from lavis.models import * 17 | from lavis.processors import * 18 | from lavis.tasks import * 19 | 20 | 21 | root_dir = os.path.dirname(os.path.abspath(__file__)) 22 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 23 | 24 | registry.register_path("library_root", root_dir) 25 | repo_root = os.path.join(root_dir, "..") 26 | registry.register_path("repo_root", repo_root) 27 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 28 | registry.register_path("cache_root", cache_root) 29 | 30 | registry.register("MAX_INT", sys.maxsize) 31 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 32 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | image_size: 224 15 | vit_ckpt_layer: 0 16 | vit_drop_path_rate: 0 17 | vit_layer_norm_epsilon: 1e-6 18 | vit_grad_ckpt: False 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config_albef.json" 22 | mlm_mask_prob: 0.15 23 | 24 | embed_dim: 256 25 | momentum: 0.995 26 | alpha: 0.4 27 | temp: 0.07 28 | 29 | max_txt_len: 30 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 256 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msvd_qa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | model_type: msvd 9 | 10 | datasets: 11 | msvd_qa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 16 16 | image_size: 224 17 | text_processor: 18 | train: 19 | name: "blip_caption" 20 | eval: 21 | name: "blip_caption" 22 | 23 | run: 24 | task: multimodal_classification 25 | # optimization-specific 26 | batch_size_train: 24 27 | batch_size_eval: 64 28 | num_workers: 4 29 | 30 | seed: 42 31 | output_dir: "output/ALPRO/msvd_qa" 32 | 33 | evaluate: True 34 | test_splits: ["test"] 35 | 36 | # distribution-specific 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/configs/datasets/nextqa/defaults_qa_old.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nextqa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: /nas-ssd/shoubin/datasets/nextqa/train.json 15 | storage: /nas-ssd/shoubin/datasets/nextqa/train.json 16 | val: 17 | url: /nas-ssd/shoubin/datasets/nextqa/val.json 18 | storage: /nas-ssd/shoubin/datasets/nextqa/val.json 19 | test: 20 | url: /nas-ssd/shoubin/datasets/nextqa/val.json 21 | storage: /nas-ssd/shoubin/datasets/nextqa/val.json 22 | videos: 23 | storage: /nas-hdd/shoubin/videos/vidor/videos/ 24 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/ret_flickr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | model_type: flickr 9 | 10 | datasets: 11 | flickr30k: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/Retrieval_Flickr30k" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | model_type: coco 9 | 10 | datasets: 11 | coco_retrieval: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/ALBEF/Retrieval_COCO" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/configs/datasets/flickr30k/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | flickr30k: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images 10 | 11 | build_info: 12 | annotations: 13 | train: 14 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json 15 | storage: flickr30k/annotations/train.json 16 | val: 17 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json 18 | storage: flickr30k/annotations/val.json 19 | test: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json 21 | storage: flickr30k/annotations/test.json 22 | images: 23 | storage: flickr30k/images 24 | # storage: /export/share/datasets/vision/flickr30k 25 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/ret_flickr30k_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | model_type: flickr 9 | 10 | datasets: 11 | flickr30k: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/ALBEF/Retrieval_Flickr30k" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/ret_flickr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2 8 | model_type: coco 9 | use_grad_checkpoint: False 10 | 11 | datasets: 12 | flickr30k: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 364 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: retrieval 23 | 24 | # dataloading 25 | num_workers: 4 26 | batch_size_train: 16 27 | batch_size_eval: 32 28 | 29 | test_splits: ["test"] 30 | 31 | # distribution 32 | device: "cuda" 33 | world_size: 1 34 | dist_url: "env://" 35 | distributed: True 36 | use_dist_eval_sampler: False 37 | 38 | # model specific 39 | k_test: 128 40 | 41 | # misc 42 | seed: 42 43 | output_dir: "output/BLIP2/Retrieval_Flickr30k" 44 | 45 | evaluate: True -------------------------------------------------------------------------------- /lavis/configs/models/alpro_retrieval_didemo.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | eval: 30 | name: "alpro_video_eval" 31 | n_frms: 8 32 | image_size: 224 33 | text_processor: 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msrvtt_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | model_type: msrvtt 9 | 10 | datasets: 11 | msrvtt_retrieval: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 8 16 | image_size: 224 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: retrieval 23 | # optimization-specific 24 | batch_size_train: 24 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | # k_test: 256 29 | k_test: 1000 30 | 31 | seed: 42 32 | output_dir: "output/ALPRO/msrvtt_retrieval" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | # distribution-specific 38 | device: "cuda" 39 | world_size: 1 40 | dist_url: "env://" 41 | distributed: True 42 | use_dist_eval_sampler: False 43 | -------------------------------------------------------------------------------- /lavis/projects/gpt/eval/dialogue_avsd_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | model_type: base 9 | 10 | datasets: 11 | avsd_dialogue: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "gpt_video_ft" 15 | visual_ft: ["i3d_flow", "i3d_rgb"] 16 | audio_ft: ["vggish"] 17 | text_processor: 18 | eval: 19 | name: "gpt_dialogue" 20 | max_turns: 3 21 | use_caption: True 22 | 23 | run: 24 | task: dialogue 25 | # optimizer 26 | batch_size_train: 16 27 | batch_size_eval: 16 28 | num_workers: 0 29 | 30 | max_len: 20 31 | min_len: 5 32 | num_beams: 5 33 | 34 | seed: 42 35 | output_dir: "output/gpt2/dialogue_avsd" 36 | 37 | evaluate: True 38 | valid_splits: ["test"] 39 | 40 | device: "cuda" 41 | world_size: 1 42 | dist_url: "env://" 43 | distributed: True 44 | -------------------------------------------------------------------------------- /lavis/datasets/builders/temporal_action_localization_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.common.utils import get_cache_path 10 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 11 | from lavis.datasets.datasets.temporal_action_localization_dataset import ( 12 | TemporalActionLocalizationDataset, 13 | ) 14 | 15 | 16 | class TemporalActionLocalizationBuilder(BaseDatasetBuilder): 17 | train_dataset_cls = TemporalActionLocalizationDataset 18 | eval_dataset_cls = TemporalActionLocalizationDataset 19 | 20 | def build(self): 21 | datasets = super().build() 22 | 23 | return datasets 24 | 25 | 26 | @registry.register_builder("anet_TAL") 27 | class ANetTALBuilder(TemporalActionLocalizationBuilder): 28 | DATASET_CONFIG_DICT = { 29 | "default": "configs/datasets/anet_TAL/defaults.yaml", 30 | } 31 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/okvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: okvqa 9 | image_size: 480 10 | 11 | datasets: 12 | ok_vqa: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 480 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa 23 | # optimization-specific 24 | batch_size_train: 16 25 | batch_size_eval: 16 26 | num_workers: 4 27 | 28 | # inference-specific 29 | max_len: 10 30 | min_len: 1 31 | num_beams: 3 32 | num_ans_candidates: 128 33 | inference_method: "rank" 34 | 35 | seed: 42 36 | output_dir: "output/BLIP/OKVQA" 37 | 38 | evaluate: True 39 | test_splits: ["test"] 40 | 41 | # distribution-specific 42 | device: "cuda" 43 | world_size: 1 44 | dist_url: "env://" 45 | distributed: True 46 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: coco 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: True 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 364 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 364 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /lavis/configs/datasets/nlvr/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nlvr: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json 16 | storage: nlvr/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 19 | storage: nlvr/annotations/dev.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 22 | storage: nlvr/annotations/test.json 23 | images: 24 | storage: /export/share/datasets/vision/NLVR2/ 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/snli_ve/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | snli_ve: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json 16 | storage: snli/annotations/ve_train.json 17 | val: 18 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json 19 | storage: snli/annotations/ve_dev.json 20 | test: 21 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json 22 | storage: snli/annotations/ve_test.json 23 | images: 24 | storage: flickr30k/images/flickr30k-images 25 | # storage: /export/share/datasets/vision/flickr30k/flickr30k-images 26 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/aokvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: aokvqa 9 | image_size: 480 10 | 11 | datasets: 12 | aok_vqa: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 480 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: aok_vqa 23 | # optimization-specific 24 | batch_size_train: 64 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | # inference-specific 29 | max_len: 10 30 | min_len: 1 31 | num_beams: 3 32 | num_ans_candidates: 128 33 | inference_method: "rank" 34 | 35 | seed: 42 36 | output_dir: "output/BLIP/AOKVQA" 37 | 38 | evaluate: True 39 | test_splits: ["val", "test"] 40 | 41 | # distribution-specific 42 | device: "cuda" 43 | world_size: 1 44 | dist_url: "env://" 45 | distributed: True 46 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/vqav2_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: vqav2 9 | image_size: 480 10 | 11 | datasets: 12 | coco_vqa: # name of the dataset builder 13 | type: eval 14 | vis_processor: 15 | eval: 16 | name: "blip_image_eval" 17 | image_size: 480 18 | text_processor: 19 | eval: 20 | name: "blip_question" 21 | 22 | run: 23 | task: vqa 24 | # optimization-specific 25 | batch_size_train: 16 26 | batch_size_eval: 64 27 | num_workers: 4 28 | 29 | # inference-specific 30 | max_len: 10 31 | min_len: 1 32 | num_beams: 3 33 | num_ans_candidates: 128 34 | inference_method: "rank" 35 | 36 | seed: 42 37 | output_dir: "output/BLIP/VQA" 38 | 39 | evaluate: True 40 | test_splits: ["val"] 41 | 42 | # distribution-specific 43 | device: "cuda" 44 | world_size: 1 45 | dist_url: "env://" 46 | distributed: True 47 | -------------------------------------------------------------------------------- /lavis/datasets/builders/classification_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 10 | from lavis.datasets.datasets.nlvr_datasets import NLVRDataset, NLVREvalDataset 11 | from lavis.datasets.datasets.snli_ve_datasets import SNLIVisualEntialmentDataset 12 | 13 | 14 | @registry.register_builder("nlvr") 15 | class NLVRBuilder(BaseDatasetBuilder): 16 | train_dataset_cls = NLVRDataset 17 | eval_dataset_cls = NLVREvalDataset 18 | 19 | DATASET_CONFIG_DICT = {"default": "configs/datasets/nlvr/defaults.yaml"} 20 | 21 | 22 | @registry.register_builder("snli_ve") 23 | class SNLIVisualEntailmentBuilder(BaseDatasetBuilder): 24 | train_dataset_cls = SNLIVisualEntialmentDataset 25 | eval_dataset_cls = SNLIVisualEntialmentDataset 26 | 27 | DATASET_CONFIG_DICT = {"default": "configs/datasets/snli_ve/defaults.yaml"} 28 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ffmpeg # ==1.4 2 | ffmpeg-python 3 | future==0.18.2 4 | glob2==0.7 5 | imageio==2.9.0 6 | matplotlib==3.4.2 7 | moviepy==1.0.3 8 | numpy # ==1.17.4 9 | # pandas==1.3.1 10 | pickleshare==0.7.5 11 | # Pillow==7.0.0 12 | protobuf==3.15.6 13 | python-dateutil==2.8.2 14 | pytube==15.0.0 15 | PyYAML==5.2 16 | scikit-learn==0.24.2 17 | scikit-video==1.1.11 18 | scipy==1.7.1 19 | six==1.12.0 20 | tabulate==0.9.0 21 | # tensorboard==2.5.0 22 | tensorboard==2.11 23 | tensorboardX==2.1 24 | # torch==1.13.1 25 | # torch==1.13.1+cu117 26 | # torchtext==0.14.1 27 | # tqdm # ==4.36.1 28 | tzdata==2023.3 29 | 30 | contexttimer 31 | decord 32 | einops>=0.4.1 33 | fairscale==0.4.4 34 | ftfy 35 | iopath 36 | ipython 37 | omegaconf 38 | opencv-python-headless==4.5.5.64 39 | opendatasets 40 | packaging 41 | pandas 42 | plotly 43 | pre-commit 44 | pycocoevalcap 45 | pycocotools 46 | python-magic 47 | scikit-image 48 | sentencepiece 49 | spacy 50 | # streamlit 51 | timm==0.4.12 52 | # torchvision 53 | tqdm 54 | # transformers>=4.25.0 55 | # transformers==4.25.1 56 | transformers==4.46.1 57 | wheel 
58 | 59 | peft==0.13.0 60 | wandb==0.18.3 61 | 62 | av 63 | webdataset==0.2.100 -------------------------------------------------------------------------------- /lavis/configs/datasets/msvd/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json 16 | storage: msvd/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json 19 | storage: msvd/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json 22 | storage: msvd/annotations/cap_test.json 23 | videos: 24 | storage: msvd/videos 25 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_caption_large_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth" 12 | 13 | vit_type: "large" 14 | vit_grad_ckpt: True 15 | vit_ckpt_layer: 5 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | # generation configs 23 | prompt: "a picture of " 24 | 25 | 26 | preprocess: 27 | vis_processor: 28 | train: 29 | name: "blip_image_train" 30 | eval: 31 | name: "blip_image_eval" 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | prompt: "a picture of " 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqa_okvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/vqa_val.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | datasets: 13 | coco_vqa: # name of the dataset builder 14 | type: eval 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | 23 | run: 24 | task: vqa 25 | 26 | # optimization-specific 27 | batch_size_train: 16 28 | batch_size_eval: 64 29 | num_workers: 4 30 | 31 | # inference-specific 32 | max_len: 10 33 | min_len: 1 34 | num_beams: 3 35 | num_ans_candidates: 128 36 | inference_method: "rank" 37 | 38 | seed: 42 39 | output_dir: "output/ALBEF/VQA" 40 | 41 | evaluate: True 42 | test_splits: ["val"] 43 | 44 | # distribution-specific 45 | device: "cuda" 46 | world_size: 1 47 | dist_url: "env://" 48 | distributed: True 49 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqa_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt2.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt6.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/projects/clip/exp_coco_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | coco_retrieval: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "clip_image_train" 16 | image_size: 336 17 | eval: 18 | name: "clip_image_eval" 19 | image_size: 336 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: retrieval 28 | 29 | # dataloading 30 | num_workers: 4 31 | batch_size_train: 32 32 | batch_size_eval: 128 33 | 34 | test_splits: ["test"] 35 | 36 | # distribution 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | use_dist_eval_sampler: True 42 | 43 | # misc 44 | seed: 42 45 | output_dir: "output/clip/Retrieval_COCO" 46 | 47 | evaluate: True 48 | -------------------------------------------------------------------------------- /lavis/projects/clip/exp_flickr_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | flickr30k: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "clip_image_train" 16 | image_size: 336 17 | eval: 18 | name: "clip_image_eval" 19 | image_size: 336 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: retrieval 28 | 29 | # dataloading 30 | num_workers: 4 31 | batch_size_train: 32 32 | batch_size_eval: 128 33 | 34 | test_splits: ["test"] 35 | 36 | # distribution 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | use_dist_eval_sampler: True 42 | 43 | # misc 44 | seed: 42 45 | output_dir: "output/clip/Retrieval_Flickr" 46 | 47 | evaluate: True 48 | -------------------------------------------------------------------------------- /lavis/configs/datasets/charades_sta/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | charades_sta: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/train.json 15 | storage: Your/path/to/train.json 16 | # url: Your/path/to/new_train.json 17 | # storage: Your/path/to/new_train.json 18 | val: 19 | url: Your/path/to/test_float.json 20 | storage: Your/path/to/test_float.json 21 | # url: Your/path/to/new_val_float.json 22 | # storage: Your/path/to/new_val_float.json 23 | test: 24 | url: Your/path/to/test_float.json 25 | storage: Your/path/to/test_float.json 26 | videos: 27 | storage: Your/path/to/data/raw/Charades -------------------------------------------------------------------------------- /lavis/configs/datasets/msrvtt/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json 16 | storage: msrvtt/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json 19 | storage: msrvtt/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json 22 | storage: msrvtt/annotations/cap_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xxl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xxl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | vit_grad_ckpt: True 18 | vit_ckpt_layer: 4 19 | 20 | image_size: 384 21 | 22 | # bert config 23 | med_config_path: "configs/models/med_config.json" 24 | 25 | embed_dim: 256 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/vqa_test.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | 13 | datasets: 14 | coco_vqa: # name of the dataset builder 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | 23 | run: 24 | task: vqa 25 | 26 | # optimization-specific 27 | batch_size_train: 16 28 | batch_size_eval: 64 29 | num_workers: 4 30 | 31 | # inference-specific 32 | max_len: 10 33 | min_len: 1 34 | num_beams: 3 35 | num_ans_candidates: 128 36 | inference_method: "rank" 37 | 38 | seed: 42 39 | output_dir: "output/ALBEF/VQA" 40 | 41 | evaluate: True 42 | train_splits: ["train"] 43 | test_splits: ["test"] 44 | 45 | # distribution-specific 46 | device: "cuda" 47 | world_size: 1 48 | dist_url: "env://" 49 | distributed: True 50 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_classification_ve.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt" 11 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 12 | 13 | num_classes: 3 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | eval: 35 | name: "blip_image_eval" 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | eval: 40 | name: "blip_caption" 41 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 13 | 14 | num_classes: 2 15 | 16 | # vit encoder 17 | vit_type: "base" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | 22 | image_size: 384 23 | 24 | # bert config 25 | med_config_path: "configs/models/med_config.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt" 12 | 13 | use_distill: True 14 | momentum: 0.995 15 | alpha: 0.4 16 | 17 | # vit encoder 18 | vit_type: "base" 19 | vit_grad_ckpt: False 20 | vit_ckpt_layer: 0 21 | vit_layer_norm_epsilon: 1e-6 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config_albef.json" 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 384 33 | eval: 34 | name: "blip_image_eval" 35 | image_size: 384 36 | text_processor: 37 | train: 38 | name: "blip_question" 39 | eval: 40 | name: "blip_question" 41 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_caption_base_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | 18 | image_size: 384 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config.json" 22 | 23 | # generation configs 24 | prompt: "a picture of " 25 | 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | eval: 32 | name: "blip_image_eval" 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | prompt: "a picture of " 37 | eval: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /lavis/configs/datasets/vatex/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json 16 | storage: vatex/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json 19 | storage: vatex/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json 22 | storage: vatex/annotations/cap_test.json 23 | videos: 24 | storage: /export/share/dongxuli/data/vatex 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/avsd/defaults_dial.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | avsd_dialogue: # name of the dataset builder 8 | dataset_card: dataset_card/avsd_dialogue.md 9 | data_type: features #extracted features of videos (I3D, VGGish) # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_train.json 16 | storage: avsd/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_val.json 19 | storage: avsd/annotations/val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_test.json 22 | storage: avsd/annotations/test.json 23 | features: 24 | storage: avsd/features/ 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msrvtt/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_train.json 16 | storage: msrvtt/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_val.json 19 | storage: msrvtt/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_test.json 22 | storage: msrvtt/annotations/retrieval_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/nocaps_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: base_coco 9 | # pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth' 10 | 11 | datasets: 12 | nocaps: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | prompt: "a picture of " 21 | 22 | run: 23 | # task: retrieval 24 | task: captioning 25 | # optimizer 26 | batch_size_train: 32 27 | batch_size_eval: 64 28 | num_workers: 4 29 | 30 | max_len: 20 31 | min_len: 5 32 | num_beams: 3 33 | 34 | seed: 42 35 | output_dir: "output/BLIP/NoCaps" 36 | 37 | evaluate: True 38 | test_splits: ["val", "test"] 39 | 40 | device: "cuda" 41 | world_size: 1 42 | dist_url: "env://" 43 | distributed: True 44 | 45 | report_metric: False 46 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt" 12 | 13 | num_classes: 2 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/datasets/datasets/vg_vqa_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | 10 | from PIL import Image 11 | 12 | from lavis.datasets.datasets.vqa_datasets import VQADataset 13 | 14 | 15 | class VGVQADataset(VQADataset): 16 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 17 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 18 | 19 | def __getitem__(self, index): 20 | ann = self.annotation[index] 21 | 22 | image_path = os.path.join(self.vis_root, ann["image"]) 23 | image = Image.open(image_path).convert("RGB") 24 | 25 | image = self.vis_processor(image) 26 | question = self.text_processor(ann["question"]) 27 | 28 | answers = [ann["answer"]] 29 | # TODO this should be configured better 30 | weights = [0.2] 31 | 32 | return { 33 | "image": image, 34 | "text_input": question, 35 | "answers": answers, 36 | "weights": weights, 37 | } 38 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_flickr_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | alpha: 0.4 15 | 16 | negative_all_rank: False 17 | 18 | # vit encoder 19 | vit_type: "base" 20 | vit_grad_ckpt: True 21 | vit_ckpt_layer: 4 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config.json" 27 | 28 | embed_dim: 256 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/alpro_retrieval_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "alpro_video_train" 31 | n_frms: 8 32 | image_size: 224 33 | eval: 34 | name: "alpro_video_eval" 35 | n_frms: 8 36 | image_size: 224 37 | text_processor: 38 | train: 39 | name: "blip_caption" 40 | eval: 41 | name: "blip_caption" 42 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/didemo_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | model_type: didemo 9 | 10 | max_txt_len: 50 11 | 12 | timesformer: 13 | n_frms: 8 14 | image_size: 224 15 | 16 | 17 | datasets: 18 | didemo_retrieval: # name of the dataset builder 19 | vis_processor: 20 | eval: 21 | name: "alpro_video_eval" 22 | n_frms: 8 23 | image_size: 224 24 | text_processor: 25 | eval: 26 | name: "blip_caption" 27 | 28 | run: 29 | task: retrieval 30 | # optimization-specific 31 | batch_size_train: 8 32 | batch_size_eval: 64 33 | num_workers: 4 34 | 35 | # k_test: 256 36 | k_test: 1000 37 | 38 | seed: 42 39 | output_dir: "output/ALPRO/didemo_retrieval" 40 | 41 | evaluate: True 42 | train_splits: ["train"] 43 | valid_splits: ["val", "test"] 44 | test_splits: ["test"] 45 | 46 | # distribution-specific 47 | device: "cuda" 48 | world_size: 1 49 | dist_url: "env://" 50 | distributed: True 51 | use_dist_eval_sampler: False 52 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | model_type: coco 9 | 10 | datasets: 11 | coco_retrieval: # name of the dataset builder 12 | vis_processor: 13 | train: 14 | name: "blip_image_train" 15 | image_size: 384 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | eval: 23 | name: "blip_caption" 24 | 25 | run: 26 | task: retrieval 27 | 28 | # dataloading 29 | num_workers: 4 30 | batch_size_train: 32 31 | batch_size_eval: 128 32 | 33 | train_splits: ["train"] 34 | valid_splits: ["val"] 35 | test_splits: ["test"] 36 | 37 | # distribution 38 | device: "cuda" 39 | world_size: 1 40 | dist_url: "env://" 41 | distributed: True 42 | use_dist_eval_sampler: False 43 | 44 | # model specific 45 | k_test: 256 46 | 47 | # misc 48 | seed: 42 49 | output_dir: "output/BLIP/Retrieval_COCO" 50 | 51 | evaluate: True 52 | -------------------------------------------------------------------------------- /lavis/configs/models/alpro_qa_msvd.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 2423 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 
22 | drop_path_rate: 0.1 23 | use_grad_ckpt: True 24 | ckpt_layer: 12 25 | 26 | # bert config 27 | med_config_path: "configs/models/bert_config_alpro.json" 28 | 29 | preprocess: 30 | vis_processor: 31 | train: 32 | name: "alpro_video_train" 33 | n_frms: 16 34 | image_size: 224 35 | eval: 36 | name: "alpro_video_eval" 37 | n_frms: 16 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt2.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt2.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt6.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt6.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_caption_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_flant5xl 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_flant5xl.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/datasets/didemo/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | didemo_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_train.json 16 | storage: didemo/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_val.json 19 | storage: didemo/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_test.json 22 | storage: didemo/annotations/retrieval_test.json 23 | videos: 24 | storage: didemo/videos 25 | # storage: /export/share/dongxuli/data/didemo_retrieval/videos 26 | -------------------------------------------------------------------------------- /lavis/configs/models/alpro_qa_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 1500 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 
22 | drop_path_rate: 0.1 23 | 24 | use_grad_ckpt: True 25 | ckpt_layer: 12 26 | 27 | # bert config 28 | med_config_path: "configs/models/bert_config_alpro.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "alpro_video_train" 34 | n_frms: 16 35 | image_size: 224 36 | eval: 37 | name: "alpro_video_eval" 38 | n_frms: 16 39 | image_size: 224 40 | text_processor: 41 | train: 42 | name: "blip_caption" 43 | eval: 44 | name: "blip_caption" 45 | -------------------------------------------------------------------------------- /lavis/configs/datasets/coco/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_retrieval: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 16 | md5: aa31ac474cf6250ebb81d18348a07ed8 17 | storage: coco/annotations/coco_karpathy_train.json 18 | val: 19 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 20 | md5: b273847456ef5580e33713b1f7de52a0 21 | storage: coco/annotations/coco_karpathy_val.json 22 | test: 23 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 24 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 25 | storage: coco/annotations/coco_karpathy_test.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_coco_retrieval_lavis.pt" 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_flickr_retrieval_lavis.pt 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /lavis/configs/datasets/gqa/balanced_val.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | gqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json 17 | storage: 18 | - gqa/annotations/train_balanced_questions.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json 22 | storage: 23 | - gqa/annotations/val_balanced_questions.json 24 | test: 25 | url: 26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json 27 | storage: 28 | - gqa/annotations/test_balanced_questions.json 29 | images: 30 | storage: gqa/images/ 31 | -------------------------------------------------------------------------------- /lavis/configs/datasets/gqa/balanced_testdev.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | gqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json 17 | storage: 18 | - gqa/annotations/train_balanced_questions.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json 22 | storage: 23 | - gqa/annotations/testdev_balanced_questions.json 24 | test: 25 | url: 26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json 27 | storage: 28 | - gqa/annotations/test_balanced_questions.json 29 | images: 30 | storage: gqa/images/ 31 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/caption_coco_opt2.7b_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Bleu_1: 0.832 7 | # Bleu_2: 0.691 8 | # Bleu_3: 0.556 9 | # Bleu_4: 0.438 10 | # METEOR: 0.317 11 | # ROUGE_L: 0.620 12 | # CIDEr: 1.461 13 | # SPICE: 0.252 14 | 15 | model: 16 | arch: blip2_opt 17 | model_type: caption_coco_opt2.7b 18 | use_grad_checkpoint: False 19 | 20 | datasets: 21 | coco_caption: # name of the dataset builder 22 | vis_processor: 23 | eval: 24 | name: "blip_image_eval" 25 | image_size: 364 26 | text_processor: 27 | eval: 28 | name: "blip_caption" 29 | # build_info: 30 | # images: 31 | # storage: '/export/share/datasets/vision/coco/images/' 32 | 33 | run: 34 | task: captioning 35 | # optimizer 36 | batch_size_train: 32 37 | batch_size_eval: 16 38 | num_workers: 4 39 | 40 | max_len: 30 41 | min_len: 8 42 | num_beams: 5 43 | 44 | seed: 42 45 | output_dir: "output/BLIP2/Caption_coco_opt2.7b" 46 | 47 | evaluate: True 48 | test_splits: ["test"] 49 | 50 | device: "cuda" 51 | world_size: 1 52 | dist_url: "env://" 53 | distributed: True 54 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/caption_coco_opt6.7b_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Bleu_1: 0.831 7 | # Bleu_2: 0.689 8 | # Bleu_3: 0.552 9 | # Bleu_4: 0.434 10 | # METEOR: 0.316 11 | # ROUGE_L: 0.618 12 | # CIDEr: 1.451 13 | # SPICE: 0.251 14 | 15 | model: 16 | arch: blip2_opt 17 | model_type: caption_coco_opt6.7b 18 | use_grad_checkpoint: False 19 | 20 | datasets: 21 | coco_caption: # name of the dataset builder 22 | vis_processor: 23 | eval: 24 | name: "blip_image_eval" 25 | image_size: 364 26 | text_processor: 27 | eval: 28 | name: "blip_caption" 29 | # build_info: 30 | # images: 31 | # storage: '/export/share/datasets/vision/coco/images/' 32 | 33 | run: 34 | task: captioning 35 | # optimizer 36 | batch_size_train: 32 37 | batch_size_eval: 16 38 | num_workers: 4 39 | 40 | max_len: 30 41 | min_len: 8 42 | num_beams: 5 43 | 44 | seed: 42 45 | output_dir: "output/BLIP2/Caption_coco_opt6.7b" 46 | 47 | evaluate: True 48 | test_splits: ["test"] 49 | 50 | device: "cuda" 51 | world_size: 1 52 | dist_url: "env://" 53 | distributed: True 54 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/gqa_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Overall Accuracy is: 43.98 7 | model: 8 | arch: blip2_t5 9 | model_type: pretrain_flant5xl 10 | use_grad_checkpoint: False 11 | 12 | datasets: 13 | gqa: # name of the dataset builder 14 | type: balanced_testdev 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 224 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | build_info: 23 | images: 24 | storage: "/export/share/datasets/vision/GQA/images/" 25 | 26 | run: 27 | task: gqa 28 | # optimization-specific 29 | batch_size_train: 16 30 | batch_size_eval: 64 31 | num_workers: 4 32 | 33 | # inference-specific 34 | max_len: 10 35 | min_len: 1 36 | num_beams: 5 37 | inference_method: "generate" 38 | prompt: "Question: {} Short answer:" 39 | 40 | seed: 42 41 | output_dir: "output/BLIP2/GQA" 42 | 43 | evaluate: True 44 | test_splits: ["val"] 45 | 46 | # distribution-specific 47 | device: "cuda" 48 | world_size: 1 49 | dist_url: "env://" 50 | distributed: True 51 | -------------------------------------------------------------------------------- /lavis/projects/albef/train/snli_ve_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | model_type: ve 9 | load_finetuned: False 10 | num_classes: 3 11 | 12 | datasets: 13 | snli_ve: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | eval: 18 | name: "blip_image_eval" 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | eval: 23 | name: "blip_caption" 24 | 25 | run: 26 | task: multimodal_classification 27 | # optimization-specific 28 | lr_sched: "linear_warmup_cosine_lr" 29 | init_lr: 2e-5 30 | min_lr: 0 31 | weight_decay: 0.05 32 | max_epoch: 10 33 | batch_size_train: 32 34 | batch_size_eval: 64 35 | num_workers: 4 36 | 37 | seed: 42 38 | output_dir: "output/ALBEF/SNLI_VE" 39 | 40 | amp: False 41 | resume_ckpt_path: null 42 | 43 | evaluate: False 44 | train_splits: ["train"] 45 | valid_splits: ["val"] 46 | test_splits: ["test"] 47 | 48 | # distribution-specific 49 | device: "cuda" 50 | world_size: 1 51 | dist_url: "env://" 52 | distributed: True 53 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_base32.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-32 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_large14.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_large14_336.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 336 53 | -------------------------------------------------------------------------------- /lavis/projects/blip/train/nlvr_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | 9 | model_type: nlvr 10 | load_finetuned: False 11 | 12 | datasets: 13 | nlvr: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | image_size: 384 18 | eval: 19 | name: "blip_image_eval" 20 | image_size: 384 21 | text_processor: 22 | train: 23 | name: "blip_caption" 24 | eval: 25 | name: "blip_caption" 26 | 27 | run: 28 | task: multimodal_classification 29 | 30 | lr_sched: "linear_warmup_cosine_lr" 31 | init_lr: 2.5e-5 32 | min_lr: 0 33 | weight_decay: 0.05 34 | max_epoch: 15 35 | 36 | batch_size_train: 16 37 | batch_size_eval: 64 38 | num_workers: 4 39 | 40 | seed: 42 41 | output_dir: "output/BLIP/NLVR" 42 | 43 | amp: False 44 | resume_ckpt_path: null 45 | 46 | evaluate: False 47 | train_splits: ["train"] 48 | valid_splits: ["val", "test"] 49 | test_splits: ["test"] 50 | 51 | # distribution-specific 52 | device: "cuda" 53 | world_size: 1 54 | dist_url: "env://" 55 | distributed: True 56 | -------------------------------------------------------------------------------- /lavis/models/pnp_vqa_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | 11 | def prepare_qa_input(sample, num_captions, num_captions_fid): 12 | sample_question_captions = [] 13 | 14 | for question, captions in zip(sample['text_input'], sample['captions']): 15 | assert isinstance(captions, list) 16 | question_captions = [] 17 | question_caption = '' 18 | for cap_id, cap_ in enumerate(captions[0:num_captions]): 19 | question_caption += (cap_.strip() + '. ') 20 | if (cap_id + 1) != num_captions and ((cap_id + 1) % num_captions_fid == 0): 21 | question_caption = question.lower().strip() + " \\n " + question_caption.lower().strip() 22 | question_captions.append(question_caption) 23 | question_caption = '' 24 | if (cap_id + 1) == num_captions: 25 | question_caption = question.lower().strip() + " \\n " + question_caption.lower().strip() 26 | question_captions.append(question_caption) 27 | sample_question_captions.append(question_captions) 28 | 29 | sample['question_captions'] = sample_question_captions 30 | -------------------------------------------------------------------------------- /lavis/configs/datasets/coco/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_caption: # name of the dataset builder 8 | dataset_card: dataset_card/coco_caption.md 9 | # data_dir: ${env.data_dir}/datasets 10 | data_type: images # [images|videos|features] 11 | 12 | build_info: 13 | # Be careful not to append minus sign (-) before split to avoid itemizing 14 | annotations: 15 | train: 16 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 17 | md5: aa31ac474cf6250ebb81d18348a07ed8 18 | storage: coco/annotations/coco_karpathy_train.json 19 | val: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 21 | md5: b273847456ef5580e33713b1f7de52a0 22 | storage: coco/annotations/coco_karpathy_val.json 23 | test: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 25 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 26 | storage: coco/annotations/coco_karpathy_test.json 27 | images: 28 | storage: coco/images/ 29 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msvd/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_qa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json 16 | storage: msvd/annotations/qa_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json 19 | storage: msvd/annotations/qa_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json 22 | storage: msvd/annotations/qa_test.json 23 | ans2label: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json 25 | storage: msvd/annotations/qa_ans2label.json 26 | videos: 27 | storage: msvd/videos 28 | 29 | instance_id_key: question_id 30 | -------------------------------------------------------------------------------- /lavis/projects/blip/train/caption_coco_large_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | 9 | model_type: large_coco 10 | load_finetuned: False 11 | 12 | datasets: 13 | coco_caption: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | eval: 18 | name: "blip_image_eval" 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | prompt: "a picture of " 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: captioning 28 | # optimizer 29 | lr_sched: "linear_warmup_cosine_lr" 30 | init_lr: 2e-6 31 | min_lr: 0 32 | weight_decay: 0.05 33 | max_epoch: 5 34 | batch_size_train: 16 35 | batch_size_eval: 64 36 | num_workers: 4 37 | 38 | max_len: 20 39 | min_len: 5 40 | num_beams: 3 41 | 42 | seed: 42 43 | output_dir: "output/BLIP/Caption_coco" 44 | 45 | amp: False 46 | resume_ckpt_path: null 47 | 48 | evaluate: False 49 | train_splits: ["train"] 50 | valid_splits: ["val"] 51 | test_splits: ["test"] 52 | 53 | device: "cuda" 54 | world_size: 1 55 | dist_url: "env://" 56 | distributed: True 57 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/okvqa_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Overall Accuracy is: 41.22 7 | 8 | model: 9 | arch: blip2_t5 10 | model_type: pretrain_flant5xl 11 | use_grad_checkpoint: False 12 | 13 | # for OKVQA evaluation 14 | apply_lemmatizer: True 15 | 16 | datasets: 17 | ok_vqa: # name of the dataset builder 18 | vis_processor: 19 | eval: 20 | name: "blip_image_eval" 21 | image_size: 224 22 | text_processor: 23 | eval: 24 | name: "blip_question" 25 | # build_info: 26 | # images: 27 | # storage: '/export/share/datasets/vision/coco/images/' 28 | 29 | run: 30 | task: vqa 31 | # optimization-specific 32 | batch_size_train: 16 33 | batch_size_eval: 64 34 | num_workers: 4 35 | 36 | # inference-specific 37 | max_len: 10 38 | min_len: 1 39 | num_beams: 5 40 | inference_method: "generate" 41 | prompt: "Question: {} Short answer:" 42 | 43 | seed: 42 44 | output_dir: "output/BLIP2/OKVQA" 45 | 46 | evaluate: True 47 | test_splits: ["test"] 48 | 49 | # distribution-specific 50 | device: "cuda" 51 | world_size: 1 52 | dist_url: "env://" 53 | distributed: True 54 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2 8 | model_type: coco 9 | use_grad_checkpoint: False 10 | 11 | datasets: 12 | coco_retrieval: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "blip_image_train" 16 | image_size: 364 17 | eval: 18 | name: "blip_image_eval" 19 | image_size: 364 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | # build_info: 26 | # images: 27 | # storage: '/export/share/datasets/vision/coco/images/' 28 | run: 29 | task: retrieval 30 | 31 | # dataloading 32 | num_workers: 4 33 | batch_size_train: 16 34 | batch_size_eval: 32 35 | 36 | train_splits: ["train"] 37 | valid_splits: ["val"] 38 | test_splits: ["test"] 39 | 40 | # distribution 41 | device: "cuda" 42 | world_size: 1 43 | dist_url: "env://" 44 | distributed: True 45 | use_dist_eval_sampler: False 46 | 47 | # model specific 48 | k_test: 128 49 | 50 | # misc 51 | seed: 42 52 | output_dir: "output/BLIP2/Retrieval_COCO" 53 | 54 | evaluate: True 55 | -------------------------------------------------------------------------------- /lavis/projects/blip/train/caption_coco_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | 9 | model_type: base_coco 10 | load_finetuned: False 11 | 12 | datasets: 13 | coco_caption: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | eval: 18 | name: "blip_image_eval" 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | prompt: "a picture of " 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | # task: retrieval 28 | task: captioning 29 | # optimizer 30 | lr_sched: "linear_warmup_cosine_lr" 31 | init_lr: 1e-5 32 | min_lr: 0 33 | weight_decay: 0.05 34 | max_epoch: 5 35 | batch_size_train: 32 36 | batch_size_eval: 64 37 | num_workers: 4 38 | 39 | max_len: 20 40 | min_len: 5 41 | num_beams: 3 42 | 43 | seed: 42 44 | output_dir: "output/BLIP/Caption_coco" 45 | 46 | amp: False 47 | resume_ckpt_path: null 48 | 49 | evaluate: False 50 | train_splits: ["train"] 51 | valid_splits: ["val"] 52 | test_splits: ["test"] 53 | 54 | device: "cuda" 55 | world_size: 1 56 | dist_url: "env://" 57 | distributed: True 58 | -------------------------------------------------------------------------------- /lavis/datasets/download_scripts/DownloadConceptualCaptions/LICENSE: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, Steven Hoi. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | MIT License 6 | 7 | Copyright (c) 2019 Igor Brigadir 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to deal 11 | in the Software without restriction, including without limitation the rights 12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included in all 17 | copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from setuptools import setup, find_namespace_packages 9 | import platform 10 | 11 | DEPENDENCY_LINKS = [] 12 | if platform.system() == "Windows": 13 | DEPENDENCY_LINKS.append("https://download.pytorch.org/whl/torch_stable.html") 14 | 15 | 16 | def fetch_requirements(filename): 17 | with open(filename) as f: 18 | return [ln.strip() for ln in f.read().split("\n")] 19 | 20 | 21 | setup( 22 | name="salesforce-lavis", 23 | version="1.0.0.dev1", 24 | author="Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, Steven C.H. Hoi", 25 | description="LAVIS - A One-stop Library for Language-Vision Intelligence", 26 | long_description=open("README.md", "r", encoding="utf-8").read(), 27 | long_description_content_type="text/markdown", 28 | keywords="Vision-Language, Multimodal, Image Captioning, Generative AI, Deep Learning, Library, PyTorch", 29 | license="3-Clause BSD", 30 | packages=find_namespace_packages(include="lavis.*"), 31 | install_requires=fetch_requirements("requirements.txt"), 32 | python_requires=">=3.7.0", 33 | include_package_data=True, 34 | dependency_links=DEPENDENCY_LINKS, 35 | zip_safe=False, 36 | ) 37 | -------------------------------------------------------------------------------- /lavis/datasets/download_scripts/DownloadConceptualCaptions/README.md: -------------------------------------------------------------------------------- 1 | <!-- 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | --> 7 | 8 | # Download Conceptual Captions Data 9 | 10 | Place the data from https://ai.google.com/research/ConceptualCaptions/download in this folder. 11 | 12 | `Train_GCC-training.tsv / cc3m.tsv`: Training Split (3,318,333 examples) 13 | 14 | Then run `download_data_cc3m.py` or `download_data_cc12m.py` (see the example below). 15 | 16 | Images will be stored in the default LAVIS cache folders. You can stop and resume the download; the settings for splitting downloads into chunks / threads are not optimal, but they maxed out my connection, so I kept them as is.
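For instance, a CC3M download run might look like the following sketch. It assumes the `Train_GCC-training.tsv` / `cc3m.tsv` file is already placed in this folder and that the scripts are run without extra command-line arguments; check the scripts themselves for the exact behavior.

```bash
# Sketch of a typical run, assuming the TSV files from the Conceptual Captions
# download page already sit in this folder. Images are written to the default
# LAVIS cache folders, as described above.
cd lavis/datasets/download_scripts/DownloadConceptualCaptions
python download_data_cc3m.py

# For the 12M variant instead:
# python download_data_cc12m.py
```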
17 | 18 | Note: A previous version of this script used a different file naming scheme; this has changed, so if you resume a previously started download you will get duplicates. 19 | 20 | Some downloads will fail and return web pages instead of images; these will need to be cleaned up later. See `downloaded_validation_report.tsv` after the download finishes for HTTP errors. Around 8% of the images are gone, based on validation set results. Setting the user agent might fix some of these errors as well; it is unclear whether any requests are rejected by sites on that basis. 21 | 22 | It should take about a day or two to download the training data; keep an eye on disk space. 23 | -------------------------------------------------------------------------------- /lavis/projects/blip/coco_cap_ft_iter.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: large 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | train: 14 | name: "blip_image_train" 15 | eval: 16 | name: "blip_image_eval" 17 | text_processor: 18 | train: 19 | name: "blip_caption" 20 | prompt: "a picture of " 21 | eval: 22 | name: "blip_caption" 23 | 24 | run: 25 | runner: runner_iter 26 | 27 | max_iters: 2e4 28 | iters_per_inner_epoch: 2e3 29 | 30 | # task: retrieval 31 | task: captioning 32 | # optimizer 33 | lr_sched: "linear_warmup_cosine_lr" 34 | init_lr: 2e-6 35 | min_lr: 0 36 | weight_decay: 0.05 37 | batch_size_train: 16 38 | batch_size_eval: 64 39 | num_workers: 4 40 | 41 | max_len: 20 42 | min_len: 5 43 | num_beams: 3 44 | 45 | seed: 42 46 | output_dir: "output/BLIP/Caption_coco" 47 | 48 | amp: False 49 | resume_ckpt_path: null 50 | 51 | evaluate: False 52 | train_splits: ["train"] 53 | valid_splits: ["val", "test"] 54 | 55 | device: "cuda" 56 | world_size: 1 57 | dist_url: "env://" 58 | distributed: True 59 | -------------------------------------------------------------------------------- /lavis/projects/albef/train/nlvr_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | model_type: nlvr 9 | load_finetuned: False 10 | 11 | datasets: 12 | nlvr: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "blip_image_train" 16 | image_size: 384 17 | eval: 18 | name: "blip_image_eval" 19 | image_size: 384 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: multimodal_classification 28 | # optimization-specific 29 | lr_sched: "linear_warmup_cosine_lr" 30 | init_lr: 2e-5 31 | min_lr: 1e-6 32 | weight_decay: 0.02 33 | warmup_lr: 1e-5 34 | warmup_steps: 650 35 | max_epoch: 10 36 | batch_size_train: 16 37 | batch_size_eval: 64 38 | num_workers: 4 39 | 40 | seed: 42 41 | output_dir: "output/ALBEF/NLVR" 42 | 43 | amp: False 44 | resume_ckpt_path: null 45 | 46 | evaluate: False 47 | train_splits: ["train"] 48 | valid_splits: ["val", "test"] 49 | test_splits: ["test"] 50 | 51 | # distribution-specific 52 | device: "cuda" 53 | world_size: 1 54 | dist_url: "env://" 55 | distributed: True 56 | -------------------------------------------------------------------------------- /lavis/configs/datasets/coco/eval_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: 16 | # TODO make this order insensitive 17 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json 18 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json 19 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json 20 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json 21 | storage: 22 | - coco/annotations/vqa_val_eval.json 23 | - coco/annotations/answer_list.json 24 | - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json 25 | - coco/annotations/v2_mscoco_val2014_annotations.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /docs/tutorial.evaluation.rst: -------------------------------------------------------------------------------- 1 | Evaluating Pre-trained Models on Task Datasets 2 | ############################################### 3 | LAVIS provides pre-trained and finetuned models for off-the-shelf evaluation on task datasets. 4 | Let's now see an example of evaluating the BLIP model on the captioning task, using the MSCOCO dataset. 5 | 6 | .. _prep coco: 7 | 8 | Preparing Datasets 9 | ****************** 10 | First, let's download the dataset. LAVIS provides `automatic downloading scripts` to help prepare 11 | most of the public datasets. To download the MSCOCO dataset, simply run 12 | 13 | .. 
code-block:: bash 14 | 15 | cd lavis/datasets/download_scripts && python download_coco.py 16 | 17 | This will put the downloaded dataset at the default cache location ``cache`` used by LAVIS. 18 | 19 | If you want to use a different cache location, you can specify it by updating ``cache_root`` in ``lavis/configs/default.yaml``. 20 | 21 | If you have a local copy of the dataset, it is recommended to create a symlink from the cache location to the local copy, e.g. 22 | 23 | .. code-block:: bash 24 | 25 | ln -s /path/to/local/coco cache/coco 26 | 27 | Evaluating pre-trained models 28 | ****************************** 29 | 30 | To evaluate a pre-trained model, simply run 31 | 32 | .. code-block:: bash 33 | 34 | bash run_scripts/lavis/blip/eval/eval_coco_cap.sh 35 | 36 | Or to evaluate a large model: 37 | 38 | .. code-block:: bash 39 | 40 | bash run_scripts/lavis/blip/eval/eval_coco_cap_large.sh -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/okvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: base 9 | 10 | datasets: 11 | ok_vqa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_question" 19 | 20 | run: 21 | task: vqa_reading_comprehension 22 | 23 | # optimization-specific 24 | batch_size_train: 16 25 | batch_size_eval: 16 26 | num_workers: 4 27 | 28 | # image question matching specific 29 | block_num: 7 30 | 31 | # image captioning specific 32 | top_k: 50 33 | top_p: 1 34 | cap_min_length: 10 35 | cap_max_length: 20 36 | repetition_penalty: 1 37 | num_patches: 20 38 | num_captions: 100 39 | prompt: 'a picture of ' 40 | 41 | # question answering specific 42 | internal_bsz_fid: 1 43 | num_captions_fid: 1 44 | min_len: 0 45 | max_len: 20 46 | num_beams: 1 47 | inference_method: "generate" 48 | 49 | seed: 42 50 | output_dir: "output/PNP-VQA/OKVQA" 51 | 52 | evaluate: True 53 | test_splits: ["test"] 54 | 55 | # distribution-specific 56 | device: "cuda" 57 | world_size: 1 58 | dist_url: "env://" 59 | distributed: True 60 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/okvqa_eval_3b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: 3b 9 | 10 | datasets: 11 | ok_vqa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_question" 19 | 20 | run: 21 | task: vqa_reading_comprehension 22 | 23 | # optimization-specific 24 | batch_size_train: 4 25 | batch_size_eval: 4 26 | num_workers: 4 27 | 28 | # image question matching specific 29 | block_num: 7 30 | 31 | # image captioning specific 32 | top_k: 50 33 | top_p: 1 34 | cap_min_length: 10 35 | cap_max_length: 20 36 | repetition_penalty: 1 37 | num_patches: 20 38 | num_captions: 100 39 | prompt: 'a picture of ' 40 | 41 | # question answering specific 42 | internal_bsz_fid: 1 43 | num_captions_fid: 1 44 | min_len: 0 45 | max_len: 20 46 | num_beams: 1 47 | inference_method: "generate" 48 | 49 | seed: 42 50 | output_dir: "output/PNP-VQA-3b/OKVQA" 51 | 52 | evaluate: True 53 | test_splits: ["test"] 54 | 55 | # distribution-specific 56 | device: "cuda" 57 | world_size: 1 58 | dist_url: "env://" 59 | distributed: True 60 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/vqav2_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Overall Accuracy is: 63.13 7 | # Per Answer Type Accuracy is the following: 8 | # other : 52.90 9 | # yes/no : 84.28 10 | # number : 41.01 11 | 12 | model: 13 | arch: blip2_t5 14 | model_type: pretrain_flant5xl 15 | use_grad_checkpoint: False 16 | 17 | datasets: 18 | coco_vqa: # name of the dataset builder 19 | type: eval 20 | vis_processor: 21 | eval: 22 | name: "blip_image_eval" 23 | image_size: 224 24 | text_processor: 25 | eval: 26 | name: "blip_question" 27 | # build_info: 28 | # images: 29 | # storage: '/export/share/datasets/vision/coco/images/' 30 | 31 | run: 32 | task: vqa 33 | # optimization-specific 34 | batch_size_train: 16 35 | batch_size_eval: 64 36 | num_workers: 4 37 | 38 | # inference-specific 39 | max_len: 10 40 | min_len: 1 41 | num_beams: 5 42 | inference_method: "generate" 43 | prompt: "Question: {} Short answer:" 44 | 45 | seed: 42 46 | output_dir: "output/BLIP2/VQA" 47 | 48 | evaluate: True 49 | test_splits: ["val"] 50 | 51 | # distribution-specific 52 | device: "cuda" 53 | world_size: 1 54 | dist_url: "env://" 55 | distributed: True 56 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/okvqa_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: large 9 | 10 | datasets: 11 | ok_vqa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_question" 19 | 20 | run: 21 | task: vqa_reading_comprehension 22 | 23 | # optimization-specific 24 | batch_size_train: 12 25 | batch_size_eval: 12 26 | num_workers: 4 27 | 28 | # image question matching specific 29 | block_num: 7 30 | 31 | # image captioning specific 32 | top_k: 50 33 | top_p: 1 34 | cap_min_length: 10 35 | cap_max_length: 20 36 | repetition_penalty: 1 37 | num_patches: 20 38 | num_captions: 100 39 | prompt: 'a picture of ' 40 | 41 | # question answering specific 42 | internal_bsz_fid: 1 43 | num_captions_fid: 1 44 | min_len: 0 45 | max_len: 20 46 | num_beams: 1 47 | inference_method: "generate" 48 | 49 | seed: 42 50 | output_dir: "output/PNP-VQA-large/OKVQA" 51 | 52 | evaluate: True 53 | test_splits: ["test"] 54 | 55 | # distribution-specific 56 | device: "cuda" 57 | world_size: 1 58 | dist_url: "env://" 59 | distributed: True 60 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/gqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: base 9 | 10 | datasets: 11 | gqa: # name of the dataset builder 12 | type: balanced_testdev 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: gqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 16 26 | batch_size_eval: 16 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 5 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA/GQA" 52 | 53 | evaluate: True 54 | test_splits: ["val"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/gqa_eval_3b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: 3b 9 | 10 | datasets: 11 | gqa: # name of the dataset builder 12 | type: balanced_testdev 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: gqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 4 26 | batch_size_eval: 4 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 5 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA-3b/GQA" 52 | 53 | evaluate: True 54 | test_splits: ["val"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/vqav2_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: base 9 | 10 | datasets: 11 | coco_vqa: # name of the dataset builder 12 | type: eval 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 16 26 | batch_size_eval: 16 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 1 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA/VQAv2_val" 52 | 53 | evaluate: True 54 | test_splits: ["val"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/vqav2_eval_3b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: 3b 9 | 10 | datasets: 11 | coco_vqa: # name of the dataset builder 12 | type: eval 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 4 26 | batch_size_eval: 4 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 1 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA-3b/VQAv2_val" 52 | 53 | evaluate: True 54 | test_splits: ["val"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/gqa_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: large 9 | 10 | datasets: 11 | gqa: # name of the dataset builder 12 | type: balanced_testdev 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: gqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 12 26 | batch_size_eval: 12 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 5 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA-large/GQA" 52 | 53 | evaluate: True 54 | test_splits: ["val"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/vqav2_test_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: base 9 | 10 | datasets: 11 | coco_vqa: # name of the dataset builder 12 | type: default 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 16 26 | batch_size_eval: 16 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 1 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA/VQAv2_test" 52 | 53 | evaluate: True 54 | test_splits: ["test"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/vqav2_test_eval_3b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: 3b 9 | 10 | datasets: 11 | coco_vqa: # name of the dataset builder 12 | type: default 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 4 26 | batch_size_eval: 4 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 1 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA-3b/VQAv2_test" 52 | 53 | evaluate: True 54 | test_splits: ["test"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/vqav2_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: large 9 | 10 | datasets: 11 | coco_vqa: # name of the dataset builder 12 | type: eval 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 12 26 | batch_size_eval: 12 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 1 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA-large/VQAv2_val" 52 | 53 | evaluate: True 54 | test_splits: ["val"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/vqav2_test_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: large 9 | 10 | datasets: 11 | coco_vqa: # name of the dataset builder 12 | type: default 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 12 26 | batch_size_eval: 12 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 1 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA-large/VQAv2_test" 52 | 53 | evaluate: True 54 | test_splits: ["test"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/albef/train/okvqa_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | datasets: 13 | ok_vqa: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | image_size: 384 18 | eval: 19 | name: "blip_image_eval" 20 | image_size: 384 21 | text_processor: 22 | train: 23 | name: "blip_question" 24 | eval: 25 | name: "blip_question" 26 | 27 | run: 28 | task: vqa 29 | # optimization-specific 30 | lr_sched: "linear_warmup_cosine_lr" 31 | init_lr: 2e-5 32 | min_lr: 1e-6 33 | weight_decay: 0.02 34 | max_epoch: 6 35 | batch_size_train: 16 36 | batch_size_eval: 16 37 | num_workers: 4 38 | 39 | # inference-specific 40 | max_len: 10 41 | min_len: 1 42 | num_beams: 256 43 | num_ans_candidates: 128 44 | inference_method: "rank" 45 | 46 | seed: 42 47 | output_dir: "output/BLIP/OKVQA" 48 | 49 | amp: False 50 | resume_ckpt_path: null 51 | 52 | evaluate: False 53 | train_splits: ["train"] 54 | valid_splits: ["val"] 55 | test_splits: ["test"] 56 | 57 | # distribution-specific 58 | device: "cuda" 59 | world_size: 1 60 | dist_url: "env://" 61 | distributed: True 62 | -------------------------------------------------------------------------------- /lavis/projects/alpro/train/msrvtt_retrieval_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | model_type: msrvtt 9 | load_finetuned: False 10 | 11 | datasets: 12 | msrvtt_retrieval: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "alpro_video_train" 16 | n_frms: 8 17 | image_size: 224 18 | eval: 19 | name: "alpro_video_eval" 20 | n_frms: 8 21 | image_size: 224 22 | text_processor: 23 | train: 24 | name: "blip_caption" 25 | eval: 26 | name: "blip_caption" 27 | 28 | run: 29 | task: retrieval 30 | # optimization-specific 31 | lr_sched: "linear_warmup_cosine_lr" 32 | init_lr: 3e-5 33 | min_lr: 1e-6 34 | weight_decay: 1e-4 35 | max_epoch: 5 36 | batch_size_train: 8 37 | batch_size_eval: 8 38 | num_workers: 4 39 | 40 | k_test: 1000 41 | 42 | seed: 42 43 | output_dir: "output/ALPRO/msrvtt_retrieval" 44 | 45 | amp: False 46 | resume_ckpt_path: null 47 | 48 | evaluate: False 49 | train_splits: ["train"] 50 | valid_splits: ["val"] 51 | test_splits: ["test"] 52 | 53 | # distribution-specific 54 | device: "cuda" 55 | world_size: 1 56 | dist_url: "env://" 57 | distributed: True 58 | use_dist_eval_sampler: False 59 | -------------------------------------------------------------------------------- /lavis/projects/blip/train/okvqa_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | 9 | model_type: okvqa 10 | load_finetuned: False 11 | 12 | image_size: 480 13 | 14 | datasets: 15 | ok_vqa: # name of the dataset builder 16 | vis_processor: 17 | train: 18 | name: "blip_image_train" 19 | image_size: 480 20 | eval: 21 | name: "blip_image_eval" 22 | image_size: 480 23 | text_processor: 24 | train: 25 | name: "blip_question" 26 | eval: 27 | name: "blip_question" 28 | 29 | run: 30 | task: vqa 31 | # optimization-specific 32 | lr_sched: "linear_warmup_cosine_lr" 33 | init_lr: 3e-5 34 | min_lr: 1e-5 35 | weight_decay: 0.02 36 | max_epoch: 7 37 | batch_size_train: 16 38 | batch_size_eval: 16 39 | num_workers: 4 40 | 41 | # inference-specific 42 | max_len: 10 43 | min_len: 1 44 | num_beams: 256 45 | num_ans_candidates: 128 46 | inference_method: "rank" 47 | 48 | seed: 42 49 | output_dir: "output/BLIP/OKVQA" 50 | 51 | amp: False 52 | resume_ckpt_path: null 53 | 54 | evaluate: False 55 | train_splits: ["train"] 56 | test_splits: ["test"] 57 | 58 | # distribution-specific 59 | device: "cuda" 60 | world_size: 1 61 | dist_url: "env://" 62 | distributed: True 63 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022 Salesforce, Inc. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 11 | 12 | 3. Neither the name of Salesforce.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | -------------------------------------------------------------------------------- /lavis/projects/albef/train/aokvqa_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | datasets: 13 | aok_vqa: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | image_size: 384 18 | eval: 19 | name: "blip_image_eval" 20 | image_size: 384 21 | text_processor: 22 | train: 23 | name: "blip_question" 24 | eval: 25 | name: "blip_question" 26 | 27 | run: 28 | task: aok_vqa 29 | # optimization-specific 30 | lr_sched: "linear_warmup_cosine_lr" 31 | init_lr: 2e-5 32 | min_lr: 1e-6 33 | weight_decay: 0.02 34 | max_epoch: 6 35 | batch_size_train: 16 36 | batch_size_eval: 16 37 | num_workers: 4 38 | 39 | # inference-specific 40 | max_len: 10 41 | min_len: 1 42 | num_beams: 256 43 | num_ans_candidates: 128 44 | inference_method: "rank" 45 | 46 | seed: 42 47 | output_dir: "output/BLIP/AOKVQA" 48 | 49 | amp: False 50 | resume_ckpt_path: null 51 | 52 | evaluate: False 53 | train_splits: ["train"] 54 | valid_splits: ["val"] 55 | test_splits: ["test"] 56 | 57 | # distribution-specific 58 | device: "cuda" 59 | world_size: 1 60 | dist_url: "env://" 61 | distributed: True 62 | --------------------------------------------------------------------------------