├── lavis ├── models │ ├── blip2_models │ │ └── __init__.py │ ├── .DS_Store │ ├── blip_models │ │ └── .DS_Store │ ├── clip_models │ │ ├── .DS_Store │ │ ├── pics │ │ │ └── CLIP.png │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ └── __init__.py │ ├── gpt_models │ │ └── .DS_Store │ ├── timesformer │ │ ├── .DS_Store │ │ ├── __init__.py │ │ └── linear.py │ ├── albef_models │ │ └── .DS_Store │ ├── alpro_models │ │ └── .DS_Store │ ├── pnp_vqa_models │ │ ├── .DS_Store │ │ └── __init__.py │ └── img2prompt_models │ │ ├── .DS_Store │ │ └── __init__.py ├── projects │ ├── blip2 │ │ └── eval │ │ │ ├── caption_coco_flant5xl_eval.yaml │ │ │ ├── ret_flickr_eval.yaml │ │ │ ├── caption_coco_opt2.7b_eval.yaml │ │ │ ├── caption_coco_opt6.7b_eval.yaml │ │ │ ├── gqa_zeroshot_flant5xl_eval.yaml │ │ │ ├── okvqa_zeroshot_flant5xl_eval.yaml │ │ │ ├── ret_coco_eval.yaml │ │ │ └── vqav2_zeroshot_flant5xl_eval.yaml │ ├── .DS_Store │ ├── alpro │ │ ├── .DS_Store │ │ ├── eval │ │ │ ├── msrvtt_qa_eval.yaml │ │ │ ├── msvd_qa_eval.yaml │ │ │ ├── msrvtt_ret_eval.yaml │ │ │ └── didemo_ret_eval.yaml │ │ └── train │ │ │ └── msrvtt_retrieval_ft.yaml │ ├── blip │ │ ├── eval │ │ │ ├── nlvr_eval.yaml │ │ │ ├── caption_coco_eval.yaml │ │ │ ├── caption_coco_eval_large.yaml │ │ │ ├── ret_flickr_eval.yaml │ │ │ ├── okvqa_eval.yaml │ │ │ ├── aokvqa_eval.yaml │ │ │ ├── vqav2_eval.yaml │ │ │ ├── nocaps_eval.yaml │ │ │ └── ret_coco_eval.yaml │ │ ├── train │ │ │ ├── nlvr_ft.yaml │ │ │ ├── caption_coco_large_ft.yaml │ │ │ ├── caption_coco_ft.yaml │ │ │ └── okvqa_ft.yaml │ │ └── coco_cap_ft_iter.yaml │ ├── clip │ │ ├── exp_imnet_zs_eval.yaml │ │ ├── exp_coco_ret_eval.yaml │ │ └── exp_flickr_ret_eval.yaml │ ├── albef │ │ ├── eval │ │ │ ├── nlvr_eval.yaml │ │ │ ├── snli_ve_eval.yaml │ │ │ ├── ret_coco_eval.yaml │ │ │ ├── ret_flickr30k_eval.yaml │ │ │ ├── vqa_val.yaml │ │ │ └── vqa_test.yaml │ │ └── train │ │ │ ├── snli_ve_ft.yaml │ │ │ ├── nlvr_ft.yaml │ │ │ ├── okvqa_ft.yaml │ │ │ └── aokvqa_ft.yaml │ ├── gpt │ │ └── eval │ │ │ └── dialogue_avsd_eval.yaml │ └── pnp-vqa │ │ └── eval │ │ ├── okvqa_eval.yaml │ │ ├── okvqa_eval_3b.yaml │ │ ├── okvqa_eval_large.yaml │ │ ├── gqa_eval.yaml │ │ ├── gqa_eval_3b.yaml │ │ ├── vqav2_eval.yaml │ │ ├── vqav2_eval_3b.yaml │ │ ├── gqa_eval_large.yaml │ │ ├── vqav2_test_eval.yaml │ │ ├── vqav2_test_eval_3b.yaml │ │ ├── vqav2_eval_large.yaml │ │ └── vqav2_test_eval_large.yaml ├── configs │ ├── .DS_Store │ ├── datasets │ │ ├── .DS_Store │ │ ├── laion │ │ │ └── defaults_2B_multi.yaml │ │ ├── imagenet │ │ │ └── defaults.yaml │ │ ├── vg │ │ │ ├── defaults_vqa.yaml │ │ │ └── defaults_caption.yaml │ │ ├── conceptual_caption │ │ │ ├── defaults_3m.yaml │ │ │ └── defaults_12m.yaml │ │ ├── how2qa │ │ │ └── defaults_qa.yaml │ │ ├── star │ │ │ └── defaults_qa.yaml │ │ ├── vlep │ │ │ └── defaults_qa.yaml │ │ ├── tvqa │ │ │ └── defaults_qa.yaml │ │ ├── msrvttmc │ │ │ └── defaults_qa.yaml │ │ ├── nextqa │ │ │ ├── defaults_qa.yaml │ │ │ └── defaults_qa_old.yaml │ │ ├── mixed │ │ │ └── defaults.yaml │ │ ├── tacos │ │ │ ├── defaults.yaml │ │ │ └── relative_integer.yaml │ │ ├── nextgqa │ │ │ └── defaults_qa.yaml │ │ ├── anet │ │ │ └── defaults.yaml │ │ ├── sbu_caption │ │ │ └── defaults.yaml │ │ ├── qvh │ │ │ └── defaults.yaml │ │ ├── charades_sta │ │ │ ├── seconds_decimal.yaml │ │ │ ├── relative_integer.yaml │ │ │ ├── relative_decimal.yaml │ │ │ └── defaults.yaml │ │ ├── qvhQ │ │ │ └── defaults.yaml │ │ ├── nocaps │ │ │ └── defaults.yaml │ │ ├── flickr30k │ │ │ └── defaults.yaml │ │ ├── nlvr │ │ │ └── defaults.yaml │ │ ├── snli_ve │ │ 
│ └── defaults.yaml │ │ ├── msvd │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_qa.yaml │ │ ├── msrvtt │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_ret.yaml │ │ ├── vatex │ │ │ └── defaults_cap.yaml │ │ ├── avsd │ │ │ └── defaults_dial.yaml │ │ ├── didemo │ │ │ └── defaults_ret.yaml │ │ ├── coco │ │ │ ├── defaults_ret.yaml │ │ │ ├── defaults_cap.yaml │ │ │ └── eval_vqa.yaml │ │ └── gqa │ │ │ ├── balanced_val.yaml │ │ │ └── balanced_testdev.yaml │ ├── models │ │ ├── .DS_Store │ │ ├── clip_resnet50.yaml │ │ ├── clip │ │ │ ├── ViT-B-16.json │ │ │ ├── ViT-B-32.json │ │ │ ├── ViT-L-14.json │ │ │ ├── ViT-L-16.json │ │ │ ├── ViT-B-16-plus.json │ │ │ ├── ViT-L-14-280.json │ │ │ ├── ViT-L-14-336.json │ │ │ ├── ViT-L-16-320.json │ │ │ ├── ViT-B-16-plus-240.json │ │ │ ├── ViT-B-32-plus-256.json │ │ │ ├── ViT-H-14.json │ │ │ ├── ViT-H-16.json │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ ├── ViT-g-14.json │ │ │ ├── timm-resnet50d.json │ │ │ ├── timm-resnetaa50d.json │ │ │ ├── timm-resnetblur50.json │ │ │ ├── RN101.json │ │ │ ├── RN50.json │ │ │ ├── RN50x16.json │ │ │ ├── RN50x4.json │ │ │ ├── timm-efficientnetv2_rw_s.json │ │ │ ├── timm-vit_base_patch16_224.json │ │ │ ├── timm-vit_base_patch32_224.json │ │ │ ├── timm-vit_small_patch16_224.json │ │ │ ├── timm-swin_base_patch4_window7_224.json │ │ │ ├── RN101-quickgelu.json │ │ │ └── RN50-quickgelu.json │ │ ├── clip_vit_base16.yaml │ │ ├── bert_config.json │ │ ├── med_config.json │ │ ├── med_large_config.json │ │ ├── blip_pretrain_large.yaml │ │ ├── med_config_albef.json │ │ ├── bert_config_alpro.json │ │ ├── blip_classification_base.yaml │ │ ├── blip_feature_extractor_base.yaml │ │ ├── albef_feature_extractor.yaml │ │ ├── blip_itm_base.yaml │ │ ├── blip_itm_large.yaml │ │ ├── blip_pretrain_base.yaml │ │ ├── gpt_dialogue_base.yaml │ │ ├── blip2 │ │ │ ├── blip2_pretrain.yaml │ │ │ ├── blip2_coco.yaml │ │ │ ├── blip2_pretrain_flant5xl.yaml │ │ │ ├── blip2_pretrain_opt2.7b.yaml │ │ │ ├── blip2_pretrain_opt6.7b.yaml │ │ │ ├── blip2_pretrain_flant5xxl.yaml │ │ │ ├── blip2_caption_opt2.7b.yaml │ │ │ ├── blip2_caption_opt6.7b.yaml │ │ │ └── blip2_caption_flant5xl.yaml │ │ ├── albef_pretrain_base.yaml │ │ ├── alpro_retrieval_didemo.yaml │ │ ├── blip_caption_large_coco.yaml │ │ ├── blip_vqa_okvqa.yaml │ │ ├── blip_vqa_aokvqa.yaml │ │ ├── blip_vqav2.yaml │ │ ├── blip_retrieval_coco.yaml │ │ ├── albef_classification_ve.yaml │ │ ├── blip_nlvr.yaml │ │ ├── albef_vqav2.yaml │ │ ├── blip_caption_base_coco.yaml │ │ ├── albef_nlvr.yaml │ │ ├── blip_retrieval_flickr.yaml │ │ ├── alpro_retrieval_msrvtt.yaml │ │ ├── alpro_qa_msvd.yaml │ │ ├── alpro_qa_msrvtt.yaml │ │ ├── albef_retrieval_coco.yaml │ │ ├── albef_retrieval_flickr.yaml │ │ ├── clip_vit_base32.yaml │ │ ├── clip_vit_large14.yaml │ │ └── clip_vit_large14_336.yaml │ └── default.yaml ├── common │ ├── vqa_tools │ │ └── __init__.py │ └── gradcam.py ├── runners │ └── __init__.py ├── tasks │ └── image_text_pretrain.py ├── processors │ └── base_processor.py ├── datasets │ ├── builders │ │ ├── dialogue_builder.py │ │ ├── temporal_action_localization_builder.py │ │ └── classification_builder.py │ ├── datasets │ │ ├── multimodal_classification_datasets.py │ │ └── vg_vqa_datasets.py │ └── download_scripts │ │ └── DownloadConceptualCaptions │ │ ├── LICENSE │ │ └── README.md └── __init__.py ├── assets ├── .DS_Store ├── model.png └── teaser.png ├── docs ├── _static │ ├── merlion.png │ ├── logo_final.png │ └── architecture.png ├── requirements.txt ├── tutorial.rst ├── Makefile ├── index.rst ├── make.bat └── tutorial.evaluation.rst 
├── pyproject.toml ├── run_scripts └── mr_BLIP │ ├── eval │ ├── nextGQA.sh │ ├── nextQA.sh │ ├── anet.sh │ ├── charades.sh │ └── qvh.sh │ └── train │ ├── nextGQA.sh │ ├── anet.sh │ ├── charades.sh │ ├── qvh.sh │ ├── qvhQ.sh │ └── nextQA.sh ├── MANIFEST.in ├── standalone_eval └── eval_sample.sh ├── requirements.txt ├── setup.py └── LICENSE.txt /lavis/models/blip2_models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /assets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/assets/.DS_Store -------------------------------------------------------------------------------- /assets/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/assets/model.png -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/assets/teaser.png -------------------------------------------------------------------------------- /lavis/models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/.DS_Store -------------------------------------------------------------------------------- /docs/_static/merlion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/docs/_static/merlion.png -------------------------------------------------------------------------------- /lavis/configs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/configs/.DS_Store -------------------------------------------------------------------------------- /lavis/projects/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/projects/.DS_Store -------------------------------------------------------------------------------- /docs/_static/logo_final.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/docs/_static/logo_final.png -------------------------------------------------------------------------------- /docs/_static/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/docs/_static/architecture.png -------------------------------------------------------------------------------- /lavis/configs/datasets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/configs/datasets/.DS_Store -------------------------------------------------------------------------------- /lavis/configs/models/.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/configs/models/.DS_Store -------------------------------------------------------------------------------- /lavis/projects/alpro/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/projects/alpro/.DS_Store -------------------------------------------------------------------------------- /lavis/models/blip_models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/blip_models/.DS_Store -------------------------------------------------------------------------------- /lavis/models/clip_models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/clip_models/.DS_Store -------------------------------------------------------------------------------- /lavis/models/gpt_models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/gpt_models/.DS_Store -------------------------------------------------------------------------------- /lavis/models/timesformer/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/timesformer/.DS_Store -------------------------------------------------------------------------------- /lavis/models/albef_models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/albef_models/.DS_Store -------------------------------------------------------------------------------- /lavis/models/alpro_models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/alpro_models/.DS_Store -------------------------------------------------------------------------------- /lavis/models/pnp_vqa_models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/pnp_vqa_models/.DS_Store -------------------------------------------------------------------------------- /lavis/models/clip_models/pics/CLIP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/clip_models/pics/CLIP.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | GitPython 2 | ipykernel 3 | nbsphinx==0.8.7 4 | pandoc 5 | sphinx 6 | sphinx_autodoc_typehints 7 | sphinx_rtd_theme -------------------------------------------------------------------------------- /lavis/models/img2prompt_models/.DS_Store: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/img2prompt_models/.DS_Store
--------------------------------------------------------------------------------
/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sudo-Boris/mr-Blip/HEAD/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/run_scripts/mr_BLIP/eval/nextGQA.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run --nproc_per_node=4 evaluate.py --cfg-path lavis/projects/mr_BLIP/eval/nextGQA.yaml
--------------------------------------------------------------------------------
/run_scripts/mr_BLIP/train/nextGQA.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run --nproc_per_node=4 train.py --cfg-path lavis/projects/mr_BLIP/train/nextGQA.yaml
--------------------------------------------------------------------------------
/run_scripts/mr_BLIP/train/anet.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/mr_BLIP/train/anet.yaml
--------------------------------------------------------------------------------
/run_scripts/mr_BLIP/train/charades.sh:
--------------------------------------------------------------------------------
 1 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/mr_BLIP/train/charades.yaml
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | recursive-include lavis/configs *.yaml *.json
 2 | recursive-include lavis/projects *.yaml *.json
 3 | 
 4 | recursive-exclude lavis/datasets/download_scripts *
 5 | recursive-exclude lavis/output *
 6 | 
 7 | include requirements.txt
 8 | 
--------------------------------------------------------------------------------
/docs/tutorial.rst:
--------------------------------------------------------------------------------
 1 | Tutorials
 2 | ==============================
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 1
 6 | 
 7 |    tutorial.evaluation
 8 |    tutorial.training-example
 9 |    tutorial.configs
10 |    tutorial.datasets
11 |    tutorial.processors
12 |    tutorial.models
13 |    tutorial.tasks
14 | 
--------------------------------------------------------------------------------
/lavis/common/vqa_tools/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) 2022, salesforce.com, inc.
 3 | All rights reserved.
 4 | SPDX-License-Identifier: BSD-3-Clause
 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
 6 | """
 7 | 
 8 | __author__ = "aagrawal"
 9 | 
--------------------------------------------------------------------------------
/run_scripts/mr_BLIP/train/qvh.sh:
--------------------------------------------------------------------------------
 1 | # CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/mr_BLIP/train/qvh.yaml
 2 | CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run --nproc_per_node=4 train.py --cfg-path lavis/projects/mr_BLIP/train/qvh.yaml
--------------------------------------------------------------------------------
/run_scripts/mr_BLIP/train/qvhQ.sh:
--------------------------------------------------------------------------------
 1 | # CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/mr_BLIP/train/qvh.yaml
 2 | CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run --nproc_per_node=4 train.py --cfg-path lavis/projects/mr_BLIP/train/qvhQ.yaml
--------------------------------------------------------------------------------
/run_scripts/mr_BLIP/eval/nextQA.sh:
--------------------------------------------------------------------------------
 1 | # CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/mr_BLIP/eval/qvh.yaml
 2 | CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run --nproc_per_node=4 evaluate.py --cfg-path lavis/projects/mr_BLIP/eval/nextQA.yaml
--------------------------------------------------------------------------------
/run_scripts/mr_BLIP/train/nextQA.sh:
--------------------------------------------------------------------------------
 1 | # CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run --nproc_per_node=4 train.py --cfg-path lavis/projects/mr_BLIP/train/nextQA.yaml
 2 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/mr_BLIP/train/nextQA.yaml
--------------------------------------------------------------------------------
/lavis/models/img2prompt_models/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) 2022, salesforce.com, inc.
 3 | All rights reserved.
 4 | SPDX-License-Identifier: BSD-3-Clause
 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
 6 | """
 7 | 
 8 | import torch
 9 | 
10 | 
11 | 
12 | 
--------------------------------------------------------------------------------
/lavis/models/timesformer/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Copyright (c) 2022, salesforce.com, inc.
 3 | All rights reserved.
 4 | SPDX-License-Identifier: BSD-3-Clause
 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
 6 | 
 7 | Based on https://github.com/facebookresearch/TimeSformer
 8 | """
 9 | 
--------------------------------------------------------------------------------
/lavis/configs/models/clip_resnet50.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, salesforce.com, inc.
 2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: RN50 10 | 11 | pretrained: openai 12 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 
| "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/runners/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.runners.runner_base import RunnerBase 9 | from lavis.runners.runner_iter import RunnerIter 10 | 11 | __all__ = ["RunnerBase", "RunnerIter"] 12 | -------------------------------------------------------------------------------- /lavis/configs/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | env: 7 | # For default users 8 | # cache_root: "cache" 9 | # For internal use with persistent storage 10 | cache_root: "/nas-hdd/shoubin/pretrained_model/" 11 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnet50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnet50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnetaa50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetaa50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | 
-------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnetblur50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetblur50", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_base16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-16 10 | 11 | pretrained: openai 12 | 13 | preprocess: 14 | vis_processor: 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-efficientnetv2_rw_s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "efficientnetv2_rw_s", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 288 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_base_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_base_patch32_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch32_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_small_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_small_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /standalone_eval/eval_sample.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: bash standalone_eval/eval_sample.sh 3 | # submission_path=standalone_eval/sample_val_preds.jsonl 4 | submission_path=standalone_eval/hl_val_submission.jsonl 5 | gt_path=data/annotations/QVH/highlight_val_release.jsonl 6 | save_path=standalone_eval/val_preds_metrics.json 7 | 8 | PYTHONPATH=$PYTHONPATH:. 
python standalone_eval/eval.py \ 9 | --submission_path ${submission_path} \ 10 | --gt_path ${gt_path} \ 11 | --save_path ${save_path} 12 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /lavis/configs/datasets/laion/defaults_2B_multi.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | laion2B_multi: 8 | 9 | data_type: images 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar 14 | -------------------------------------------------------------------------------- /lavis/models/clip_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/mlfoundations/open_clip 8 | """ 9 | 10 | """ OpenAI pretrained model functions 11 | Adapted from https://github.com/mlfoundations/open_clip and https://github.com/openai/CLIP. 12 | 13 | Originally MIT License, Copyright (c) 2021 OpenAI. 
14 | """ 15 | -------------------------------------------------------------------------------- /lavis/configs/models/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /lavis/configs/models/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /run_scripts/mr_BLIP/eval/anet.sh: -------------------------------------------------------------------------------- 1 | # Should return: 2 | # {"agg_metrics": 32.647, "r1": {"0.5": 53.79, "0.55": 49.43, "0.6": 44.78, "0.65": 40.21, "0.7": 35.47, "0.75": 30.73, "0.8": 25.94, "0.85": 20.9, "0.9": 15.57, "0.95": 9.65}, "mAP": {"0.5": 53.79, "0.55": 49.43, "0.6": 44.78, "0.65": 40.21, "0.7": 35.47, "0.75": 30.73, "0.8": 25.95, "0.85": 20.9, "0.9": 15.57, "0.95": 9.65, "average": 32.65}, "mIoU": 0.515230127209404, "invalid_predictions": 0.0, "total": 17032} 3 | 4 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/mr_BLIP/eval/anet.yaml -------------------------------------------------------------------------------- /lavis/configs/models/med_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 1024, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /lavis/configs/datasets/imagenet/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | imagenet: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | splits: ["val"] 14 | images: 15 | storage: /export/share/datasets/vision/imagenet 16 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_pretrain_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | # vit encoder 10 | vit_type: "large" 11 | vit_grad_ckpt: True 12 | vit_ckpt_layer: 5 13 | 14 | image_size: 224 15 | 16 | # bert config 17 | med_config_path: "configs/models/med_large_config.json" 18 | 19 | embed_dim: 256 20 | 21 | # generation configs 22 | prompt: "a picture of " 23 | -------------------------------------------------------------------------------- /run_scripts/mr_BLIP/eval/charades.sh: -------------------------------------------------------------------------------- 1 | # Should return: 2 | # {'agg_metrics': 41.40999999999999, 'r1': {'0.5': 69.31, '0.55': 65.13, '0.6': 59.48, '0.65': 55.0, '0.7': 49.29, '0.75': 41.68, '0.8': 32.9, '0.85': 23.51, '0.9': 12.46, '0.95': 5.34}, 'mAP': {'0.5': 66.96, '0.55': 62.53, '0.6': 57.18, '0.65': 52.04, '0.7': 46.43, '0.75': 39.46, '0.8': 30.58, '0.85': 20.9, '0.9': 10.31, '0.95': 4.2, 'average': 39.06}, 'mIoU': 0.5863397571818805, 'invalid_predictions': 0.0, 'total': 3720} 3 | 4 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/mr_BLIP/eval/charades.yaml -------------------------------------------------------------------------------- /lavis/tasks/image_text_pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.tasks.base_task import BaseTask 10 | 11 | 12 | @registry.register_task("image_text_pretrain") 13 | class ImageTextPretrainTask(BaseTask): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def evaluation(self, model, data_loader, cuda_enabled=True): 18 | pass 19 | -------------------------------------------------------------------------------- /run_scripts/mr_BLIP/eval/qvh.sh: -------------------------------------------------------------------------------- 1 | # Should return: 2 | # {'agg_metrics': 57.55899999999999, 'r1': {'0.5': 76.16, '0.55': 72.1, '0.6': 69.2, '0.65': 66.24, '0.7': 62.63, '0.75': 59.73, '0.8': 54.64, '0.85': 49.29, '0.9': 38.92, '0.95': 26.68}, 'mAP': {'0.5': 68.5, '0.55': 65.19, '0.6': 62.91, '0.65': 60.43, '0.7': 57.48, '0.75': 55.06, '0.8': 50.79, '0.85': 45.96, '0.9': 36.4, '0.95': 24.94, 'average': 52.77}, 'mIoU': 0.703218087517246, 'invalid_predictions': 0.014175257731958763, 'total': 1552} 3 | 4 | CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/mr_BLIP/eval/qvh.yaml -------------------------------------------------------------------------------- /lavis/configs/models/med_config_albef.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "fusion_layer": 6 22 | } -------------------------------------------------------------------------------- /lavis/configs/models/bert_config_alpro.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": true, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522, 20 | "encoder_width": 768, 21 | "add_cross_attention": false, 22 | "fusion_layer": 6 23 | } -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_classification_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_classification 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | 10 | use_distill: True 11 | momentum: 0.995 12 | alpha: 0.4 13 | 14 | # vit encoder 15 | vit_type: "base" 16 | vit_grad_ckpt: False 17 | vit_ckpt_layer: 0 18 | 19 | image_size: 384 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | -------------------------------------------------------------------------------- /lavis/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /lavis/configs/datasets/vg/defaults_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json 16 | storage: vg/annotations/vg_qa.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /lavis/configs/datasets/vg/defaults_caption.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json 16 | storage: vg/annotations/vg_caption.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /lavis/models/timesformer/linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | """ Linear layer (alternate definition) 9 | """ 10 | import torch 11 | import torch.nn.functional as F 12 | from torch import nn as nn 13 | 14 | 15 | class Linear(nn.Linear): 16 | def forward(self, input: torch.Tensor) -> torch.Tensor: 17 | if torch.jit.is_scripting(): 18 | bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None 19 | return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias) 20 | else: 21 | return F.linear(input, self.weight, self.bias) 22 | -------------------------------------------------------------------------------- /lavis/configs/datasets/conceptual_caption/defaults_3m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_3m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc3m.json 17 | storage: 18 | - conceptual_caption/annotations/cc3m.json 19 | images: 20 | storage: conceptual_caption/images 21 | -------------------------------------------------------------------------------- /lavis/configs/datasets/how2qa/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | how2qa: # name of the dataset builder 3 | # data_dir: ${env.data_dir}/datasets 4 | data_type: videos # [images|videos|features] 5 | build_info: 6 | # Be careful not to append minus sign (-) before split to avoid itemizing 7 | annotations: 8 | train: 9 | url: /nas-ssd/shoubin/datasets/how2qa/train.json 10 | storage: /nas-ssd/shoubin/datasets/how2qa/train.json 11 | val: 12 | url: /nas-ssd/shoubin/datasets/how2qa/val.json 13 | storage: /nas-ssd/shoubin/datasets/how2qa/val.json 14 | test: 15 | url: /nas-ssd/shoubin/datasets/how2qa/val.json 16 | storage: /nas-ssd/shoubin/datasets/how2qa/val.json 17 | videos: 18 | storage: /nas-hdd/shoubin/how2qa/clips/ -------------------------------------------------------------------------------- /lavis/configs/datasets/conceptual_caption/defaults_12m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_12m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc12m.json 17 | storage: 18 | - conceptual_caption/annotations/cc12m.json 19 | images: 20 | storage: conceptual_caption/images_12m 21 | -------------------------------------------------------------------------------- /lavis/configs/datasets/star/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | star: # name of the dataset builder 3 | # data_dir: ${env.data_dir}/datasets 4 | data_type: videos # [images|videos|features] 5 | build_info: 6 | # Be careful not to append minus sign (-) before split to avoid itemizing 7 | annotations: 8 | train: 9 | url: /nas-ssd/shoubin/datasets/star/train.json 10 | storage: /nas-ssd/shoubin/datasets/star/train.json 11 | val: 12 | url: /nas-ssd/shoubin/datasets/star/val.json 13 | storage: /nas-ssd/shoubin/datasets/star/val.json 14 | test: 15 | url: /nas-ssd/shoubin/datasets/star/val.json 16 | storage: /nas-ssd/shoubin/datasets/star/val.json 17 | videos: 18 | storage: /nas-hdd/shoubin/videos/charades/Charades_v1_480/ -------------------------------------------------------------------------------- /lavis/configs/datasets/vlep/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | vlep: # name of the dataset builder 3 | # data_dir: ${env.data_dir}/datasets 4 | data_type: videos # [images|videos|features] 5 | build_info: 6 | # Be careful not to append minus sign (-) before split to avoid itemizing 7 | annotations: 8 | train: 9 | url: /nas-ssd/shoubin/datasets/vlep/train.json 10 | storage: /nas-ssd/shoubin/datasets/vlep/train.json 11 | val: 12 | url: /nas-ssd/shoubin/datasets/vlep/val.json 13 | storage: /nas-ssd/shoubin/datasets/vlep/val.json 14 | test: 15 | url: /nas-ssd/shoubin/datasets/vlep/val.json 16 | storage: /nas-ssd/shoubin/datasets/vlep/val.json 17 | videos: 18 | storage: /nas-hdd/shoubin/videos/charades/Charades_v1_480/ -------------------------------------------------------------------------------- /lavis/configs/datasets/tvqa/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | tvqa: # name of the dataset builder 3 | # data_dir: ${env.data_dir}/datasets 4 | data_type: videos # [images|videos|features] 5 | build_info: 6 | # Be careful not to append minus sign (-) before split to avoid itemizing 7 | annotations: 8 | train: 9 | url: /nas-ssd/shoubin/datasets/tvqa/train.json 10 | storage: /nas-ssd/shoubin/datasets/tvqa/train.json 11 | val: 12 | url: /nas-ssd/shoubin/datasets/tvqa/val.json 13 | storage: /nas-ssd/shoubin/datasets/tvqa/val.json 14 | test: 15 | url: /nas-ssd/shoubin/datasets/tvqa/val.json 16 | storage: /nas-ssd/shoubin/datasets/tvqa/val.json 17 | videos: 18 | storage: /nas-hdd/shoubin/videos/tvqa/videos_3fps_with_audio/ -------------------------------------------------------------------------------- /lavis/datasets/builders/dialogue_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 
3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 10 | from lavis.datasets.datasets.avsd_dialogue_datasets import ( 11 | AVSDDialDataset, 12 | AVSDDialEvalDataset, 13 | ) 14 | 15 | 16 | @registry.register_builder("avsd_dialogue") 17 | class AVSDDialBuilder(BaseDatasetBuilder): 18 | train_dataset_cls = AVSDDialDataset 19 | eval_dataset_cls = AVSDDialEvalDataset 20 | 21 | DATASET_CONFIG_DICT = {"default": "configs/datasets/avsd/defaults_dial.yaml"} 22 | -------------------------------------------------------------------------------- /lavis/datasets/datasets/multimodal_classification_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from abc import abstractmethod 9 | from lavis.datasets.datasets.base_dataset import BaseDataset 10 | 11 | 12 | class MultimodalClassificationDataset(BaseDataset): 13 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 14 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 15 | 16 | self.class_labels = None 17 | 18 | @abstractmethod 19 | def _build_class_labels(self): 20 | pass 21 | 22 | @abstractmethod 23 | def _load_auxiliary_mappings(self): 24 | pass 25 | 26 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_feature_extractor_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | vit_grad_ckpt: False 13 | vit_ckpt_layer: 0 14 | 15 | image_size: 224 16 | 17 | # bert config 18 | med_config_path: "configs/models/med_config.json" 19 | 20 | embed_dim: 256 21 | 22 | preprocess: 23 | vis_processor: 24 | eval: 25 | name: "blip_image_eval" 26 | image_size: 224 27 | text_processor: 28 | eval: 29 | name: "blip_caption" 30 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msrvttmc/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | msrvttmc: # name of the dataset builder 3 | # data_dir: ${env.data_dir}/datasets 4 | data_type: videos # [images|videos|features] 5 | build_info: 6 | # Be careful not to append minus sign (-) before split to avoid itemizing 7 | # no training data for this dataset 8 | annotations: 9 | train: 10 | url: /nas-ssd/shoubin/datasets/msrvttmc/val.json 11 | storage: /nas-ssd/shoubin/datasets/msrvttmc/val.json 12 | val: 13 | url: /nas-ssd/shoubin/datasets/msrvttmc/val.json 14 | storage: /nas-ssd/shoubin/datasets/msrvttmc/val.json 15 | test: 16 | url: /nas-ssd/shoubin/datasets/msrvttmc/val.json 17 | storage: /nas-ssd/shoubin/datasets/msrvttmc/val.json 18 | videos: 19 | storage: /nas-hdd/tarbucket/terran/data/msrvtt/videos/all/ -------------------------------------------------------------------------------- /lavis/configs/models/albef_feature_extractor.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | image_size: 224 13 | vit_ckpt_layer: 0 14 | vit_drop_path_rate: 0 15 | vit_layer_norm_epsilon: 1e-6 16 | vit_grad_ckpt: False 17 | 18 | # bert config 19 | med_config_path: "configs/models/med_config_albef.json" 20 | 21 | embed_dim: 256 22 | 23 | preprocess: 24 | vis_processor: 25 | eval: 26 | name: "blip_image_eval" 27 | image_size: 224 28 | text_processor: 29 | eval: 30 | name: "blip_caption" 31 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_itm_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /lavis/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): # overlay an attention (Grad-CAM) map on an image 8 | attMap -= attMap.min() # shift values so the minimum is 0 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() # normalize to [0, 1]; skipped if the map is all zeros 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") # resize the map to the image resolution 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) # smooth with a blur proportional to image size 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() # re-normalize after blurring 16 | cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) # colorize attention values with the jet colormap 18 | attMapV = np.delete(attMapV, 3, 2) # drop the alpha channel 19 | if overlap: # alpha-blend the heatmap with the original image 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_itm_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "large" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /lavis/configs/datasets/nextqa/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nextqa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/train.json 15 | storage: Your/path/to/train.json 16 | val: 17 | url: Your/path/to/val.json 18 | storage: Your/path/to/val.json 19 | test: 20 | url: Your/path/to/val.json 21 | storage: Your/path/to/val.json 22 | videos: 23 | storage: Your/path/to/raw/NExT 24 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. LAVIS documentation master file, created by 2 | sphinx-quickstart on Sun Jul 31 10:32:27 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to LAVIS's documentation! 7 | ================================= 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | :caption: Introduction 12 | 13 | intro 14 | 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: Getting Started 19 | 20 | getting_started 21 | 22 | 23 | .. :maxdepth: 1 24 | .. :caption: Advanced Training 25 | 26 | .. advanced_training 27 | 28 | 29 | .. toctree:: 30 | :maxdepth: 2 31 | :caption: Advanced Usage 32 | 33 | benchmark 34 | tutorial 35 | 36 | 37 | .. Documentations 38 | .. =================== 39 | 40 | 41 | Indices and tables 42 | ================== 43 | 44 | * :ref:`genindex` 45 | * :ref:`modindex` 46 | * :ref:`search` 47 | -------------------------------------------------------------------------------- /lavis/configs/datasets/mixed/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | mixed: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/train.json 15 | storage: Your/path/to/train.json 16 | val: 17 | url: Your/path/to/val.json 18 | storage: Your/path/to/val.json 19 | test: 20 | url: Your/path/to/test.json 21 | storage: Your/path/to/test.json 22 | videos: 23 | storage: Your/path/to/raw/Mixed_Charades_QVH -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. 
Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /lavis/configs/datasets/tacos/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | tacos: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/train.json 15 | storage: Your/path/to/train.json 16 | val: 17 | url: Your/path/to/val_float.json 18 | storage: Your/path/to/val_float.json 19 | test: 20 | url: Your/path/to/test.json 21 | storage: Your/path/to/test.json 22 | videos: 23 | storage: Your/path/to/raw/TACoS/res_224 -------------------------------------------------------------------------------- /lavis/projects/blip/eval/nlvr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | 10 | datasets: 11 | nlvr: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: multimodal_classification 22 | 23 | batch_size_train: 16 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | seed: 42 28 | output_dir: "output/BLIP/NLVR" 29 | 30 | evaluate: True 31 | test_splits: ["val", "test"] 32 | 33 | # distribution-specific 34 | device: "cuda" 35 | world_size: 1 36 | dist_url: "env://" 37 | distributed: True 38 | -------------------------------------------------------------------------------- /lavis/projects/clip/exp_imnet_zs_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | imagenet: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "clip_image_eval" 16 | # image_size: 224 17 | image_size: 336 18 | 19 | run: 20 | task: multimodal_classification 21 | 22 | # dataloading 23 | num_workers: 4 24 | batch_size_train: 32 25 | batch_size_eval: 128 26 | 27 | test_splits: ["val"] 28 | 29 | # distribution 30 | device: "cuda" 31 | world_size: 1 32 | dist_url: "env://" 33 | distributed: True 34 | 35 | # misc 36 | seed: 42 37 | output_dir: "output/clip/zs_imnet" 38 | 39 | evaluate: True 40 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/nlvr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | model_type: nlvr 9 | 10 | datasets: 11 | nlvr: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: multimodal_classification 22 | 23 | batch_size_train: 16 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | seed: 42 28 | output_dir: "output/ALBEF/NLVR" 29 | 30 | evaluate: True 31 | test_splits: ["val", "test"] 32 | 33 | # distribution-specific 34 | device: "cuda" 35 | world_size: 1 36 | dist_url: "env://" 37 | distributed: True 38 | -------------------------------------------------------------------------------- /lavis/configs/datasets/nextgqa/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nextgqa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/train.json 15 | storage: Your/path/to/train.json 16 | val: 17 | url: Your/path/to/nextgqa/test.json 18 | storage: Your/path/to/nextgqa/test.json 19 | test: 20 | url: Your/path/to/nextgqa/test.json 21 | storage: Your/path/to/nextgqa/test.json 22 | videos: 23 | storage: Your/path/to/raw/NExT 24 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/snli_ve_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | model_type: ve 9 | 10 | datasets: 11 | snli_ve: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | task: multimodal_classification 21 | # optimization-specific 22 | batch_size_train: 32 23 | batch_size_eval: 64 24 | num_workers: 4 25 | 26 | seed: 42 27 | output_dir: "output/ALBEF/SNLI_VE" 28 | 29 | evaluate: True 30 | test_splits: ["val", "test"] 31 | 32 | # distribution-specific 33 | device: "cuda" 34 | world_size: 1 35 | dist_url: "env://" 36 | distributed: True 37 | -------------------------------------------------------------------------------- /lavis/configs/datasets/anet/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | anet: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/train.json 15 | storage: Your/path/to/train.json 16 | val: 17 | url: Your/path/to/val_float.json 18 | storage: Your/path/to/val_float.json 19 | test: 20 | url: Your/path/to/test_float.json 21 | storage: Your/path/to/test_float.json 22 | videos: 23 | storage: Your/path/to/data/raw/ANet/Anet_videos_15fps_short256 -------------------------------------------------------------------------------- /lavis/configs/datasets/sbu_caption/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | sbu_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json 17 | # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json 18 | storage: 19 | - sbu_captions/annotations/sbu.json 20 | images: 21 | storage: sbu_captions/images 22 | # storage: /export/share/datasets/vision_language/sbu_resize 23 | -------------------------------------------------------------------------------- /lavis/configs/datasets/qvh/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | qvh: # name of the dataset builder 8 | data_type: videos # [images|videos|features] 9 | build_info: 10 | # Be careful not to append minus sign (-) before split to avoid itemizing 11 | annotations: 12 | train: 13 | url: Your/path/to/train.json 14 | storage: Your/path/to/train.json 15 | val: 16 | url: Your/path/to/val.json 17 | storage: Your/path/to/val.json 18 | test: 19 | # url: Your/path/to/test_dummy.json 20 | # storage: Your/path/to/test_dummy.json 21 | url: Your/path/to/val.json 22 | storage: Your/path/to/val.json 23 | videos: 24 | storage: Your/path/to/data/raw/QVHighlights 25 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 224 18 | alpha: 0.4 19 | 20 | # bert config 21 | med_config_path: "configs/models/bert_config.json" 22 | 23 | embed_dim: 256 24 | 25 | # generation configs 26 | prompt: "a picture of " 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /lavis/configs/datasets/tacos/relative_integer.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | tacos-relative_integer: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/train_relative.json 15 | storage: Your/path/to/train_relative.json 16 | val: 17 | url: Your/path/to/val_float.json 18 | storage: Your/path/to/val_float.json 19 | test: 20 | url: Your/path/to/test_float.json 21 | storage: Your/path/to/test_float.json 22 | videos: 23 | storage: Your/path/to/raw/TACoS/res_224 -------------------------------------------------------------------------------- /lavis/configs/datasets/charades_sta/seconds_decimal.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | charades_sta-seconds_decimal: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/new_train_float.json 15 | storage: Your/path/to/new_train_float.json 16 | val: 17 | url: Your/path/to/new_val_float.json 18 | storage: Your/path/to/new_val_float.json 19 | test: 20 | url: Your/path/to/test.json 21 | storage: Your/path/to/test.json 22 | videos: 23 | storage: Your/path/to/raw/Charades -------------------------------------------------------------------------------- /lavis/configs/datasets/qvhQ/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | qvhQ: # name of the dataset builder 8 | data_type: videos # [images|videos|features] 9 | build_info: 10 | # Be careful not to append minus sign (-) before split to avoid itemizing 11 | annotations: 12 | train: 13 | url: Your/path/to/train_mcqa.json 14 | storage: Your/path/to/train_mcqa.json 15 | val: 16 | url: Your/path/to/val.json 17 | storage: Your/path/to/val.json 18 | test: 19 | # url: Your/path/to/test_dummy.json 20 | # storage: Your/path/to/test_dummy.json 21 | url: Your/path/to/val.json 22 | storage: Your/path/to/val.json 23 | videos: 24 | storage: Your/path/to/data/raw/QVHighlights 25 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/caption_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: base_coco 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | # task: retrieval 21 | task: captioning 22 | # optimizer 23 | batch_size_train: 32 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | max_len: 20 28 | min_len: 5 29 | num_beams: 3 30 | 31 | seed: 42 32 | output_dir: "output/BLIP/Caption_coco" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/configs/datasets/charades_sta/relative_integer.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | charades_sta-relative_integer: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/new_train_relative.json 15 | storage: Your/path/to/new_train_relative.json 16 | val: 17 | url: Your/path/to/new_val_float.json 18 | storage: Your/path/to/new_val_float.json 19 | test: 20 | url: Your/path/to/test.json 21 | storage: Your/path/to/test.json 22 | videos: 23 | storage: Your/path/to/raw/Charades -------------------------------------------------------------------------------- /lavis/projects/blip/eval/caption_coco_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: large_coco 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | # task: retrieval 21 | task: captioning 22 | # optimizer 23 | batch_size_train: 32 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | max_len: 20 28 | min_len: 5 29 | num_beams: 3 30 | 31 | seed: 42 32 | output_dir: "output/BLIP/Caption_coco" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/configs/models/gpt_dialogue_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 10 | 11 | len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens 12 | 13 | len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "gpt_video_ft" 19 | eval: 20 | name: "gpt_video_ft" 21 | text_processor: 22 | train: 23 | name: "gpt_dialogue" 24 | eval: 25 | name: "gpt_dialogue" -------------------------------------------------------------------------------- /lavis/configs/datasets/charades_sta/relative_decimal.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | charades_sta-relative_decimal: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/new_train_relative_float.json 15 | storage: Your/path/to/new_train_relative_float.json 16 | val: 17 | url: Your/path/to/new_val_float.json 18 | storage: Your/path/to/new_val_float.json 19 | test: 20 | url: Your/path/to/test.json 21 | storage: Your/path/to/test.json 22 | videos: 23 | storage: Your/path/to/raw/Charades -------------------------------------------------------------------------------- /lavis/configs/datasets/nocaps/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nocaps: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json 16 | storage: nocaps/annotations/nocaps_val.json 17 | test: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json 19 | storage: nocaps/annotations/nocaps_test.json 20 | images: 21 | storage: nocaps/images 22 | # storage: /export/share/datasets/vision/nocaps/ 23 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 224 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msrvtt_qa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | model_type: msrvtt 9 | 10 | datasets: 11 | msrvtt_qa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 16 16 | image_size: 224 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: multimodal_classification 23 | # optimization-specific 24 | batch_size_train: 32 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | seed: 42 29 | output_dir: "output/ALPRO/msrvtt_qa" 30 | 31 | evaluate: True 32 | valid_splits: ["val"] 33 | test_splits: ["test"] 34 | 35 | # distribution-specific 36 | device: "cuda" 37 | world_size: 1 38 | dist_url: "env://" 39 | distributed: True 40 | -------------------------------------------------------------------------------- /lavis/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.registry import registry 14 | 15 | from lavis.datasets.builders import * 16 | from lavis.models import * 17 | from lavis.processors import * 18 | from lavis.tasks import * 19 | 20 | 21 | root_dir = os.path.dirname(os.path.abspath(__file__)) 22 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 23 | 24 | registry.register_path("library_root", root_dir) 25 | repo_root = os.path.join(root_dir, "..") 26 | registry.register_path("repo_root", repo_root) 27 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 28 | registry.register_path("cache_root", cache_root) 29 | 30 | registry.register("MAX_INT", sys.maxsize) 31 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 32 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | image_size: 224 15 | vit_ckpt_layer: 0 16 | vit_drop_path_rate: 0 17 | vit_layer_norm_epsilon: 1e-6 18 | vit_grad_ckpt: False 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config_albef.json" 22 | mlm_mask_prob: 0.15 23 | 24 | embed_dim: 256 25 | momentum: 0.995 26 | alpha: 0.4 27 | temp: 0.07 28 | 29 | max_txt_len: 30 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 256 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msvd_qa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | model_type: msvd 9 | 10 | datasets: 11 | msvd_qa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 16 16 | image_size: 224 17 | text_processor: 18 | train: 19 | name: "blip_caption" 20 | eval: 21 | name: "blip_caption" 22 | 23 | run: 24 | task: multimodal_classification 25 | # optimization-specific 26 | batch_size_train: 24 27 | batch_size_eval: 64 28 | num_workers: 4 29 | 30 | seed: 42 31 | output_dir: "output/ALPRO/msvd_qa" 32 | 33 | evaluate: True 34 | test_splits: ["test"] 35 | 36 | # distribution-specific 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/configs/datasets/nextqa/defaults_qa_old.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nextqa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: /nas-ssd/shoubin/datasets/nextqa/train.json 15 | storage: /nas-ssd/shoubin/datasets/nextqa/train.json 16 | val: 17 | url: /nas-ssd/shoubin/datasets/nextqa/val.json 18 | storage: /nas-ssd/shoubin/datasets/nextqa/val.json 19 | test: 20 | url: /nas-ssd/shoubin/datasets/nextqa/val.json 21 | storage: /nas-ssd/shoubin/datasets/nextqa/val.json 22 | videos: 23 | storage: /nas-hdd/shoubin/videos/vidor/videos/ 24 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/ret_flickr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | model_type: flickr 9 | 10 | datasets: 11 | flickr30k: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/Retrieval_Flickr30k" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | model_type: coco 9 | 10 | datasets: 11 | coco_retrieval: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/ALBEF/Retrieval_COCO" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/configs/datasets/flickr30k/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | flickr30k: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images 10 | 11 | build_info: 12 | annotations: 13 | train: 14 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json 15 | storage: flickr30k/annotations/train.json 16 | val: 17 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json 18 | storage: flickr30k/annotations/val.json 19 | test: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json 21 | storage: flickr30k/annotations/test.json 22 | images: 23 | storage: flickr30k/images 24 | # storage: /export/share/datasets/vision/flickr30k 25 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/ret_flickr30k_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | model_type: flickr 9 | 10 | datasets: 11 | flickr30k: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/ALBEF/Retrieval_Flickr30k" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/ret_flickr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2 8 | model_type: coco 9 | use_grad_checkpoint: False 10 | 11 | datasets: 12 | flickr30k: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 364 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: retrieval 23 | 24 | # dataloading 25 | num_workers: 4 26 | batch_size_train: 16 27 | batch_size_eval: 32 28 | 29 | test_splits: ["test"] 30 | 31 | # distribution 32 | device: "cuda" 33 | world_size: 1 34 | dist_url: "env://" 35 | distributed: True 36 | use_dist_eval_sampler: False 37 | 38 | # model specific 39 | k_test: 128 40 | 41 | # misc 42 | seed: 42 43 | output_dir: "output/BLIP2/Retrieval_Flickr30k" 44 | 45 | evaluate: True -------------------------------------------------------------------------------- /lavis/configs/models/alpro_retrieval_didemo.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | eval: 30 | name: "alpro_video_eval" 31 | n_frms: 8 32 | image_size: 224 33 | text_processor: 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msrvtt_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | model_type: msrvtt 9 | 10 | datasets: 11 | msrvtt_retrieval: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 8 16 | image_size: 224 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: retrieval 23 | # optimization-specific 24 | batch_size_train: 24 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | # k_test: 256 29 | k_test: 1000 30 | 31 | seed: 42 32 | output_dir: "output/ALPRO/msrvtt_retrieval" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | # distribution-specific 38 | device: "cuda" 39 | world_size: 1 40 | dist_url: "env://" 41 | distributed: True 42 | use_dist_eval_sampler: False 43 | -------------------------------------------------------------------------------- /lavis/projects/gpt/eval/dialogue_avsd_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | model_type: base 9 | 10 | datasets: 11 | avsd_dialogue: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "gpt_video_ft" 15 | visual_ft: ["i3d_flow", "i3d_rgb"] 16 | audio_ft: ["vggish"] 17 | text_processor: 18 | eval: 19 | name: "gpt_dialogue" 20 | max_turns: 3 21 | use_caption: True 22 | 23 | run: 24 | task: dialogue 25 | # optimizer 26 | batch_size_train: 16 27 | batch_size_eval: 16 28 | num_workers: 0 29 | 30 | max_len: 20 31 | min_len: 5 32 | num_beams: 5 33 | 34 | seed: 42 35 | output_dir: "output/gpt2/dialogue_avsd" 36 | 37 | evaluate: True 38 | valid_splits: ["test"] 39 | 40 | device: "cuda" 41 | world_size: 1 42 | dist_url: "env://" 43 | distributed: True 44 | -------------------------------------------------------------------------------- /lavis/datasets/builders/temporal_action_localization_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.common.utils import get_cache_path 10 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 11 | from lavis.datasets.datasets.temporal_action_localization_dataset import ( 12 | TemporalActionLocalizationDataset, 13 | ) 14 | 15 | 16 | class TemporalActionLocalizationBuilder(BaseDatasetBuilder): 17 | train_dataset_cls = TemporalActionLocalizationDataset 18 | eval_dataset_cls = TemporalActionLocalizationDataset 19 | 20 | def build(self): 21 | datasets = super().build() 22 | 23 | return datasets 24 | 25 | 26 | @registry.register_builder("anet_TAL") 27 | class ANetTALBuilder(TemporalActionLocalizationBuilder): 28 | DATASET_CONFIG_DICT = { 29 | "default": "configs/datasets/anet_TAL/defaults.yaml", 30 | } 31 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/okvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: okvqa 9 | image_size: 480 10 | 11 | datasets: 12 | ok_vqa: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 480 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa 23 | # optimization-specific 24 | batch_size_train: 16 25 | batch_size_eval: 16 26 | num_workers: 4 27 | 28 | # inference-specific 29 | max_len: 10 30 | min_len: 1 31 | num_beams: 3 32 | num_ans_candidates: 128 33 | inference_method: "rank" 34 | 35 | seed: 42 36 | output_dir: "output/BLIP/OKVQA" 37 | 38 | evaluate: True 39 | test_splits: ["test"] 40 | 41 | # distribution-specific 42 | device: "cuda" 43 | world_size: 1 44 | dist_url: "env://" 45 | distributed: True 46 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: coco 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: True 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 364 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 364 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /lavis/configs/datasets/nlvr/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nlvr: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json 16 | storage: nlvr/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 19 | storage: nlvr/annotations/dev.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 22 | storage: nlvr/annotations/test.json 23 | images: 24 | storage: /export/share/datasets/vision/NLVR2/ 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/snli_ve/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | snli_ve: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json 16 | storage: snli/annotations/ve_train.json 17 | val: 18 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json 19 | storage: snli/annotations/ve_dev.json 20 | test: 21 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json 22 | storage: snli/annotations/ve_test.json 23 | images: 24 | storage: flickr30k/images/flickr30k-images 25 | # storage: /export/share/datasets/vision/flickr30k/flickr30k-images 26 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/aokvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: aokvqa 9 | image_size: 480 10 | 11 | datasets: 12 | aok_vqa: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 480 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: aok_vqa 23 | # optimization-specific 24 | batch_size_train: 64 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | # inference-specific 29 | max_len: 10 30 | min_len: 1 31 | num_beams: 3 32 | num_ans_candidates: 128 33 | inference_method: "rank" 34 | 35 | seed: 42 36 | output_dir: "output/BLIP/AOKVQA" 37 | 38 | evaluate: True 39 | test_splits: ["val", "test"] 40 | 41 | # distribution-specific 42 | device: "cuda" 43 | world_size: 1 44 | dist_url: "env://" 45 | distributed: True 46 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/vqav2_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: vqav2 9 | image_size: 480 10 | 11 | datasets: 12 | coco_vqa: # name of the dataset builder 13 | type: eval 14 | vis_processor: 15 | eval: 16 | name: "blip_image_eval" 17 | image_size: 480 18 | text_processor: 19 | eval: 20 | name: "blip_question" 21 | 22 | run: 23 | task: vqa 24 | # optimization-specific 25 | batch_size_train: 16 26 | batch_size_eval: 64 27 | num_workers: 4 28 | 29 | # inference-specific 30 | max_len: 10 31 | min_len: 1 32 | num_beams: 3 33 | num_ans_candidates: 128 34 | inference_method: "rank" 35 | 36 | seed: 42 37 | output_dir: "output/BLIP/VQA" 38 | 39 | evaluate: True 40 | test_splits: ["val"] 41 | 42 | # distribution-specific 43 | device: "cuda" 44 | world_size: 1 45 | dist_url: "env://" 46 | distributed: True 47 | -------------------------------------------------------------------------------- /lavis/datasets/builders/classification_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 10 | from lavis.datasets.datasets.nlvr_datasets import NLVRDataset, NLVREvalDataset 11 | from lavis.datasets.datasets.snli_ve_datasets import SNLIVisualEntialmentDataset 12 | 13 | 14 | @registry.register_builder("nlvr") 15 | class NLVRBuilder(BaseDatasetBuilder): 16 | train_dataset_cls = NLVRDataset 17 | eval_dataset_cls = NLVREvalDataset 18 | 19 | DATASET_CONFIG_DICT = {"default": "configs/datasets/nlvr/defaults.yaml"} 20 | 21 | 22 | @registry.register_builder("snli_ve") 23 | class SNLIVisualEntailmentBuilder(BaseDatasetBuilder): 24 | train_dataset_cls = SNLIVisualEntialmentDataset 25 | eval_dataset_cls = SNLIVisualEntialmentDataset 26 | 27 | DATASET_CONFIG_DICT = {"default": "configs/datasets/snli_ve/defaults.yaml"} 28 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ffmpeg # ==1.4 2 | ffmpeg-python 3 | future==0.18.2 4 | glob2==0.7 5 | imageio==2.9.0 6 | matplotlib==3.4.2 7 | moviepy==1.0.3 8 | numpy # ==1.17.4 9 | # pandas==1.3.1 10 | pickleshare==0.7.5 11 | # Pillow==7.0.0 12 | protobuf==3.15.6 13 | python-dateutil==2.8.2 14 | pytube==15.0.0 15 | PyYAML==5.2 16 | scikit-learn==0.24.2 17 | scikit-video==1.1.11 18 | scipy==1.7.1 19 | six==1.12.0 20 | tabulate==0.9.0 21 | # tensorboard==2.5.0 22 | tensorboard==2.11 23 | tensorboardX==2.1 24 | # torch==1.13.1 25 | # torch==1.13.1+cu117 26 | # torchtext==0.14.1 27 | # tqdm # ==4.36.1 28 | tzdata==2023.3 29 | 30 | contexttimer 31 | decord 32 | einops>=0.4.1 33 | fairscale==0.4.4 34 | ftfy 35 | iopath 36 | ipython 37 | omegaconf 38 | opencv-python-headless==4.5.5.64 39 | opendatasets 40 | packaging 41 | pandas 42 | plotly 43 | pre-commit 44 | pycocoevalcap 45 | pycocotools 46 | python-magic 47 | scikit-image 48 | sentencepiece 49 | spacy 50 | # streamlit 51 | timm==0.4.12 52 | # torchvision 53 | tqdm 54 | # transformers>=4.25.0 55 | # transformers==4.25.1 56 | transformers==4.46.1 57 | wheel 
58 | 59 | peft==0.13.0 60 | wandb==0.18.3 61 | 62 | av 63 | webdataset==0.2.100 -------------------------------------------------------------------------------- /lavis/configs/datasets/msvd/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json 16 | storage: msvd/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json 19 | storage: msvd/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json 22 | storage: msvd/annotations/cap_test.json 23 | videos: 24 | storage: msvd/videos 25 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_caption_large_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth" 12 | 13 | vit_type: "large" 14 | vit_grad_ckpt: True 15 | vit_ckpt_layer: 5 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | # generation configs 23 | prompt: "a picture of " 24 | 25 | 26 | preprocess: 27 | vis_processor: 28 | train: 29 | name: "blip_image_train" 30 | eval: 31 | name: "blip_image_eval" 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | prompt: "a picture of " 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqa_okvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/vqa_val.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | datasets: 13 | coco_vqa: # name of the dataset builder 14 | type: eval 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | 23 | run: 24 | task: vqa 25 | 26 | # optimization-specific 27 | batch_size_train: 16 28 | batch_size_eval: 64 29 | num_workers: 4 30 | 31 | # inference-specific 32 | max_len: 10 33 | min_len: 1 34 | num_beams: 3 35 | num_ans_candidates: 128 36 | inference_method: "rank" 37 | 38 | seed: 42 39 | output_dir: "output/ALBEF/VQA" 40 | 41 | evaluate: True 42 | test_splits: ["val"] 43 | 44 | # distribution-specific 45 | device: "cuda" 46 | world_size: 1 47 | dist_url: "env://" 48 | distributed: True 49 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqa_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt2.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt6.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/projects/clip/exp_coco_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | coco_retrieval: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "clip_image_train" 16 | image_size: 336 17 | eval: 18 | name: "clip_image_eval" 19 | image_size: 336 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: retrieval 28 | 29 | # dataloading 30 | num_workers: 4 31 | batch_size_train: 32 32 | batch_size_eval: 128 33 | 34 | test_splits: ["test"] 35 | 36 | # distribution 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | use_dist_eval_sampler: True 42 | 43 | # misc 44 | seed: 42 45 | output_dir: "output/clip/Retrieval_COCO" 46 | 47 | evaluate: True 48 | -------------------------------------------------------------------------------- /lavis/projects/clip/exp_flickr_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | flickr30k: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "clip_image_train" 16 | image_size: 336 17 | eval: 18 | name: "clip_image_eval" 19 | image_size: 336 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: retrieval 28 | 29 | # dataloading 30 | num_workers: 4 31 | batch_size_train: 32 32 | batch_size_eval: 128 33 | 34 | test_splits: ["test"] 35 | 36 | # distribution 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | use_dist_eval_sampler: True 42 | 43 | # misc 44 | seed: 42 45 | output_dir: "output/clip/Retrieval_Flickr" 46 | 47 | evaluate: True 48 | -------------------------------------------------------------------------------- /lavis/configs/datasets/charades_sta/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | charades_sta: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | build_info: 11 | # Be careful not to append minus sign (-) before split to avoid itemizing 12 | annotations: 13 | train: 14 | url: Your/path/to/train.json 15 | storage: Your/path/to/train.json 16 | # url: Your/path/to/new_train.json 17 | # storage: Your/path/to/new_train.json 18 | val: 19 | url: Your/path/to/test_float.json 20 | storage: Your/path/to/test_float.json 21 | # url: Your/path/to/new_val_float.json 22 | # storage: Your/path/to/new_val_float.json 23 | test: 24 | url: Your/path/to/test_float.json 25 | storage: Your/path/to/test_float.json 26 | videos: 27 | storage: Your/path/to/data/raw/Charades -------------------------------------------------------------------------------- /lavis/configs/datasets/msrvtt/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json 16 | storage: msrvtt/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json 19 | storage: msrvtt/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json 22 | storage: msrvtt/annotations/cap_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xxl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xxl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | vit_grad_ckpt: True 18 | vit_ckpt_layer: 4 19 | 20 | image_size: 384 21 | 22 | # bert config 23 | med_config_path: "configs/models/med_config.json" 24 | 25 | embed_dim: 256 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/vqa_test.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | 13 | datasets: 14 | coco_vqa: # name of the dataset builder 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | 23 | run: 24 | task: vqa 25 | 26 | # optimization-specific 27 | batch_size_train: 16 28 | batch_size_eval: 64 29 | num_workers: 4 30 | 31 | # inference-specific 32 | max_len: 10 33 | min_len: 1 34 | num_beams: 3 35 | num_ans_candidates: 128 36 | inference_method: "rank" 37 | 38 | seed: 42 39 | output_dir: "output/ALBEF/VQA" 40 | 41 | evaluate: True 42 | train_splits: ["train"] 43 | test_splits: ["test"] 44 | 45 | # distribution-specific 46 | device: "cuda" 47 | world_size: 1 48 | dist_url: "env://" 49 | distributed: True 50 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_classification_ve.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt" 11 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 12 | 13 | num_classes: 3 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | eval: 35 | name: "blip_image_eval" 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | eval: 40 | name: "blip_caption" 41 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 13 | 14 | num_classes: 2 15 | 16 | # vit encoder 17 | vit_type: "base" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | 22 | image_size: 384 23 | 24 | # bert config 25 | med_config_path: "configs/models/med_config.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt" 12 | 13 | use_distill: True 14 | momentum: 0.995 15 | alpha: 0.4 16 | 17 | # vit encoder 18 | vit_type: "base" 19 | vit_grad_ckpt: False 20 | vit_ckpt_layer: 0 21 | vit_layer_norm_epsilon: 1e-6 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config_albef.json" 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 384 33 | eval: 34 | name: "blip_image_eval" 35 | image_size: 384 36 | text_processor: 37 | train: 38 | name: "blip_question" 39 | eval: 40 | name: "blip_question" 41 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_caption_base_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | 18 | image_size: 384 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config.json" 22 | 23 | # generation configs 24 | prompt: "a picture of " 25 | 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | eval: 32 | name: "blip_image_eval" 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | prompt: "a picture of " 37 | eval: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /lavis/configs/datasets/vatex/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json 16 | storage: vatex/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json 19 | storage: vatex/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json 22 | storage: vatex/annotations/cap_test.json 23 | videos: 24 | storage: /export/share/dongxuli/data/vatex 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/avsd/defaults_dial.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | avsd_dialogue: # name of the dataset builder 8 | dataset_card: dataset_card/avsd_dialogue.md 9 | data_type: features #extracted features of videos (I3D, VGGish) # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_train.json 16 | storage: avsd/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_val.json 19 | storage: avsd/annotations/val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_test.json 22 | storage: avsd/annotations/test.json 23 | features: 24 | storage: avsd/features/ 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msrvtt/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_train.json 16 | storage: msrvtt/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_val.json 19 | storage: msrvtt/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_test.json 22 | storage: msrvtt/annotations/retrieval_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/nocaps_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: base_coco 9 | # pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth' 10 | 11 | datasets: 12 | nocaps: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | prompt: "a picture of " 21 | 22 | run: 23 | # task: retrieval 24 | task: captioning 25 | # optimizer 26 | batch_size_train: 32 27 | batch_size_eval: 64 28 | num_workers: 4 29 | 30 | max_len: 20 31 | min_len: 5 32 | num_beams: 3 33 | 34 | seed: 42 35 | output_dir: "output/BLIP/NoCaps" 36 | 37 | evaluate: True 38 | test_splits: ["val", "test"] 39 | 40 | device: "cuda" 41 | world_size: 1 42 | dist_url: "env://" 43 | distributed: True 44 | 45 | report_metric: False 46 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt" 12 | 13 | num_classes: 2 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/datasets/datasets/vg_vqa_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | 10 | from PIL import Image 11 | 12 | from lavis.datasets.datasets.vqa_datasets import VQADataset 13 | 14 | 15 | class VGVQADataset(VQADataset): 16 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 17 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 18 | 19 | def __getitem__(self, index): 20 | ann = self.annotation[index] 21 | 22 | image_path = os.path.join(self.vis_root, ann["image"]) 23 | image = Image.open(image_path).convert("RGB") 24 | 25 | image = self.vis_processor(image) 26 | question = self.text_processor(ann["question"]) 27 | 28 | answers = [ann["answer"]] 29 | # TODO this should be configured better 30 | weights = [0.2] 31 | 32 | return { 33 | "image": image, 34 | "text_input": question, 35 | "answers": answers, 36 | "weights": weights, 37 | } 38 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_flickr_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | alpha: 0.4 15 | 16 | negative_all_rank: False 17 | 18 | # vit encoder 19 | vit_type: "base" 20 | vit_grad_ckpt: True 21 | vit_ckpt_layer: 4 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config.json" 27 | 28 | embed_dim: 256 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/alpro_retrieval_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "alpro_video_train" 31 | n_frms: 8 32 | image_size: 224 33 | eval: 34 | name: "alpro_video_eval" 35 | n_frms: 8 36 | image_size: 224 37 | text_processor: 38 | train: 39 | name: "blip_caption" 40 | eval: 41 | name: "blip_caption" 42 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/didemo_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | model_type: didemo 9 | 10 | max_txt_len: 50 11 | 12 | timesformer: 13 | n_frms: 8 14 | image_size: 224 15 | 16 | 17 | datasets: 18 | didemo_retrieval: # name of the dataset builder 19 | vis_processor: 20 | eval: 21 | name: "alpro_video_eval" 22 | n_frms: 8 23 | image_size: 224 24 | text_processor: 25 | eval: 26 | name: "blip_caption" 27 | 28 | run: 29 | task: retrieval 30 | # optimization-specific 31 | batch_size_train: 8 32 | batch_size_eval: 64 33 | num_workers: 4 34 | 35 | # k_test: 256 36 | k_test: 1000 37 | 38 | seed: 42 39 | output_dir: "output/ALPRO/didemo_retrieval" 40 | 41 | evaluate: True 42 | train_splits: ["train"] 43 | valid_splits: ["val", "test"] 44 | test_splits: ["test"] 45 | 46 | # distribution-specific 47 | device: "cuda" 48 | world_size: 1 49 | dist_url: "env://" 50 | distributed: True 51 | use_dist_eval_sampler: False 52 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | model_type: coco 9 | 10 | datasets: 11 | coco_retrieval: # name of the dataset builder 12 | vis_processor: 13 | train: 14 | name: "blip_image_train" 15 | image_size: 384 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | eval: 23 | name: "blip_caption" 24 | 25 | run: 26 | task: retrieval 27 | 28 | # dataloading 29 | num_workers: 4 30 | batch_size_train: 32 31 | batch_size_eval: 128 32 | 33 | train_splits: ["train"] 34 | valid_splits: ["val"] 35 | test_splits: ["test"] 36 | 37 | # distribution 38 | device: "cuda" 39 | world_size: 1 40 | dist_url: "env://" 41 | distributed: True 42 | use_dist_eval_sampler: False 43 | 44 | # model specific 45 | k_test: 256 46 | 47 | # misc 48 | seed: 42 49 | output_dir: "output/BLIP/Retrieval_COCO" 50 | 51 | evaluate: True 52 | -------------------------------------------------------------------------------- /lavis/configs/models/alpro_qa_msvd.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 2423 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 
22 | drop_path_rate: 0.1 23 | use_grad_ckpt: True 24 | ckpt_layer: 12 25 | 26 | # bert config 27 | med_config_path: "configs/models/bert_config_alpro.json" 28 | 29 | preprocess: 30 | vis_processor: 31 | train: 32 | name: "alpro_video_train" 33 | n_frms: 16 34 | image_size: 224 35 | eval: 36 | name: "alpro_video_eval" 37 | n_frms: 16 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt2.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt2.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt6.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt6.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_caption_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_flant5xl 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_flant5xl.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/datasets/didemo/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | didemo_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_train.json 16 | storage: didemo/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_val.json 19 | storage: didemo/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_test.json 22 | storage: didemo/annotations/retrieval_test.json 23 | videos: 24 | storage: didemo/videos 25 | # storage: /export/share/dongxuli/data/didemo_retrieval/videos 26 | -------------------------------------------------------------------------------- /lavis/configs/models/alpro_qa_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 1500 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 
22 | drop_path_rate: 0.1 23 | 24 | use_grad_ckpt: True 25 | ckpt_layer: 12 26 | 27 | # bert config 28 | med_config_path: "configs/models/bert_config_alpro.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "alpro_video_train" 34 | n_frms: 16 35 | image_size: 224 36 | eval: 37 | name: "alpro_video_eval" 38 | n_frms: 16 39 | image_size: 224 40 | text_processor: 41 | train: 42 | name: "blip_caption" 43 | eval: 44 | name: "blip_caption" 45 | -------------------------------------------------------------------------------- /lavis/configs/datasets/coco/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_retrieval: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 16 | md5: aa31ac474cf6250ebb81d18348a07ed8 17 | storage: coco/annotations/coco_karpathy_train.json 18 | val: 19 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 20 | md5: b273847456ef5580e33713b1f7de52a0 21 | storage: coco/annotations/coco_karpathy_val.json 22 | test: 23 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 24 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 25 | storage: coco/annotations/coco_karpathy_test.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_coco_retrieval_lavis.pt" 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_flickr_retrieval_lavis.pt 12 | 13 | queue_size: 65536 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | image_size: 384 18 | vit_ckpt_layer: 0 19 | vit_drop_path_rate: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | vit_grad_ckpt: False 22 | 23 | # bert config 24 | med_config_path: "configs/models/med_config_albef.json" 25 | 26 | embed_dim: 256 27 | momentum: 0.995 28 | alpha: 0.4 29 | temp: 0.07 30 | use_distill: True 31 | 32 | max_txt_len: 30 33 | 34 | preprocess: 35 | vis_processor: 36 | train: 37 | name: "blip_image_train" 38 | image_size: 384 39 | eval: 40 | name: "blip_image_eval" 41 | image_size: 384 42 | text_processor: 43 | train: 44 | name: "blip_caption" 45 | eval: 46 | name: "blip_caption" 47 | -------------------------------------------------------------------------------- /lavis/configs/datasets/gqa/balanced_val.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | gqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json 17 | storage: 18 | - gqa/annotations/train_balanced_questions.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/val_balanced_questions.json 22 | storage: 23 | - gqa/annotations/val_balanced_questions.json 24 | test: 25 | url: 26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json 27 | storage: 28 | - gqa/annotations/test_balanced_questions.json 29 | images: 30 | storage: gqa/images/ 31 | -------------------------------------------------------------------------------- /lavis/configs/datasets/gqa/balanced_testdev.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | gqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/train_balanced_questions.json 17 | storage: 18 | - gqa/annotations/train_balanced_questions.json 19 | val: 20 | url: 21 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/testdev_balanced_questions.json 22 | storage: 23 | - gqa/annotations/testdev_balanced_questions.json 24 | test: 25 | url: 26 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/gqa/test_balanced_questions.json 27 | storage: 28 | - gqa/annotations/test_balanced_questions.json 29 | images: 30 | storage: gqa/images/ 31 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/caption_coco_opt2.7b_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Bleu_1: 0.832 7 | # Bleu_2: 0.691 8 | # Bleu_3: 0.556 9 | # Bleu_4: 0.438 10 | # METEOR: 0.317 11 | # ROUGE_L: 0.620 12 | # CIDEr: 1.461 13 | # SPICE: 0.252 14 | 15 | model: 16 | arch: blip2_opt 17 | model_type: caption_coco_opt2.7b 18 | use_grad_checkpoint: False 19 | 20 | datasets: 21 | coco_caption: # name of the dataset builder 22 | vis_processor: 23 | eval: 24 | name: "blip_image_eval" 25 | image_size: 364 26 | text_processor: 27 | eval: 28 | name: "blip_caption" 29 | # build_info: 30 | # images: 31 | # storage: '/export/share/datasets/vision/coco/images/' 32 | 33 | run: 34 | task: captioning 35 | # optimizer 36 | batch_size_train: 32 37 | batch_size_eval: 16 38 | num_workers: 4 39 | 40 | max_len: 30 41 | min_len: 8 42 | num_beams: 5 43 | 44 | seed: 42 45 | output_dir: "output/BLIP2/Caption_coco_opt2.7b" 46 | 47 | evaluate: True 48 | test_splits: ["test"] 49 | 50 | device: "cuda" 51 | world_size: 1 52 | dist_url: "env://" 53 | distributed: True 54 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/caption_coco_opt6.7b_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Bleu_1: 0.831 7 | # Bleu_2: 0.689 8 | # Bleu_3: 0.552 9 | # Bleu_4: 0.434 10 | # METEOR: 0.316 11 | # ROUGE_L: 0.618 12 | # CIDEr: 1.451 13 | # SPICE: 0.251 14 | 15 | model: 16 | arch: blip2_opt 17 | model_type: caption_coco_opt6.7b 18 | use_grad_checkpoint: False 19 | 20 | datasets: 21 | coco_caption: # name of the dataset builder 22 | vis_processor: 23 | eval: 24 | name: "blip_image_eval" 25 | image_size: 364 26 | text_processor: 27 | eval: 28 | name: "blip_caption" 29 | # build_info: 30 | # images: 31 | # storage: '/export/share/datasets/vision/coco/images/' 32 | 33 | run: 34 | task: captioning 35 | # optimizer 36 | batch_size_train: 32 37 | batch_size_eval: 16 38 | num_workers: 4 39 | 40 | max_len: 30 41 | min_len: 8 42 | num_beams: 5 43 | 44 | seed: 42 45 | output_dir: "output/BLIP2/Caption_coco_opt6.7b" 46 | 47 | evaluate: True 48 | test_splits: ["test"] 49 | 50 | device: "cuda" 51 | world_size: 1 52 | dist_url: "env://" 53 | distributed: True 54 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/gqa_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Overall Accuracy is: 43.98 7 | model: 8 | arch: blip2_t5 9 | model_type: pretrain_flant5xl 10 | use_grad_checkpoint: False 11 | 12 | datasets: 13 | gqa: # name of the dataset builder 14 | type: balanced_testdev 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 224 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | build_info: 23 | images: 24 | storage: "/export/share/datasets/vision/GQA/images/" 25 | 26 | run: 27 | task: gqa 28 | # optimization-specific 29 | batch_size_train: 16 30 | batch_size_eval: 64 31 | num_workers: 4 32 | 33 | # inference-specific 34 | max_len: 10 35 | min_len: 1 36 | num_beams: 5 37 | inference_method: "generate" 38 | prompt: "Question: {} Short answer:" 39 | 40 | seed: 42 41 | output_dir: "output/BLIP2/GQA" 42 | 43 | evaluate: True 44 | test_splits: ["val"] 45 | 46 | # distribution-specific 47 | device: "cuda" 48 | world_size: 1 49 | dist_url: "env://" 50 | distributed: True 51 | -------------------------------------------------------------------------------- /lavis/projects/albef/train/snli_ve_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | model_type: ve 9 | load_finetuned: False 10 | num_classes: 3 11 | 12 | datasets: 13 | snli_ve: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | eval: 18 | name: "blip_image_eval" 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | eval: 23 | name: "blip_caption" 24 | 25 | run: 26 | task: multimodal_classification 27 | # optimization-specific 28 | lr_sched: "linear_warmup_cosine_lr" 29 | init_lr: 2e-5 30 | min_lr: 0 31 | weight_decay: 0.05 32 | max_epoch: 10 33 | batch_size_train: 32 34 | batch_size_eval: 64 35 | num_workers: 4 36 | 37 | seed: 42 38 | output_dir: "output/ALBEF/SNLI_VE" 39 | 40 | amp: False 41 | resume_ckpt_path: null 42 | 43 | evaluate: False 44 | train_splits: ["train"] 45 | valid_splits: ["val"] 46 | test_splits: ["test"] 47 | 48 | # distribution-specific 49 | device: "cuda" 50 | world_size: 1 51 | dist_url: "env://" 52 | distributed: True 53 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_base32.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-32 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_large14.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 224 53 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_large14_336.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | # ['RN50', 11 | # 'RN50-quickgelu', 12 | # 'RN50x4', 13 | # 'RN50x16', 14 | # 'RN101', 15 | # 'RN101-quickgelu', 16 | # 'timm-efficientnetv2_rw_s', 17 | # 'timm-resnet50d', 18 | # 'timm-resnetaa50d', 19 | # 'timm-resnetblur50', 20 | # 'timm-swin_base_patch4_window7_224', 21 | # 'timm-vit_base_patch16_224', 22 | # 'timm-vit_base_patch32_224', 23 | # 'timm-vit_small_patch16_224', 24 | # 'ViT-B-16', 25 | # 'ViT-B-16-plus', 26 | # 'ViT-B-16-plus-240', 27 | # 'ViT-B-32', 28 | # 'ViT-B-32-plus-256', 29 | # 'ViT-B-32-quickgelu', 30 | # 'ViT-g-14', 31 | # 'ViT-H-14', 32 | # 'ViT-H-16', 33 | # 'ViT-L-14', 34 | # 'ViT-L-14-280', 35 | # 'ViT-L-14-336', 36 | # 'ViT-L-16', 37 | # 'ViT-L-16-320'] 38 | 39 | pretrained: openai 40 | # "openai" 41 | # following not available for all models 42 | # "yfcc15m" 43 | # "cc12m" 44 | # "laion400m_e31" 45 | # "laion400m_e32" 46 | # "laion400m_avg" 47 | 48 | preprocess: 49 | vis_processor: 50 | eval: 51 | name: "clip_image_eval" 52 | image_size: 336 53 | -------------------------------------------------------------------------------- /lavis/projects/blip/train/nlvr_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | 9 | model_type: nlvr 10 | load_finetuned: False 11 | 12 | datasets: 13 | nlvr: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | image_size: 384 18 | eval: 19 | name: "blip_image_eval" 20 | image_size: 384 21 | text_processor: 22 | train: 23 | name: "blip_caption" 24 | eval: 25 | name: "blip_caption" 26 | 27 | run: 28 | task: multimodal_classification 29 | 30 | lr_sched: "linear_warmup_cosine_lr" 31 | init_lr: 2.5e-5 32 | min_lr: 0 33 | weight_decay: 0.05 34 | max_epoch: 15 35 | 36 | batch_size_train: 16 37 | batch_size_eval: 64 38 | num_workers: 4 39 | 40 | seed: 42 41 | output_dir: "output/BLIP/NLVR" 42 | 43 | amp: False 44 | resume_ckpt_path: null 45 | 46 | evaluate: False 47 | train_splits: ["train"] 48 | valid_splits: ["val", "test"] 49 | test_splits: ["test"] 50 | 51 | # distribution-specific 52 | device: "cuda" 53 | world_size: 1 54 | dist_url: "env://" 55 | distributed: True 56 | -------------------------------------------------------------------------------- /lavis/models/pnp_vqa_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | 11 | def prepare_qa_input(sample, num_captions, num_captions_fid): 12 | sample_question_captions = [] 13 | 14 | for question, captions in zip(sample['text_input'], sample['captions']): 15 | assert isinstance(captions, list) 16 | question_captions = [] 17 | question_caption = '' 18 | for cap_id, cap_ in enumerate(captions[0:num_captions]): 19 | question_caption += (cap_.strip() + '. ') 20 | if (cap_id + 1) != num_captions and ((cap_id + 1) % num_captions_fid == 0): 21 | question_caption = question.lower().strip() + " \\n " + question_caption.lower().strip() 22 | question_captions.append(question_caption) 23 | question_caption = '' 24 | if (cap_id + 1) == num_captions: 25 | question_caption = question.lower().strip() + " \\n " + question_caption.lower().strip() 26 | question_captions.append(question_caption) 27 | sample_question_captions.append(question_captions) 28 | 29 | sample['question_captions'] = sample_question_captions 30 | -------------------------------------------------------------------------------- /lavis/configs/datasets/coco/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_caption: # name of the dataset builder 8 | dataset_card: dataset_card/coco_caption.md 9 | # data_dir: ${env.data_dir}/datasets 10 | data_type: images # [images|videos|features] 11 | 12 | build_info: 13 | # Be careful not to append minus sign (-) before split to avoid itemizing 14 | annotations: 15 | train: 16 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 17 | md5: aa31ac474cf6250ebb81d18348a07ed8 18 | storage: coco/annotations/coco_karpathy_train.json 19 | val: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 21 | md5: b273847456ef5580e33713b1f7de52a0 22 | storage: coco/annotations/coco_karpathy_val.json 23 | test: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 25 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 26 | storage: coco/annotations/coco_karpathy_test.json 27 | images: 28 | storage: coco/images/ 29 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msvd/defaults_qa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_qa: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_train.json 16 | storage: msvd/annotations/qa_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_val.json 19 | storage: msvd/annotations/qa_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/qa_test.json 22 | storage: msvd/annotations/qa_test.json 23 | ans2label: 24 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/train_ans2label.json 25 | storage: msvd/annotations/qa_ans2label.json 26 | videos: 27 | storage: msvd/videos 28 | 29 | instance_id_key: question_id 30 | -------------------------------------------------------------------------------- /lavis/projects/blip/train/caption_coco_large_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | 9 | model_type: large_coco 10 | load_finetuned: False 11 | 12 | datasets: 13 | coco_caption: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | eval: 18 | name: "blip_image_eval" 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | prompt: "a picture of " 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: captioning 28 | # optimizer 29 | lr_sched: "linear_warmup_cosine_lr" 30 | init_lr: 2e-6 31 | min_lr: 0 32 | weight_decay: 0.05 33 | max_epoch: 5 34 | batch_size_train: 16 35 | batch_size_eval: 64 36 | num_workers: 4 37 | 38 | max_len: 20 39 | min_len: 5 40 | num_beams: 3 41 | 42 | seed: 42 43 | output_dir: "output/BLIP/Caption_coco" 44 | 45 | amp: False 46 | resume_ckpt_path: null 47 | 48 | evaluate: False 49 | train_splits: ["train"] 50 | valid_splits: ["val"] 51 | test_splits: ["test"] 52 | 53 | device: "cuda" 54 | world_size: 1 55 | dist_url: "env://" 56 | distributed: True 57 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/okvqa_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Overall Accuracy is: 41.22 7 | 8 | model: 9 | arch: blip2_t5 10 | model_type: pretrain_flant5xl 11 | use_grad_checkpoint: False 12 | 13 | # for OKVQA evaluation 14 | apply_lemmatizer: True 15 | 16 | datasets: 17 | ok_vqa: # name of the dataset builder 18 | vis_processor: 19 | eval: 20 | name: "blip_image_eval" 21 | image_size: 224 22 | text_processor: 23 | eval: 24 | name: "blip_question" 25 | # build_info: 26 | # images: 27 | # storage: '/export/share/datasets/vision/coco/images/' 28 | 29 | run: 30 | task: vqa 31 | # optimization-specific 32 | batch_size_train: 16 33 | batch_size_eval: 64 34 | num_workers: 4 35 | 36 | # inference-specific 37 | max_len: 10 38 | min_len: 1 39 | num_beams: 5 40 | inference_method: "generate" 41 | prompt: "Question: {} Short answer:" 42 | 43 | seed: 42 44 | output_dir: "output/BLIP2/OKVQA" 45 | 46 | evaluate: True 47 | test_splits: ["test"] 48 | 49 | # distribution-specific 50 | device: "cuda" 51 | world_size: 1 52 | dist_url: "env://" 53 | distributed: True 54 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2 8 | model_type: coco 9 | use_grad_checkpoint: False 10 | 11 | datasets: 12 | coco_retrieval: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "blip_image_train" 16 | image_size: 364 17 | eval: 18 | name: "blip_image_eval" 19 | image_size: 364 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | # build_info: 26 | # images: 27 | # storage: '/export/share/datasets/vision/coco/images/' 28 | run: 29 | task: retrieval 30 | 31 | # dataloading 32 | num_workers: 4 33 | batch_size_train: 16 34 | batch_size_eval: 32 35 | 36 | train_splits: ["train"] 37 | valid_splits: ["val"] 38 | test_splits: ["test"] 39 | 40 | # distribution 41 | device: "cuda" 42 | world_size: 1 43 | dist_url: "env://" 44 | distributed: True 45 | use_dist_eval_sampler: False 46 | 47 | # model specific 48 | k_test: 128 49 | 50 | # misc 51 | seed: 42 52 | output_dir: "output/BLIP2/Retrieval_COCO" 53 | 54 | evaluate: True 55 | -------------------------------------------------------------------------------- /lavis/projects/blip/train/caption_coco_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | 9 | model_type: base_coco 10 | load_finetuned: False 11 | 12 | datasets: 13 | coco_caption: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | eval: 18 | name: "blip_image_eval" 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | prompt: "a picture of " 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | # task: retrieval 28 | task: captioning 29 | # optimizer 30 | lr_sched: "linear_warmup_cosine_lr" 31 | init_lr: 1e-5 32 | min_lr: 0 33 | weight_decay: 0.05 34 | max_epoch: 5 35 | batch_size_train: 32 36 | batch_size_eval: 64 37 | num_workers: 4 38 | 39 | max_len: 20 40 | min_len: 5 41 | num_beams: 3 42 | 43 | seed: 42 44 | output_dir: "output/BLIP/Caption_coco" 45 | 46 | amp: False 47 | resume_ckpt_path: null 48 | 49 | evaluate: False 50 | train_splits: ["train"] 51 | valid_splits: ["val"] 52 | test_splits: ["test"] 53 | 54 | device: "cuda" 55 | world_size: 1 56 | dist_url: "env://" 57 | distributed: True 58 | -------------------------------------------------------------------------------- /lavis/datasets/download_scripts/DownloadConceptualCaptions/LICENSE: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, Steven Hoi. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | MIT License 6 | 7 | Copyright (c) 2019 Igor Brigadir 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to deal 11 | in the Software without restriction, including without limitation the rights 12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included in all 17 | copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from setuptools import setup, find_namespace_packages 9 | import platform 10 | 11 | DEPENDENCY_LINKS = [] 12 | if platform.system() == "Windows": 13 | DEPENDENCY_LINKS.append("https://download.pytorch.org/whl/torch_stable.html") 14 | 15 | 16 | def fetch_requirements(filename): 17 | with open(filename) as f: 18 | return [ln.strip() for ln in f.read().split("\n")] 19 | 20 | 21 | setup( 22 | name="salesforce-lavis", 23 | version="1.0.0.dev1", 24 | author="Dongxu Li, Junnan Li, Hung Le, Guangsen Wang, Silvio Savarese, Steven C.H. Hoi", 25 | description="LAVIS - A One-stop Library for Language-Vision Intelligence", 26 | long_description=open("README.md", "r", encoding="utf-8").read(), 27 | long_description_content_type="text/markdown", 28 | keywords="Vision-Language, Multimodal, Image Captioning, Generative AI, Deep Learning, Library, PyTorch", 29 | license="3-Clause BSD", 30 | packages=find_namespace_packages(include="lavis.*"), 31 | install_requires=fetch_requirements("requirements.txt"), 32 | python_requires=">=3.7.0", 33 | include_package_data=True, 34 | dependency_links=DEPENDENCY_LINKS, 35 | zip_safe=False, 36 | ) 37 | -------------------------------------------------------------------------------- /lavis/datasets/download_scripts/DownloadConceptualCaptions/README.md: -------------------------------------------------------------------------------- 1 | <!-- 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | --> 7 | 8 | # Download Conceptual Captions Data 9 | 10 | Place the data from https://ai.google.com/research/ConceptualCaptions/download in this folder. 11 | 12 | `Train_GCC-training.tsv / cc3m.tsv`: Training Split (3,318,333 examples) 13 | 14 | Then run `download_data_cc3m.py` or `download_data_cc12m.py` (see the example below). 15 | 16 | Images will be stored in the default LAVIS cache folders. You can stop and resume the download; the settings for splitting downloads into chunks / threads are not optimal, but they maxed out my connection, so I kept them as is.
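For instance, a CC3M download run might look like the following sketch. It assumes the `Train_GCC-training.tsv` / `cc3m.tsv` file is already placed in this folder and that the scripts are run without extra command-line arguments; check the scripts themselves for the exact behavior.

```bash
# Sketch of a typical run, assuming the TSV files from the Conceptual Captions
# download page already sit in this folder. Images are written to the default
# LAVIS cache folders, as described above.
cd lavis/datasets/download_scripts/DownloadConceptualCaptions
python download_data_cc3m.py

# For the 12M variant instead:
# python download_data_cc12m.py
```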
17 | 18 | Note: A previous version of this script used a different file naming scheme; this has changed, so if you resume a previously started download you will get duplicates. 19 | 20 | Some downloads will fail and return web pages instead of images; these will need to be cleaned up later. See `downloaded_validation_report.tsv` after the download finishes for HTTP errors. Around 8% of the images are gone, based on validation set results. Setting the user agent might fix some of these errors as well; it is unclear whether any requests are rejected by sites on that basis. 21 | 22 | It should take about a day or two to download the training data; keep an eye on disk space. 23 | -------------------------------------------------------------------------------- /lavis/projects/blip/coco_cap_ft_iter.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: large 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | train: 14 | name: "blip_image_train" 15 | eval: 16 | name: "blip_image_eval" 17 | text_processor: 18 | train: 19 | name: "blip_caption" 20 | prompt: "a picture of " 21 | eval: 22 | name: "blip_caption" 23 | 24 | run: 25 | runner: runner_iter 26 | 27 | max_iters: 2e4 28 | iters_per_inner_epoch: 2e3 29 | 30 | # task: retrieval 31 | task: captioning 32 | # optimizer 33 | lr_sched: "linear_warmup_cosine_lr" 34 | init_lr: 2e-6 35 | min_lr: 0 36 | weight_decay: 0.05 37 | batch_size_train: 16 38 | batch_size_eval: 64 39 | num_workers: 4 40 | 41 | max_len: 20 42 | min_len: 5 43 | num_beams: 3 44 | 45 | seed: 42 46 | output_dir: "output/BLIP/Caption_coco" 47 | 48 | amp: False 49 | resume_ckpt_path: null 50 | 51 | evaluate: False 52 | train_splits: ["train"] 53 | valid_splits: ["val", "test"] 54 | 55 | device: "cuda" 56 | world_size: 1 57 | dist_url: "env://" 58 | distributed: True 59 | -------------------------------------------------------------------------------- /lavis/projects/albef/train/nlvr_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | model_type: nlvr 9 | load_finetuned: False 10 | 11 | datasets: 12 | nlvr: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "blip_image_train" 16 | image_size: 384 17 | eval: 18 | name: "blip_image_eval" 19 | image_size: 384 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: multimodal_classification 28 | # optimization-specific 29 | lr_sched: "linear_warmup_cosine_lr" 30 | init_lr: 2e-5 31 | min_lr: 1e-6 32 | weight_decay: 0.02 33 | warmup_lr: 1e-5 34 | warmup_steps: 650 35 | max_epoch: 10 36 | batch_size_train: 16 37 | batch_size_eval: 64 38 | num_workers: 4 39 | 40 | seed: 42 41 | output_dir: "output/ALBEF/NLVR" 42 | 43 | amp: False 44 | resume_ckpt_path: null 45 | 46 | evaluate: False 47 | train_splits: ["train"] 48 | valid_splits: ["val", "test"] 49 | test_splits: ["test"] 50 | 51 | # distribution-specific 52 | device: "cuda" 53 | world_size: 1 54 | dist_url: "env://" 55 | distributed: True 56 | -------------------------------------------------------------------------------- /lavis/configs/datasets/coco/eval_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: 16 | # TODO make this order insensitive 17 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/vqa_val_eval.json 18 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/answer_list.json 19 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_OpenEnded_mscoco_val2014_questions.json 20 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vqav2/v2_mscoco_val2014_annotations.json 21 | storage: 22 | - coco/annotations/vqa_val_eval.json 23 | - coco/annotations/answer_list.json 24 | - coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json 25 | - coco/annotations/v2_mscoco_val2014_annotations.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /docs/tutorial.evaluation.rst: -------------------------------------------------------------------------------- 1 | Evaluating Pre-trained Models on Task Datasets 2 | ############################################### 3 | LAVIS provides pre-trained and finetuned models for off-the-shelf evaluation on task datasets. 4 | Let's now see an example of evaluating the BLIP model on the captioning task, using the MSCOCO dataset. 5 | 6 | .. _prep coco: 7 | 8 | Preparing Datasets 9 | ****************** 10 | First, let's download the dataset. LAVIS provides `automatic downloading scripts` to help prepare 11 | most of the public datasets. To download the MSCOCO dataset, simply run 12 | 13 | .. 
code-block:: bash 14 | 15 | cd lavis/datasets/download_scripts && python download_coco.py 16 | 17 | This will put the downloaded dataset at the default cache location ``cache`` used by LAVIS. 18 | 19 | If you want to use a different cache location, you can specify it by updating ``cache_root`` in ``lavis/configs/default.yaml``. 20 | 21 | If you have a local copy of the dataset, it is recommended to create a symlink from the cache location to the local copy, e.g. 22 | 23 | .. code-block:: bash 24 | 25 | ln -s /path/to/local/coco cache/coco 26 | 27 | Evaluating pre-trained models 28 | ****************************** 29 | 30 | To evaluate a pre-trained model, simply run 31 | 32 | .. code-block:: bash 33 | 34 | bash run_scripts/lavis/blip/eval/eval_coco_cap.sh 35 | 36 | Or to evaluate a large model: 37 | 38 | .. code-block:: bash 39 | 40 | bash run_scripts/lavis/blip/eval/eval_coco_cap_large.sh -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/okvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: base 9 | 10 | datasets: 11 | ok_vqa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_question" 19 | 20 | run: 21 | task: vqa_reading_comprehension 22 | 23 | # optimization-specific 24 | batch_size_train: 16 25 | batch_size_eval: 16 26 | num_workers: 4 27 | 28 | # image question matching specific 29 | block_num: 7 30 | 31 | # image captioning specific 32 | top_k: 50 33 | top_p: 1 34 | cap_min_length: 10 35 | cap_max_length: 20 36 | repetition_penalty: 1 37 | num_patches: 20 38 | num_captions: 100 39 | prompt: 'a picture of ' 40 | 41 | # question answering specific 42 | internal_bsz_fid: 1 43 | num_captions_fid: 1 44 | min_len: 0 45 | max_len: 20 46 | num_beams: 1 47 | inference_method: "generate" 48 | 49 | seed: 42 50 | output_dir: "output/PNP-VQA/OKVQA" 51 | 52 | evaluate: True 53 | test_splits: ["test"] 54 | 55 | # distribution-specific 56 | device: "cuda" 57 | world_size: 1 58 | dist_url: "env://" 59 | distributed: True 60 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/okvqa_eval_3b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: 3b 9 | 10 | datasets: 11 | ok_vqa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_question" 19 | 20 | run: 21 | task: vqa_reading_comprehension 22 | 23 | # optimization-specific 24 | batch_size_train: 4 25 | batch_size_eval: 4 26 | num_workers: 4 27 | 28 | # image question matching specific 29 | block_num: 7 30 | 31 | # image captioning specific 32 | top_k: 50 33 | top_p: 1 34 | cap_min_length: 10 35 | cap_max_length: 20 36 | repetition_penalty: 1 37 | num_patches: 20 38 | num_captions: 100 39 | prompt: 'a picture of ' 40 | 41 | # question answering specific 42 | internal_bsz_fid: 1 43 | num_captions_fid: 1 44 | min_len: 0 45 | max_len: 20 46 | num_beams: 1 47 | inference_method: "generate" 48 | 49 | seed: 42 50 | output_dir: "output/PNP-VQA-3b/OKVQA" 51 | 52 | evaluate: True 53 | test_splits: ["test"] 54 | 55 | # distribution-specific 56 | device: "cuda" 57 | world_size: 1 58 | dist_url: "env://" 59 | distributed: True 60 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/vqav2_zeroshot_flant5xl_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | # Overall Accuracy is: 63.13 7 | # Per Answer Type Accuracy is the following: 8 | # other : 52.90 9 | # yes/no : 84.28 10 | # number : 41.01 11 | 12 | model: 13 | arch: blip2_t5 14 | model_type: pretrain_flant5xl 15 | use_grad_checkpoint: False 16 | 17 | datasets: 18 | coco_vqa: # name of the dataset builder 19 | type: eval 20 | vis_processor: 21 | eval: 22 | name: "blip_image_eval" 23 | image_size: 224 24 | text_processor: 25 | eval: 26 | name: "blip_question" 27 | # build_info: 28 | # images: 29 | # storage: '/export/share/datasets/vision/coco/images/' 30 | 31 | run: 32 | task: vqa 33 | # optimization-specific 34 | batch_size_train: 16 35 | batch_size_eval: 64 36 | num_workers: 4 37 | 38 | # inference-specific 39 | max_len: 10 40 | min_len: 1 41 | num_beams: 5 42 | inference_method: "generate" 43 | prompt: "Question: {} Short answer:" 44 | 45 | seed: 42 46 | output_dir: "output/BLIP2/VQA" 47 | 48 | evaluate: True 49 | test_splits: ["val"] 50 | 51 | # distribution-specific 52 | device: "cuda" 53 | world_size: 1 54 | dist_url: "env://" 55 | distributed: True 56 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/okvqa_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: large 9 | 10 | datasets: 11 | ok_vqa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_question" 19 | 20 | run: 21 | task: vqa_reading_comprehension 22 | 23 | # optimization-specific 24 | batch_size_train: 12 25 | batch_size_eval: 12 26 | num_workers: 4 27 | 28 | # image question matching specific 29 | block_num: 7 30 | 31 | # image captioning specific 32 | top_k: 50 33 | top_p: 1 34 | cap_min_length: 10 35 | cap_max_length: 20 36 | repetition_penalty: 1 37 | num_patches: 20 38 | num_captions: 100 39 | prompt: 'a picture of ' 40 | 41 | # question answering specific 42 | internal_bsz_fid: 1 43 | num_captions_fid: 1 44 | min_len: 0 45 | max_len: 20 46 | num_beams: 1 47 | inference_method: "generate" 48 | 49 | seed: 42 50 | output_dir: "output/PNP-VQA-large/OKVQA" 51 | 52 | evaluate: True 53 | test_splits: ["test"] 54 | 55 | # distribution-specific 56 | device: "cuda" 57 | world_size: 1 58 | dist_url: "env://" 59 | distributed: True 60 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/gqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: base 9 | 10 | datasets: 11 | gqa: # name of the dataset builder 12 | type: balanced_testdev 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: gqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 16 26 | batch_size_eval: 16 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 5 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA/GQA" 52 | 53 | evaluate: True 54 | test_splits: ["val"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/gqa_eval_3b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: 3b 9 | 10 | datasets: 11 | gqa: # name of the dataset builder 12 | type: balanced_testdev 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: gqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 4 26 | batch_size_eval: 4 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 5 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA-3b/GQA" 52 | 53 | evaluate: True 54 | test_splits: ["val"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/vqav2_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: base 9 | 10 | datasets: 11 | coco_vqa: # name of the dataset builder 12 | type: eval 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 16 26 | batch_size_eval: 16 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 1 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA/VQAv2_val" 52 | 53 | evaluate: True 54 | test_splits: ["val"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/vqav2_eval_3b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: 3b 9 | 10 | datasets: 11 | coco_vqa: # name of the dataset builder 12 | type: eval 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 4 26 | batch_size_eval: 4 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 1 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA-3b/VQAv2_val" 52 | 53 | evaluate: True 54 | test_splits: ["val"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/gqa_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: large 9 | 10 | datasets: 11 | gqa: # name of the dataset builder 12 | type: balanced_testdev 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: gqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 12 26 | batch_size_eval: 12 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 5 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA-large/GQA" 52 | 53 | evaluate: True 54 | test_splits: ["val"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/vqav2_test_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: base 9 | 10 | datasets: 11 | coco_vqa: # name of the dataset builder 12 | type: default 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 16 26 | batch_size_eval: 16 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 1 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA/VQAv2_test" 52 | 53 | evaluate: True 54 | test_splits: ["test"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/vqav2_test_eval_3b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: 3b 9 | 10 | datasets: 11 | coco_vqa: # name of the dataset builder 12 | type: default 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 4 26 | batch_size_eval: 4 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 1 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA-3b/VQAv2_test" 52 | 53 | evaluate: True 54 | test_splits: ["test"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/vqav2_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: large 9 | 10 | datasets: 11 | coco_vqa: # name of the dataset builder 12 | type: eval 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 12 26 | batch_size_eval: 12 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 1 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA-large/VQAv2_val" 52 | 53 | evaluate: True 54 | test_splits: ["val"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/pnp-vqa/eval/vqav2_test_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pnp_vqa 8 | model_type: large 9 | 10 | datasets: 11 | coco_vqa: # name of the dataset builder 12 | type: default 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa_reading_comprehension 23 | 24 | # optimization-specific 25 | batch_size_train: 12 26 | batch_size_eval: 12 27 | num_workers: 4 28 | 29 | # image question matching specific 30 | block_num: 7 31 | 32 | # image captioning specific 33 | top_k: 50 34 | top_p: 1 35 | cap_min_length: 10 36 | cap_max_length: 20 37 | repetition_penalty: 1 38 | num_patches: 20 39 | num_captions: 100 40 | prompt: 'a picture of ' 41 | 42 | # question answering specific 43 | internal_bsz_fid: 1 44 | num_captions_fid: 1 45 | min_len: 0 46 | max_len: 20 47 | num_beams: 1 48 | inference_method: "generate" 49 | 50 | seed: 42 51 | output_dir: "output/PNP-VQA-large/VQAv2_test" 52 | 53 | evaluate: True 54 | test_splits: ["test"] 55 | 56 | # distribution-specific 57 | device: "cuda" 58 | world_size: 1 59 | dist_url: "env://" 60 | distributed: True 61 | -------------------------------------------------------------------------------- /lavis/projects/albef/train/okvqa_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | datasets: 13 | ok_vqa: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | image_size: 384 18 | eval: 19 | name: "blip_image_eval" 20 | image_size: 384 21 | text_processor: 22 | train: 23 | name: "blip_question" 24 | eval: 25 | name: "blip_question" 26 | 27 | run: 28 | task: vqa 29 | # optimization-specific 30 | lr_sched: "linear_warmup_cosine_lr" 31 | init_lr: 2e-5 32 | min_lr: 1e-6 33 | weight_decay: 0.02 34 | max_epoch: 6 35 | batch_size_train: 16 36 | batch_size_eval: 16 37 | num_workers: 4 38 | 39 | # inference-specific 40 | max_len: 10 41 | min_len: 1 42 | num_beams: 256 43 | num_ans_candidates: 128 44 | inference_method: "rank" 45 | 46 | seed: 42 47 | output_dir: "output/BLIP/OKVQA" 48 | 49 | amp: False 50 | resume_ckpt_path: null 51 | 52 | evaluate: False 53 | train_splits: ["train"] 54 | valid_splits: ["val"] 55 | test_splits: ["test"] 56 | 57 | # distribution-specific 58 | device: "cuda" 59 | world_size: 1 60 | dist_url: "env://" 61 | distributed: True 62 | -------------------------------------------------------------------------------- /lavis/projects/alpro/train/msrvtt_retrieval_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | model_type: msrvtt 9 | load_finetuned: False 10 | 11 | datasets: 12 | msrvtt_retrieval: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "alpro_video_train" 16 | n_frms: 8 17 | image_size: 224 18 | eval: 19 | name: "alpro_video_eval" 20 | n_frms: 8 21 | image_size: 224 22 | text_processor: 23 | train: 24 | name: "blip_caption" 25 | eval: 26 | name: "blip_caption" 27 | 28 | run: 29 | task: retrieval 30 | # optimization-specific 31 | lr_sched: "linear_warmup_cosine_lr" 32 | init_lr: 3e-5 33 | min_lr: 1e-6 34 | weight_decay: 1e-4 35 | max_epoch: 5 36 | batch_size_train: 8 37 | batch_size_eval: 8 38 | num_workers: 4 39 | 40 | k_test: 1000 41 | 42 | seed: 42 43 | output_dir: "output/ALPRO/msrvtt_retrieval" 44 | 45 | amp: False 46 | resume_ckpt_path: null 47 | 48 | evaluate: False 49 | train_splits: ["train"] 50 | valid_splits: ["val"] 51 | test_splits: ["test"] 52 | 53 | # distribution-specific 54 | device: "cuda" 55 | world_size: 1 56 | dist_url: "env://" 57 | distributed: True 58 | use_dist_eval_sampler: False 59 | -------------------------------------------------------------------------------- /lavis/projects/blip/train/okvqa_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | 9 | model_type: okvqa 10 | load_finetuned: False 11 | 12 | image_size: 480 13 | 14 | datasets: 15 | ok_vqa: # name of the dataset builder 16 | vis_processor: 17 | train: 18 | name: "blip_image_train" 19 | image_size: 480 20 | eval: 21 | name: "blip_image_eval" 22 | image_size: 480 23 | text_processor: 24 | train: 25 | name: "blip_question" 26 | eval: 27 | name: "blip_question" 28 | 29 | run: 30 | task: vqa 31 | # optimization-specific 32 | lr_sched: "linear_warmup_cosine_lr" 33 | init_lr: 3e-5 34 | min_lr: 1e-5 35 | weight_decay: 0.02 36 | max_epoch: 7 37 | batch_size_train: 16 38 | batch_size_eval: 16 39 | num_workers: 4 40 | 41 | # inference-specific 42 | max_len: 10 43 | min_len: 1 44 | num_beams: 256 45 | num_ans_candidates: 128 46 | inference_method: "rank" 47 | 48 | seed: 42 49 | output_dir: "output/BLIP/OKVQA" 50 | 51 | amp: False 52 | resume_ckpt_path: null 53 | 54 | evaluate: False 55 | train_splits: ["train"] 56 | test_splits: ["test"] 57 | 58 | # distribution-specific 59 | device: "cuda" 60 | world_size: 1 61 | dist_url: "env://" 62 | distributed: True 63 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022 Salesforce, Inc. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 11 | 12 | 3. Neither the name of Salesforce.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | -------------------------------------------------------------------------------- /lavis/projects/albef/train/aokvqa_ft.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | datasets: 13 | aok_vqa: # name of the dataset builder 14 | vis_processor: 15 | train: 16 | name: "blip_image_train" 17 | image_size: 384 18 | eval: 19 | name: "blip_image_eval" 20 | image_size: 384 21 | text_processor: 22 | train: 23 | name: "blip_question" 24 | eval: 25 | name: "blip_question" 26 | 27 | run: 28 | task: aok_vqa 29 | # optimization-specific 30 | lr_sched: "linear_warmup_cosine_lr" 31 | init_lr: 2e-5 32 | min_lr: 1e-6 33 | weight_decay: 0.02 34 | max_epoch: 6 35 | batch_size_train: 16 36 | batch_size_eval: 16 37 | num_workers: 4 38 | 39 | # inference-specific 40 | max_len: 10 41 | min_len: 1 42 | num_beams: 256 43 | num_ans_candidates: 128 44 | inference_method: "rank" 45 | 46 | seed: 42 47 | output_dir: "output/BLIP/AOKVQA" 48 | 49 | amp: False 50 | resume_ckpt_path: null 51 | 52 | evaluate: False 53 | train_splits: ["train"] 54 | valid_splits: ["val"] 55 | test_splits: ["test"] 56 | 57 | # distribution-specific 58 | device: "cuda" 59 | world_size: 1 60 | dist_url: "env://" 61 | distributed: True 62 | --------------------------------------------------------------------------------